Optionally collect RGW GC task stats (#94)
* Optionally collect RGW GC task stats
* Minor changes per code review; add some additional tests to squeeze out extra coverage
This commit is contained in:
parent
ae0f874abb
commit
dc6ab9c636
|
@ -33,7 +33,7 @@ RUN apt-get update && \
|
||||||
RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
|
RUN wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
|
||||||
RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
|
RUN echo "deb https://download.ceph.com/debian-luminous xenial main" >> /etc/apt/sources.list && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y --force-yes librados2 librbd1 && \
|
apt-get install -y --force-yes librados2 librbd1 ceph-common && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,183 @@
|
||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
const rgwGCTimeFormat = "2006-01-02 15:04:05"
|
||||||
|
const radosgwAdminPath = "/usr/bin/radosgw-admin"
|
||||||
|
|
||||||
|
type rgwTaskGC struct {
|
||||||
|
Tag string `json:"tag"`
|
||||||
|
Time string `json:"time"`
|
||||||
|
Objects []struct {
|
||||||
|
Pool string `json:"pool"`
|
||||||
|
OID string `json:"oid"`
|
||||||
|
Key string `json:"ky"`
|
||||||
|
Instance string `json:"instance"`
|
||||||
|
} `json:"objs"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expires returns the timestamp that this task will expire and become active
|
||||||
|
func (gc rgwTaskGC) ExpiresAt() time.Time {
|
||||||
|
tmp := strings.SplitN(gc.Time, ".", 2)
|
||||||
|
|
||||||
|
last, err := time.Parse(rgwGCTimeFormat, tmp[0])
|
||||||
|
if err != nil {
|
||||||
|
return time.Now()
|
||||||
|
}
|
||||||
|
return last
|
||||||
|
}
|
||||||
|
|
||||||
|
// rgwGetGCTaskList get the RGW Garbage Collection task list
|
||||||
|
func rgwGetGCTaskList(config string) ([]byte, error) {
|
||||||
|
var (
|
||||||
|
out []byte
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
|
||||||
|
if out, err = exec.Command(radosgwAdminPath, "-c", config, "gc", "list", "--include-all").Output(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RGWCollector collects metrics from the RGW service
type RGWCollector struct {
	// config is the path to the ceph config file handed to radosgw-admin.
	config string

	// ActiveTasks reports the number of (expired) RGW GC tasks
	ActiveTasks *prometheus.GaugeVec
	// ActiveObjects reports the total number of RGW GC objects contained in active tasks
	ActiveObjects *prometheus.GaugeVec

	// PendingTasks reports the number of RGW GC tasks queued but not yet expired
	PendingTasks *prometheus.GaugeVec
	// PendingObjects reports the total number of RGW GC objects contained in pending tasks
	PendingObjects *prometheus.GaugeVec

	// getRGWGCTaskList fetches the raw GC task list JSON. It is a field
	// (defaulting to rgwGetGCTaskList) so tests can substitute a stub.
	getRGWGCTaskList func(string) ([]byte, error)
}
|
||||||
|
|
||||||
|
// NewRGWCollector creates an instance of the RGWCollector and instantiates
|
||||||
|
// the individual metrics that we can collect from the RGW service
|
||||||
|
func NewRGWCollector(cluster string, config string) *RGWCollector {
|
||||||
|
labels := make(prometheus.Labels)
|
||||||
|
labels["cluster"] = cluster
|
||||||
|
return &RGWCollector{
|
||||||
|
config: config,
|
||||||
|
getRGWGCTaskList: rgwGetGCTaskList,
|
||||||
|
|
||||||
|
ActiveTasks: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "rgw_gc_active_tasks",
|
||||||
|
Help: "RGW GC active task count",
|
||||||
|
ConstLabels: labels,
|
||||||
|
},
|
||||||
|
[]string{},
|
||||||
|
),
|
||||||
|
ActiveObjects: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "rgw_gc_active_objects",
|
||||||
|
Help: "RGW GC active object count",
|
||||||
|
ConstLabels: labels,
|
||||||
|
},
|
||||||
|
[]string{},
|
||||||
|
),
|
||||||
|
PendingTasks: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "rgw_gc_pending_tasks",
|
||||||
|
Help: "RGW GC pending task count",
|
||||||
|
ConstLabels: labels,
|
||||||
|
},
|
||||||
|
[]string{},
|
||||||
|
),
|
||||||
|
PendingObjects: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "rgw_gc_pending_objects",
|
||||||
|
Help: "RGW GC pending object count",
|
||||||
|
ConstLabels: labels,
|
||||||
|
},
|
||||||
|
[]string{},
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *RGWCollector) collectorList() []prometheus.Collector {
|
||||||
|
return []prometheus.Collector{
|
||||||
|
r.ActiveTasks,
|
||||||
|
r.ActiveObjects,
|
||||||
|
r.PendingTasks,
|
||||||
|
r.PendingObjects,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *RGWCollector) collect() error {
|
||||||
|
data, err := r.getRGWGCTaskList(r.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks := make([]rgwTaskGC, 0, 0)
|
||||||
|
err = json.Unmarshal(data, &tasks)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
activeTaskCount := int(0)
|
||||||
|
activeObjectCount := int(0)
|
||||||
|
pendingTaskCount := int(0)
|
||||||
|
pendingObjectCount := int(0)
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
for _, task := range tasks {
|
||||||
|
if now.Sub(task.ExpiresAt()) > 0 {
|
||||||
|
// timer expired these are active
|
||||||
|
activeTaskCount += 1
|
||||||
|
activeObjectCount += len(task.Objects)
|
||||||
|
} else {
|
||||||
|
pendingTaskCount += 1
|
||||||
|
pendingObjectCount += len(task.Objects)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
r.ActiveTasks.WithLabelValues().Set(float64(activeTaskCount))
|
||||||
|
r.PendingTasks.WithLabelValues().Set(float64(pendingTaskCount))
|
||||||
|
|
||||||
|
r.ActiveObjects.WithLabelValues().Set(float64(activeObjectCount))
|
||||||
|
r.PendingObjects.WithLabelValues().Set(float64(pendingObjectCount))
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Describe sends the descriptors of each RGWCollector related metrics we have defined
|
||||||
|
// to the provided prometheus channel.
|
||||||
|
func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||||
|
for _, metric := range r.collectorList() {
|
||||||
|
metric.Describe(ch)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect sends all the collected metrics to the provided prometheus channel.
|
||||||
|
// It requires the caller to handle synchronization.
|
||||||
|
func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) {
|
||||||
|
err := r.collect()
|
||||||
|
if err != nil {
|
||||||
|
log.Println("Failed to collect RGW GC stats", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, metric := range r.collectorList() {
|
||||||
|
metric.Collect(ch)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,157 @@
|
||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"io/ioutil"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"regexp"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRGWCollector(t *testing.T) {
|
||||||
|
for _, tt := range []struct {
|
||||||
|
input []byte
|
||||||
|
reMatch []*regexp.Regexp
|
||||||
|
reUnmatch []*regexp.Regexp
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
input: []byte(`
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"tag": "00000000-0001-0000-0000-9ec86fa9a561.9695966.3129536\u0000",
|
||||||
|
"time": "1975-01-01 16:31:09.0.564455s",
|
||||||
|
"objs": [
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0001-0000-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0002-0000-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tag": "00000000-0002-0000-0000-9ec86fa9a561.9695966.3129536\u0000",
|
||||||
|
"time": "1975-01-01 17:31:09.0.564455s",
|
||||||
|
"objs": [
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0004-0000-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0005-0000-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tag": "00000000-0002-0000-0000-9ec86fa9a561.9695966.3129536\u0000",
|
||||||
|
"time": "3075-01-01 11:30:09.0.123456s",
|
||||||
|
"objs": [
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0001-5555-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0002-5555-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pool": "pool.rgw.buckets.data",
|
||||||
|
"oid": "12345678-0003-5555-0000-000000000000.123456.1100__shadow_.tNcmQWnIAlJMd33ZIdhnLF9HoaY9TOv_1",
|
||||||
|
"key": "",
|
||||||
|
"instance": ""
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
`),
|
||||||
|
reMatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_active_tasks{cluster="ceph"} 2`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_active_objects{cluster="ceph"} 4`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_pending_tasks{cluster="ceph"} 1`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_pending_objects{cluster="ceph"} 3`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte(`[]`),
|
||||||
|
reMatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_active_tasks{cluster="ceph"} 0`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_active_objects{cluster="ceph"} 0`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_pending_tasks{cluster="ceph"} 0`),
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc_pending_objects{cluster="ceph"} 0`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// force an error return json deserialization
|
||||||
|
input: []byte(`[ { "bad-object": 17,,, ]`),
|
||||||
|
reUnmatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// force an error return from getRGWGCTaskList
|
||||||
|
input: nil,
|
||||||
|
reUnmatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`ceph_rgw_gc`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
func() {
|
||||||
|
collector := NewRGWCollector("ceph", "")
|
||||||
|
collector.getRGWGCTaskList = func(cluster string) ([]byte, error) {
|
||||||
|
if tt.input != nil {
|
||||||
|
return tt.input, nil
|
||||||
|
}
|
||||||
|
return nil, errors.New("fake error")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := prometheus.Register(collector); err != nil {
|
||||||
|
t.Fatalf("collector failed to register: %s", err)
|
||||||
|
}
|
||||||
|
defer prometheus.Unregister(collector)
|
||||||
|
|
||||||
|
server := httptest.NewServer(prometheus.Handler())
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
resp, err := http.Get(server.URL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected failed response from prometheus: %s", err)
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
buf, err := ioutil.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed reading server response: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, re := range tt.reMatch {
|
||||||
|
if !re.Match(buf) {
|
||||||
|
t.Errorf("failed matching: %q", re)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, re := range tt.reUnmatch {
|
||||||
|
if re.Match(buf) {
|
||||||
|
t.Errorf("should not have matched %q", re)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
23
exporter.go
23
exporter.go
|
@ -31,6 +31,11 @@ import (
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
defaultCephClusterLabel = "ceph"
|
||||||
|
defaultCephConfigPath = "/etc/ceph/ceph.conf"
|
||||||
|
)
|
||||||
|
|
||||||
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
|
// This horrible thing is a copy of tcpKeepAliveListener, tweaked to
|
||||||
// specifically check if it hits EMFILE when doing an accept, and if so,
|
// specifically check if it hits EMFILE when doing an accept, and if so,
|
||||||
// terminate the process.
|
// terminate the process.
|
||||||
|
@ -74,8 +79,8 @@ var _ prometheus.Collector = &CephExporter{}
|
||||||
// NewCephExporter creates an instance to CephExporter and returns a reference
|
// NewCephExporter creates an instance to CephExporter and returns a reference
|
||||||
// to it. We can choose to enable a collector to extract stats out of by adding
|
// to it. We can choose to enable a collector to extract stats out of by adding
|
||||||
// it to the list of collectors.
|
// it to the list of collectors.
|
||||||
func NewCephExporter(conn *rados.Conn, cluster string) *CephExporter {
|
func NewCephExporter(conn *rados.Conn, cluster string, config string, withRGW bool) *CephExporter {
|
||||||
return &CephExporter{
|
c := &CephExporter{
|
||||||
collectors: []prometheus.Collector{
|
collectors: []prometheus.Collector{
|
||||||
collectors.NewClusterUsageCollector(conn, cluster),
|
collectors.NewClusterUsageCollector(conn, cluster),
|
||||||
collectors.NewPoolUsageCollector(conn, cluster),
|
collectors.NewPoolUsageCollector(conn, cluster),
|
||||||
|
@ -84,6 +89,14 @@ func NewCephExporter(conn *rados.Conn, cluster string) *CephExporter {
|
||||||
collectors.NewOSDCollector(conn, cluster),
|
collectors.NewOSDCollector(conn, cluster),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if withRGW {
|
||||||
|
c.collectors = append(c.collectors,
|
||||||
|
collectors.NewRGWCollector(cluster, config),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describe sends all the descriptors of the collectors included to
|
// Describe sends all the descriptors of the collectors included to
|
||||||
|
@ -113,6 +126,8 @@ func main() {
|
||||||
cephConfig = flag.String("ceph.config", "", "path to ceph config file")
|
cephConfig = flag.String("ceph.config", "", "path to ceph config file")
|
||||||
cephUser = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.")
|
cephUser = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.")
|
||||||
|
|
||||||
|
withRGW = flag.Bool("with-rgw", false, "Enable collection of stats from RGW")
|
||||||
|
|
||||||
exporterConfig = flag.String("exporter.config", "/etc/ceph/exporter.yml", "Path to ceph exporter config.")
|
exporterConfig = flag.String("exporter.config", "/etc/ceph/exporter.yml", "Path to ceph exporter config.")
|
||||||
)
|
)
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
@ -143,7 +158,7 @@ func main() {
|
||||||
defer conn.Shutdown()
|
defer conn.Shutdown()
|
||||||
|
|
||||||
log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
|
log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
|
||||||
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel))
|
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *withRGW))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
|
log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
|
||||||
}
|
}
|
||||||
|
@ -168,7 +183,7 @@ func main() {
|
||||||
}
|
}
|
||||||
defer conn.Shutdown()
|
defer conn.Shutdown()
|
||||||
|
|
||||||
prometheus.MustRegister(NewCephExporter(conn, "ceph"))
|
prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *withRGW))
|
||||||
}
|
}
|
||||||
|
|
||||||
http.Handle(*metricsPath, promhttp.Handler())
|
http.Handle(*metricsPath, promhttp.Handler())
|
||||||
|
|
Loading…
Reference in New Issue