From 415d296c31ef1e6fd8dac6dfecfb952ec0e36870 Mon Sep 17 00:00:00 2001 From: ssobolewski Date: Fri, 10 Aug 2018 13:43:02 -0600 Subject: [PATCH] Ssobolewski/run rgw stats in background (#97) * RGW GC stat collection can take a long time if there is a very large backlog * Use a const for background interval * Minor change per code review --- collectors/rgw.go | 41 +++++++++++++++++++++++++++++++++++------ collectors/rgw_test.go | 2 +- exporter.go | 24 ++++++++++++++++++------ 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/collectors/rgw.go b/collectors/rgw.go index 5ae9c84..2be61d5 100644 --- a/collectors/rgw.go +++ b/collectors/rgw.go @@ -12,6 +12,13 @@ import ( const rgwGCTimeFormat = "2006-01-02 15:04:05" const radosgwAdminPath = "/usr/bin/radosgw-admin" +const backgroundCollectInterval = time.Duration(5 * time.Minute) + +const ( + RGWModeDisabled = 0 + RGWModeForeground = 1 + RGWModeBackground = 2 +) type rgwTaskGC struct { Tag string `json:"tag"` @@ -51,7 +58,8 @@ func rgwGetGCTaskList(config string) ([]byte, error) { // RGWCollector collects metrics from the RGW service type RGWCollector struct { - config string + config string + background bool // ActiveTasks reports the number of (expired) RGW GC tasks ActiveTasks *prometheus.GaugeVec @@ -68,11 +76,12 @@ type RGWCollector struct { // NewRGWCollector creates an instance of the RGWCollector and instantiates // the individual metrics that we can collect from the RGW service -func NewRGWCollector(cluster string, config string) *RGWCollector { +func NewRGWCollector(cluster string, config string, background bool) *RGWCollector { labels := make(prometheus.Labels) labels["cluster"] = cluster - return &RGWCollector{ + rgw := &RGWCollector{ config: config, + background: background, getRGWGCTaskList: rgwGetGCTaskList, ActiveTasks: prometheus.NewGaugeVec( @@ -112,6 +121,14 @@ func NewRGWCollector(cluster string, config string) *RGWCollector { []string{}, ), } + + if rgw.background { + // rgw stats need to be collected in the background as this can take a while + // if we have a large backlog + go rgw.backgroundCollect() + } + + return rgw } func (r *RGWCollector) collectorList() []prometheus.Collector { @@ -123,6 +140,16 @@ func (r *RGWCollector) collectorList() []prometheus.Collector { } } +func (r *RGWCollector) backgroundCollect() error { + for { + err := r.collect() + if err != nil { + log.Println("Failed to collect RGW GC stats", err) + } + time.Sleep(backgroundCollectInterval) + } +} + func (r *RGWCollector) collect() error { data, err := r.getRGWGCTaskList(r.config) if err != nil { @@ -172,9 +199,11 @@ func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) { // Collect sends all the collected metrics to the provided prometheus channel. // It requires the caller to handle synchronization. 
 func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) {
-	err := r.collect()
-	if err != nil {
-		log.Println("Failed to collect RGW GC stats", err)
+	if !r.background {
+		err := r.collect()
+		if err != nil {
+			log.Println("Failed to collect RGW GC stats", err)
+		}
 	}
 
 	for _, metric := range r.collectorList() {
diff --git a/collectors/rgw_test.go b/collectors/rgw_test.go
index 8a49915..4a5d0c9 100644
--- a/collectors/rgw_test.go
+++ b/collectors/rgw_test.go
@@ -114,7 +114,7 @@ func TestRGWCollector(t *testing.T) {
 		},
 	} {
 		func() {
-			collector := NewRGWCollector("ceph", "")
+			collector := NewRGWCollector("ceph", "", false) // run in foreground for testing
 			collector.getRGWGCTaskList = func(cluster string) ([]byte, error) {
 				if tt.input != nil {
 					return tt.input, nil
diff --git a/exporter.go b/exporter.go
index 2c424d7..d0837e3 100644
--- a/exporter.go
+++ b/exporter.go
@@ -79,7 +79,7 @@ var _ prometheus.Collector = &CephExporter{}
 // NewCephExporter creates an instance to CephExporter and returns a reference
 // to it. We can choose to enable a collector to extract stats out of by adding
 // it to the list of collectors.
-func NewCephExporter(conn *rados.Conn, cluster string, config string, withRGW bool) *CephExporter {
+func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int) *CephExporter {
 	c := &CephExporter{
 		collectors: []prometheus.Collector{
 			collectors.NewClusterUsageCollector(conn, cluster),
@@ -90,10 +90,22 @@ func NewCephExporter(conn *rados.Conn, cluster string, config string, withRGW bo
 		},
 	}
 
-	if withRGW {
+	switch rgwMode {
+	case collectors.RGWModeForeground:
 		c.collectors = append(c.collectors,
-			collectors.NewRGWCollector(cluster, config),
+			collectors.NewRGWCollector(cluster, config, false),
 		)
+
+	case collectors.RGWModeBackground:
+		c.collectors = append(c.collectors,
+			collectors.NewRGWCollector(cluster, config, true),
+		)
+
+	case collectors.RGWModeDisabled:
+		// nothing to do
+
+	default:
+		log.Printf("RGW Collector Disabled due to invalid mode (%d)\n", rgwMode)
 	}
 
 	return c
@@ -126,7 +138,7 @@ func main() {
 		cephConfig = flag.String("ceph.config", "", "path to ceph config file")
 		cephUser   = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.")
 
-		withRGW = flag.Bool("with-rgw", false, "Enable collection of stats from RGW")
+		rgwMode = flag.Int("rgw.mode", 0, "Enable collection of stats from RGW (0:disabled 1:foreground 2:background)")
 
 		exporterConfig = flag.String("exporter.config", "/etc/ceph/exporter.yml", "Path to ceph exporter config.")
 	)
@@ -158,7 +170,7 @@ func main() {
 			defer conn.Shutdown()
 
 			log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
-			err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *withRGW))
+			err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode))
 			if err != nil {
 				log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
 			}
@@ -183,7 +195,7 @@ func main() {
 		}
 		defer conn.Shutdown()
 
-		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *withRGW))
+		prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode))
 	}
 
 	http.Handle(*metricsPath, promhttp.Handler())
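For context, below is a minimal, self-contained sketch of the foreground/background collection pattern this patch introduces. The names demoCollector, demoInterval, the gauge, and the listen address are hypothetical stand-ins for illustration and are not part of ceph_exporter; only the shape of the logic follows the patch (a goroutine polling on a fixed interval when background mode is on, with Collect falling back to synchronous collection otherwise).

package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// demoInterval plays the role of backgroundCollectInterval in the patch.
const demoInterval = 5 * time.Minute

// demoCollector mirrors the RGWCollector shape: when background is true, the
// expensive collect() runs in its own goroutine on a fixed interval and
// Collect() only publishes the cached gauge value.
type demoCollector struct {
	background bool
	tasks      prometheus.Gauge
}

func newDemoCollector(background bool) *demoCollector {
	c := &demoCollector{
		background: background,
		tasks: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "demo",
			Name:      "active_tasks",
			Help:      "Stand-in for a slow-to-gather statistic.",
		}),
	}
	if c.background {
		go c.backgroundCollect()
	}
	return c
}

func (c *demoCollector) backgroundCollect() {
	for {
		if err := c.collect(); err != nil {
			log.Println("background collect failed:", err)
		}
		time.Sleep(demoInterval)
	}
}

// collect stands in for the expensive call (e.g. shelling out to an admin tool).
func (c *demoCollector) collect() error {
	c.tasks.Set(42)
	return nil
}

func (c *demoCollector) Describe(ch chan<- *prometheus.Desc) {
	c.tasks.Describe(ch)
}

func (c *demoCollector) Collect(ch chan<- prometheus.Metric) {
	if !c.background {
		if err := c.collect(); err != nil {
			log.Println("collect failed:", err)
		}
	}
	c.tasks.Collect(ch)
}

func main() {
	prometheus.MustRegister(newDemoCollector(true)) // background mode
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9999", nil)) // arbitrary port for the sketch
}

With the patch applied, the same choice is made at runtime through the new flag: -rgw.mode=1 keeps the old synchronous behaviour, -rgw.mode=2 moves the expensive radosgw-admin call into a background goroutine, and the default of 0 leaves the RGW collector disabled.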