Ssobolewski/run rgw stats in background (#97)

* RGW GC stat collection can take a long time if there is a very large backlog

* Use a const for background interval

* Minor change per code review
This commit is contained in:
ssobolewski 2018-08-10 13:43:02 -06:00 committed by GitHub
parent dc6ab9c636
commit 415d296c31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 13 deletions

View File

@ -12,6 +12,13 @@ import (
const (
	// rgwGCTimeFormat is a Go reference-time layout (date and time, second
	// resolution, no zone).
	// NOTE(review): presumably used to parse timestamps in the RGW GC task
	// list; the parsing code is not visible here, so confirm.
	rgwGCTimeFormat = "2006-01-02 15:04:05"

	// radosgwAdminPath is the absolute path of the radosgw-admin binary.
	radosgwAdminPath = "/usr/bin/radosgw-admin"

	// backgroundCollectInterval is how long the background collection loop
	// sleeps between two collection attempts. 5 * time.Minute is already a
	// time.Duration constant, so no explicit conversion is needed.
	backgroundCollectInterval = 5 * time.Minute
)
// RGW collection modes. The numeric values are what the exporter's
// -rgw.mode flag accepts (0, 1, 2), so the order of this list must not
// change.
const (
	// RGWModeDisabled (0) means no RGW collector is registered.
	RGWModeDisabled = iota
	// RGWModeForeground (1) gathers RGW stats synchronously on every
	// Collect call.
	RGWModeForeground
	// RGWModeBackground (2) gathers RGW stats in a background goroutine;
	// Collect then only reports the most recently gathered values.
	RGWModeBackground
)
type rgwTaskGC struct {
Tag string `json:"tag"`
@ -51,7 +58,8 @@ func rgwGetGCTaskList(config string) ([]byte, error) {
// RGWCollector collects metrics from the RGW service
type RGWCollector struct {
config string
config string
background bool
// ActiveTasks reports the number of (expired) RGW GC tasks
ActiveTasks *prometheus.GaugeVec
@ -68,11 +76,12 @@ type RGWCollector struct {
// NewRGWCollector creates an instance of the RGWCollector and instantiates
// the individual metrics that we can collect from the RGW service
func NewRGWCollector(cluster string, config string) *RGWCollector {
func NewRGWCollector(cluster string, config string, background bool) *RGWCollector {
labels := make(prometheus.Labels)
labels["cluster"] = cluster
return &RGWCollector{
rgw := &RGWCollector{
config: config,
background: background,
getRGWGCTaskList: rgwGetGCTaskList,
ActiveTasks: prometheus.NewGaugeVec(
@ -112,6 +121,14 @@ func NewRGWCollector(cluster string, config string) *RGWCollector {
[]string{},
),
}
if rgw.background {
// rgw stats need to be collected in the background as this can take a while
// if we have a large backlog
go rgw.backgroundCollect()
}
return rgw
}
func (r *RGWCollector) collectorList() []prometheus.Collector {
@ -123,6 +140,16 @@ func (r *RGWCollector) collectorList() []prometheus.Collector {
}
}
// backgroundCollect refreshes the RGW GC stats in an endless loop: it runs
// one collection, logs (but otherwise ignores) any failure, then sleeps for
// backgroundCollectInterval before trying again. The sleep starts only after
// a collection has finished, so slow collections never overlap.
//
// The loop has no exit, so this method never returns and the declared error
// result is never produced; it is meant to be started in its own goroutine.
func (r *RGWCollector) backgroundCollect() error {
	for {
		if err := r.collect(); err != nil {
			log.Println("Failed to collect RGW GC stats", err)
		}

		time.Sleep(backgroundCollectInterval)
	}
}
func (r *RGWCollector) collect() error {
data, err := r.getRGWGCTaskList(r.config)
if err != nil {
@ -172,9 +199,11 @@ func (r *RGWCollector) Describe(ch chan<- *prometheus.Desc) {
// Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization.
func (r *RGWCollector) Collect(ch chan<- prometheus.Metric) {
err := r.collect()
if err != nil {
log.Println("Failed to collect RGW GC stats", err)
if !r.background {
err := r.collect()
if err != nil {
log.Println("Failed to collect RGW GC stats", err)
}
}
for _, metric := range r.collectorList() {

View File

@ -114,7 +114,7 @@ func TestRGWCollector(t *testing.T) {
},
} {
func() {
collector := NewRGWCollector("ceph", "")
collector := NewRGWCollector("ceph", "", false) // run in foreground for testing
collector.getRGWGCTaskList = func(cluster string) ([]byte, error) {
if tt.input != nil {
return tt.input, nil

View File

@ -79,7 +79,7 @@ var _ prometheus.Collector = &CephExporter{}
// NewCephExporter creates an instance to CephExporter and returns a reference
// to it. We can choose to enable a collector to extract stats out of by adding
// it to the list of collectors.
func NewCephExporter(conn *rados.Conn, cluster string, config string, withRGW bool) *CephExporter {
func NewCephExporter(conn *rados.Conn, cluster string, config string, rgwMode int) *CephExporter {
c := &CephExporter{
collectors: []prometheus.Collector{
collectors.NewClusterUsageCollector(conn, cluster),
@ -90,10 +90,22 @@ func NewCephExporter(conn *rados.Conn, cluster string, config string, withRGW bo
},
}
if withRGW {
switch rgwMode {
case collectors.RGWModeForeground:
c.collectors = append(c.collectors,
collectors.NewRGWCollector(cluster, config),
collectors.NewRGWCollector(cluster, config, false),
)
case collectors.RGWModeBackground:
c.collectors = append(c.collectors,
collectors.NewRGWCollector(cluster, config, true),
)
case collectors.RGWModeDisabled:
// nothing to do
default:
log.Printf("RGW Collector Disabled do to invalid mode (%d)\n", rgwMode)
}
return c
@ -126,7 +138,7 @@ func main() {
cephConfig = flag.String("ceph.config", "", "path to ceph config file")
cephUser = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.")
withRGW = flag.Bool("with-rgw", false, "Enable collection of stats from RGW")
rgwMode = flag.Int("rgw.mode", 0, "Enable collection of stats from RGW (0:disabled 1:enabled 2:background)")
exporterConfig = flag.String("exporter.config", "/etc/ceph/exporter.yml", "Path to ceph exporter config.")
)
@ -158,7 +170,7 @@ func main() {
defer conn.Shutdown()
log.Printf("Starting ceph exporter for cluster: %s", cluster.ClusterLabel)
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *withRGW))
err = prometheus.Register(NewCephExporter(conn, cluster.ClusterLabel, cluster.ConfigFile, *rgwMode))
if err != nil {
log.Fatalf("cannot export cluster: %s error: %v", cluster.ClusterLabel, err)
}
@ -183,7 +195,7 @@ func main() {
}
defer conn.Shutdown()
prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *withRGW))
prometheus.MustRegister(NewCephExporter(conn, defaultCephClusterLabel, defaultCephConfigPath, *rgwMode))
}
http.Handle(*metricsPath, promhttp.Handler())