From 1909686789a5a5f1cfd7b87807bba819187d5b1f Mon Sep 17 00:00:00 2001
From: Bjoern Rabenstein <bjoern@soundcloud.com>
Date: Wed, 23 Jul 2014 19:55:33 +0200
Subject: [PATCH] Make metrics exported by the Prometheus server itself more
 consistent.

- Always spell out the time unit (e.g. milliseconds instead of ms).

- Remove "_total" from the names of metrics that are not counters.

- Make use of the "Namespace" and "Subsystem" fields in the options.

- Remove the "capacity" facet from all metrics about channels/queues.
  The capacities are all fixed via command line flags and will never
  change during the runtime of a process. Also, they should not be part
  of the same metric family as the queue lengths. I have added separate
  metrics for the capacity of queues as a convenience. (They will never
  change and are only set once.)

- I kept the word "disk" in "metric_disk_latency_milliseconds" (only
  the unit changed, from microseconds to milliseconds), although that
  metric measures the latency of the storage device, even if it is not
  a spinning disk. "SSD" is read by many as "solid state disk", so
  it's not too far off. (It should be "solid state drive", of course,
  but "metric_drive_latency_milliseconds" is probably confusing.)

- Brian suggested not mixing "failure" and "success" outcomes in the
  same metric family (distinguished by labels). For now, I left it as
  it is. We are touching on a bigger issue here, especially as other
  parts of the Prometheus ecosystem follow the same principle. We
  still need to reach an agreement here and then change things
  consistently everywhere.

Change-Id: If799458b450d18f78500f05990301c12525197d3
---
 notification/notification.go     | 47 ++++++++++++++---------
 retrieval/target.go              | 25 +++++++-----
 retrieval/target_provider.go     |  5 ++-
 retrieval/targetpool.go          |  5 ++-
 rules/manager/manager.go         | 24 ++++++------
 storage/metric/tiered/curator.go | 10 +++--
 storage/metric/tiered/tiered.go  | 65 +++++++++++++++++---------------
 storage/remote/queue_manager.go  | 55 ++++++++++++++++-----------
 8 files changed, 136 insertions(+), 100 deletions(-)

diff --git a/notification/notification.go b/notification/notification.go
index 21012ba95..cf9d285ef 100644
--- a/notification/notification.go
+++ b/notification/notification.go
@@ -37,14 +37,13 @@ const (
 
 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "notifications"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 var (
@@ -86,8 +85,9 @@ type NotificationHandler struct {
 	// HTTP client with custom timeout settings.
 	httpClient httpPoster
 
-	notificationLatency    *prometheus.SummaryVec
-	notificationsQueueSize *prometheus.GaugeVec
+	notificationLatency        *prometheus.SummaryVec
+	notificationsQueueLength   prometheus.Gauge
+	notificationsQueueCapacity prometheus.Metric
 }
 
 // Construct a new NotificationHandler.
@@ -99,17 +99,27 @@ func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan Noti
 
 		notificationLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_notifications_latency_ms",
-				Help: "Latency quantiles for sending alert notifications in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "latency_milliseconds",
+				Help:      "Latency quantiles for sending alert notifications.",
 			},
 			[]string{result},
 		),
-		notificationsQueueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_notifications_queue_size_total",
-				Help: "The size and capacity of the alert notification queue.",
-			},
-			[]string{facet},
+		notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of alert notifications in the queue.",
+		}),
+		notificationsQueueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the alert notifications queue.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(cap(notificationReqs)),
 		),
 	}
 }
@@ -180,13 +190,14 @@ func (n *NotificationHandler) Run() {
 // Describe implements prometheus.Collector.
 func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) {
 	n.notificationLatency.Describe(ch)
-	n.notificationsQueueSize.Describe(ch)
+	ch <- n.notificationsQueueLength.Desc()
+	ch <- n.notificationsQueueCapacity.Desc()
 }
 
 // Collect implements prometheus.Collector.
 func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) {
 	n.notificationLatency.Collect(ch)
-	n.notificationsQueueSize.WithLabelValues(occupancy).Set(float64(len(n.pendingNotifications)))
-	n.notificationsQueueSize.WithLabelValues(capacity).Set(float64(cap(n.pendingNotifications)))
-	n.notificationsQueueSize.Collect(ch)
+	n.notificationsQueueLength.Set(float64(len(n.pendingNotifications)))
+	ch <- n.notificationsQueueLength
+	ch <- n.notificationsQueueCapacity
 }
diff --git a/retrieval/target.go b/retrieval/target.go
index 7a761593d..976f8f871 100644
--- a/retrieval/target.go
+++ b/retrieval/target.go
@@ -35,12 +35,12 @@ const (
 	ScrapeHealthMetricName clientmodel.LabelValue = "up"
 
 	// Constants for instrumentation.
-	address = "instance"
-	alive   = "alive"
-	failure = "failure"
-	outcome = "outcome"
-	state   = "state"
-	success = "success"
+	namespace = "prometheus"
+	job       = "target_job"
+	instance  = "target_instance"
+	failure   = "failure"
+	outcome   = "outcome"
+	success   = "success"
 )
 
 var (
@@ -48,11 +48,12 @@ var (
 
 	targetOperationLatencies = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_target_operation_latency_ms",
-			Help:       "The latencies for various target operations.",
+			Namespace:  namespace,
+			Name:       "target_operation_latency_milliseconds",
+			Help:       "The latencies for target operations.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
-		[]string{address, outcome},
+		[]string{job, instance, outcome},
 	)
 )
 
@@ -196,7 +197,11 @@ const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client
 func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) {
 	defer func(start time.Time) {
 		ms := float64(time.Since(start)) / float64(time.Millisecond)
-		labels := prometheus.Labels{address: t.Address(), outcome: success}
+		labels := prometheus.Labels{
+			job:      string(t.baseLabels[clientmodel.JobLabel]),
+			instance: t.Address(),
+			outcome:  success,
+		}
 		if err != nil {
 			labels[outcome] = failure
 		}
diff --git a/retrieval/target_provider.go b/retrieval/target_provider.go
index 0b19b39c8..15cc282ae 100644
--- a/retrieval/target_provider.go
+++ b/retrieval/target_provider.go
@@ -35,8 +35,9 @@ const resolvConf = "/etc/resolv.conf"
 var (
 	dnsSDLookupsCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_dns_sd_lookups_total",
-			Help: "The number of DNS-SD lookup successes/failures per pool.",
+			Namespace: namespace,
+			Name:      "dns_sd_lookups_total",
+			Help:      "The number of DNS-SD lookup successes/failures per pool.",
 		},
 		[]string{outcome},
 	)
diff --git a/retrieval/targetpool.go b/retrieval/targetpool.go
index f1a87d0e1..bce49be4c 100644
--- a/retrieval/targetpool.go
+++ b/retrieval/targetpool.go
@@ -32,8 +32,9 @@ const (
 var (
 	retrievalDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_targetpool_duration_ms",
-			Help:       "The durations for each TargetPool to retrieve state from all included entities.",
+			Namespace:  namespace,
+			Name:       "targetpool_retrieve_time_milliseconds",
+			Help:       "The time needed for each TargetPool to retrieve state from all included entities.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{intervalKey},
diff --git a/rules/manager/manager.go b/rules/manager/manager.go
index b69546d8a..e1917454d 100644
--- a/rules/manager/manager.go
+++ b/rules/manager/manager.go
@@ -33,7 +33,8 @@ import (
 
 // Constants for instrumentation.
 const (
-	intervalLabel     = "interval"
+	namespace = "prometheus"
+
 	ruleTypeLabel     = "rule_type"
 	alertingRuleType  = "alerting"
 	recordingRuleType = "recording"
@@ -42,19 +43,18 @@ const (
 var (
 	evalDuration = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name: "prometheus_rule_evaluation_duration_ms",
-			Help: "The duration for a rule to execute.",
+			Namespace: namespace,
+			Name:      "rule_evaluation_duration_milliseconds",
+			Help:      "The duration for a rule to execute.",
 		},
 		[]string{ruleTypeLabel},
 	)
-	iterationDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "prometheus_evaluator_duration_ms",
-			Help:       "The duration for each evaluation pool to execute.",
-			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
-		},
-		[]string{intervalLabel},
-	)
+	iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  namespace,
+		Name:       "evaluator_duration_milliseconds",
+		Help:       "The duration for all evaluations to execute.",
+		Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
+	})
 )
 
 func init() {
@@ -124,7 +124,7 @@ func (m *ruleManager) Run() {
 		case <-ticker.C:
 			start := time.Now()
 			m.runIteration(m.results)
-			iterationDuration.WithLabelValues(m.interval.String()).Observe(float64(time.Since(start) / time.Millisecond))
+			iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
 		case <-m.done:
 			glog.Info("rules.Rule manager exiting...")
 			return
diff --git a/storage/metric/tiered/curator.go b/storage/metric/tiered/curator.go
index ba2c63b3a..ace1d58eb 100644
--- a/storage/metric/tiered/curator.go
+++ b/storage/metric/tiered/curator.go
@@ -47,16 +47,18 @@ const (
 var (
 	curationDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_curation_durations_ms",
-			Help:       "Histogram of time spent in curation (ms).",
+			Namespace:  namespace,
+			Name:       "curation_durations_milliseconds",
+			Help:       "Histogram of time spent in curation.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{cutOff, processorName, result},
 	)
 	curationFilterOperations = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_curation_filter_operations_total",
-			Help: "The number of curation filter operations completed.",
+			Namespace: namespace,
+			Name:      "curation_filter_operations_total",
+			Help:      "The number of curation filter operations completed.",
 		},
 		[]string{cutOff, processorName, result},
 	)
diff --git a/storage/metric/tiered/tiered.go b/storage/metric/tiered/tiered.go
index cc02e8480..cc5e4fc58 100644
--- a/storage/metric/tiered/tiered.go
+++ b/storage/metric/tiered/tiered.go
@@ -33,6 +33,8 @@ import (
 
 // Constants for instrumentation.
 const (
+	namespace = "prometheus"
+
 	operation = "operation"
 	success   = "success"
 	failure   = "failure"
@@ -51,24 +53,22 @@ const (
 	queue          = "queue"
 	appendToDisk   = "append_to_disk"
 	viewGeneration = "view_generation"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 var (
 	storageLatency = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_metric_disk_latency_microseconds",
-			Help:       "Latency for metric disk operations in microseconds.",
+			Namespace:  namespace,
+			Name:       "metric_disk_latency_milliseconds",
+			Help:       "Latency for metric disk operations (includes any storage drive even if it is not strictly a disk, e.g. SSD).",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{operation, result},
 	)
 	storedSamplesCount = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_stored_samples_total",
-		Help: "The number of samples that have been stored.",
+		Namespace: namespace,
+		Name:      "stored_samples_total",
+		Help:      "The number of samples that have been stored.",
 	})
 )
 
@@ -145,7 +145,8 @@ type TieredStorage struct {
 	dtoSampleKeys *dtoSampleKeyList
 	sampleKeys    *sampleKeyList
 
-	queueSizes *prometheus.GaugeVec
+	queueLength   *prometheus.GaugeVec
+	queueCapacity *prometheus.GaugeVec
 }
 
 // viewJob encapsulates a request to extract sample values from the datastore.
@@ -159,10 +160,9 @@ type viewJob struct {
 
 const (
 	tieredMemorySemaphores = 5
+	watermarkCacheLimit    = 1024 * 1024
 )
 
-const watermarkCacheLimit = 1024 * 1024
-
 // NewTieredStorage returns a TieredStorage object ready to use.
 func NewTieredStorage(
 	appendToDiskQueueDepth,
@@ -208,14 +208,25 @@ func NewTieredStorage(
 		dtoSampleKeys: newDtoSampleKeyList(10),
 		sampleKeys:    newSampleKeyList(10),
 
-		queueSizes: prometheus.NewGaugeVec(
+		queueLength: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
-				Name: "prometheus_storage_queue_sizes_total",
-				Help: "The various sizes and capacities of the storage queues.",
+				Namespace: namespace,
+				Name:      "storage_queue_length",
+				Help:      "The number of items in the storage queues.",
 			},
-			[]string{queue, facet},
+			[]string{queue},
+		),
+		queueCapacity: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "storage_queue_capacity",
+				Help:      "The capacity of the storage queues.",
+			},
+			[]string{queue},
 		),
 	}
+	s.queueCapacity.WithLabelValues(appendToDisk).Set(float64(appendToDiskQueueDepth))
+	s.queueCapacity.WithLabelValues(viewGeneration).Set(float64(viewQueueDepth))
 
 	for i := 0; i < tieredMemorySemaphores; i++ {
 		s.memorySemaphore <- true
@@ -444,13 +455,13 @@ func (t *TieredStorage) renderView(viewJob viewJob) {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: success},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		} else {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: failure},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		}
 	}()
@@ -788,23 +799,15 @@ func (t *TieredStorage) GetMetricForFingerprint(f *clientmodel.Fingerprint) (cli
 
 // Describe implements prometheus.Collector.
 func (t *TieredStorage) Describe(ch chan<- *prometheus.Desc) {
-	t.queueSizes.Describe(ch)
+	t.queueLength.Describe(ch)
+	t.queueCapacity.Describe(ch)
 }
 
 // Collect implements prometheus.Collector.
 func (t *TieredStorage) Collect(ch chan<- prometheus.Metric) {
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: occupancy,
-	}).Set(float64(len(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: capacity,
-	}).Set(float64(cap(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: occupancy,
-	}).Set(float64(len(t.ViewQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: capacity,
-	}).Set(float64(cap(t.ViewQueue)))
+	t.queueLength.WithLabelValues(appendToDisk).Set(float64(len(t.appendToDiskQueue)))
+	t.queueLength.WithLabelValues(viewGeneration).Set(float64(len(t.ViewQueue)))
 
-	t.queueSizes.Collect(ch)
+	t.queueLength.Collect(ch)
+	t.queueCapacity.Collect(ch)
 }
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index 6ef4f3c04..50e083765 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -34,14 +34,13 @@ const (
 
 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "remote_tsdb"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 // TSDBClient defines an interface for sending a batch of samples to an
@@ -59,9 +58,10 @@ type TSDBQueueManager struct {
 	sendSemaphore  chan bool
 	drained        chan bool
 
-	samplesCount *prometheus.CounterVec
-	sendLatency  *prometheus.SummaryVec
-	queueSize    *prometheus.GaugeVec
+	samplesCount  *prometheus.CounterVec
+	sendLatency   *prometheus.SummaryVec
+	queueLength   prometheus.Gauge
+	queueCapacity prometheus.Metric
 }
 
 // NewTSDBQueueManager builds a new TSDBQueueManager.
@@ -74,24 +74,36 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {
 
 		samplesCount: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
-				Name: "prometheus_remote_tsdb_sent_samples_total",
-				Help: "Total number of samples processed to be sent to remote TSDB.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_samples_total",
+				Help:      "Total number of processed samples to be sent to remote TSDB.",
 			},
 			[]string{result},
 		),
 		sendLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_remote_tsdb_latency_ms",
-				Help: "Latency quantiles for sending samples to the remote TSDB in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_latency_milliseconds",
+				Help:      "Latency quantiles for sending samples to the remote TSDB.",
 			},
 			[]string{result},
 		),
-		queueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_remote_tsdb_queue_size_total",
-				Help: "The size and capacity of the queue of samples to be sent to the remote TSDB.",
-			},
-			[]string{facet},
+		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of processed samples queued to be sent to the remote TSDB.",
+		}),
+		queueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the queue of samples to be sent to the remote TSDB.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(queueCapacity),
 		),
 	}
 }
@@ -122,16 +134,17 @@ func (t *TSDBQueueManager) Close() {
 func (t *TSDBQueueManager) Describe(ch chan<- *prometheus.Desc) {
 	t.samplesCount.Describe(ch)
 	t.sendLatency.Describe(ch)
-	t.queueSize.Describe(ch)
+	ch <- t.queueLength.Desc()
+	ch <- t.queueCapacity.Desc()
 }
 
 // Collect implements prometheus.Collector.
 func (t *TSDBQueueManager) Collect(ch chan<- prometheus.Metric) {
 	t.samplesCount.Collect(ch)
 	t.sendLatency.Collect(ch)
-	t.queueSize.WithLabelValues(occupancy).Set(float64(len(t.queue)))
-	t.queueSize.WithLabelValues(capacity).Set(float64(cap(t.queue)))
-	t.queueSize.Collect(ch)
+	t.queueLength.Set(float64(len(t.queue)))
+	ch <- t.queueLength
+	ch <- t.queueCapacity
 }
 
 func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {