Merge pull request #449 from prometheus/beorn7/fix-partitioning-by-outcome

Adjust the partitioning by outcome.
Björn Rabenstein 2015-01-13 18:48:54 +01:00
commit 7ca29308e6
2 changed files with 40 additions and 32 deletions

notification/notification.go

@@ -39,11 +39,6 @@ const (
 const (
 	namespace = "prometheus"
 	subsystem = "notifications"
-
-	result  = "result"
-	success = "success"
-	failure = "failure"
-	dropped = "dropped"
 )
 
 var (
@ -88,7 +83,9 @@ type NotificationHandler struct {
// HTTP client with custom timeout settings. // HTTP client with custom timeout settings.
httpClient httpPoster httpClient httpPoster
notificationLatency *prometheus.SummaryVec notificationLatency prometheus.Summary
notificationErrors prometheus.Counter
notificationDropped prometheus.Counter
notificationsQueueLength prometheus.Gauge notificationsQueueLength prometheus.Gauge
notificationsQueueCapacity prometheus.Metric notificationsQueueCapacity prometheus.Metric
@@ -103,15 +100,24 @@ func NewNotificationHandler(alertmanagerURL string, notificationQueueCapacity in
 		httpClient: utility.NewDeadlineClient(*deadline),
 
-		notificationLatency: prometheus.NewSummaryVec(
-			prometheus.SummaryOpts{
-				Namespace: namespace,
-				Subsystem: subsystem,
-				Name:      "latency_milliseconds",
-				Help:      "Latency quantiles for sending alert notifications.",
-			},
-			[]string{result},
-		),
+		notificationLatency: prometheus.NewSummary(prometheus.SummaryOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "latency_milliseconds",
+			Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
+		}),
+		notificationErrors: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "errors_total",
+			Help:      "Total number of errors sending alert notifications.",
+		}),
+		notificationDropped: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "dropped_total",
+			Help:      "Total number of alert notifications dropped due to alert manager missing in configuration.",
+		}),
 		notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -175,22 +181,19 @@ func (n *NotificationHandler) Run() {
 	for reqs := range n.pendingNotifications {
 		if n.alertmanagerURL == "" {
 			glog.Warning("No alert manager configured, not dispatching notification")
-			n.notificationLatency.WithLabelValues(dropped).Observe(0)
+			n.notificationDropped.Inc()
 			continue
 		}
 
 		begin := time.Now()
 		err := n.sendNotifications(reqs)
 
-		labelValue := success
 		if err != nil {
 			glog.Error("Error sending notification: ", err)
-			labelValue = failure
+			n.notificationErrors.Inc()
 		}
 
-		n.notificationLatency.WithLabelValues(labelValue).Observe(
-			float64(time.Since(begin) / time.Millisecond),
-		)
+		n.notificationLatency.Observe(float64(time.Since(begin) / time.Millisecond))
 	}
 	close(n.stopped)
 }
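
For readers following along, here is a minimal, self-contained sketch of the metrics pattern the notification handler moves to: latency in one unpartitioned Summary, with a dedicated Counter per outcome instead of a "result" label. It uses github.com/prometheus/client_golang/prometheus; the notify and send helpers are illustrative stand-ins, not code from this repository.

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	latency = prometheus.NewSummary(prometheus.SummaryOpts{
		Namespace: "prometheus",
		Subsystem: "notifications",
		Name:      "latency_milliseconds",
		Help:      "Latency quantiles for sending alert notifications.",
	})
	sendErrors = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "prometheus",
		Subsystem: "notifications",
		Name:      "errors_total",
		Help:      "Total number of errors sending alert notifications.",
	})
	droppedTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "prometheus",
		Subsystem: "notifications",
		Name:      "dropped_total",
		Help:      "Total number of alert notifications dropped.",
	})
)

// send is an illustrative stand-in for posting to the alert manager.
func send() error { return errors.New("alert manager unreachable") }

// notify mirrors the post-change Run loop: drops and errors increment their
// own counters, and latency is observed once per attempted send.
func notify(configured bool) {
	if !configured {
		droppedTotal.Inc() // dropped sends no longer pollute the latency summary
		return
	}
	begin := time.Now()
	if err := send(); err != nil {
		sendErrors.Inc() // a counter replaces the old "failure" label value
	}
	latency.Observe(float64(time.Since(begin) / time.Millisecond))
}

func main() {
	prometheus.MustRegister(latency, sendErrors, droppedTotal)
	notify(false) // increments ..._dropped_total only
	notify(true)  // observes latency and, here, increments ..._errors_total
}

The evident motivation sits in the removed line n.notificationLatency.WithLabelValues(dropped).Observe(0): recording a zero latency for every dropped notification skewed the summary's quantiles, which is why the new Help string reads "not including dropped notifications".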

storage/remote/queue_manager.go

@@ -59,7 +59,8 @@ type TSDBQueueManager struct {
 	drained chan bool
 
 	samplesCount  *prometheus.CounterVec
-	sendLatency   *prometheus.SummaryVec
+	sendLatency   prometheus.Summary
+	sendErrors    prometheus.Counter
 	queueLength   prometheus.Gauge
 	queueCapacity prometheus.Metric
 }
@@ -81,15 +82,18 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {
 			},
 			[]string{result},
 		),
-		sendLatency: prometheus.NewSummaryVec(
-			prometheus.SummaryOpts{
-				Namespace: namespace,
-				Subsystem: subsystem,
-				Name:      "sent_latency_milliseconds",
-				Help:      "Latency quantiles for sending samples to the remote TSDB.",
-			},
-			[]string{result},
-		),
+		sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "sent_latency_milliseconds",
+			Help:      "Latency quantiles for sending sample batches to the remote TSDB.",
+		}),
+		sendErrors: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "sent_errors_total",
+			Help:      "Total number of errors sending sample batches to the remote TSDB.",
+		}),
 		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -164,9 +168,10 @@ func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {
 	if err != nil {
 		glog.Warningf("error sending %d samples to TSDB: %s", len(s), err)
 		labelValue = failure
+		t.sendErrors.Inc()
 	}
 	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
-	t.sendLatency.WithLabelValues(labelValue).Observe(float64(duration))
+	t.sendLatency.Observe(float64(duration))
 }
 
 // Run continuously sends samples to the TSDB.
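
The queue manager makes the same move but keeps the "result" label on samplesCount, where per-outcome counts remain meaningful. Below is a hedged sketch of the resulting mix; the subsystem value and the samplesCount metric name are assumptions, since neither appears in this diff, and the store callback stands in for the real TSDB client.

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// The sample count keeps its per-outcome partitioning via a CounterVec.
	samplesCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "prometheus",
			Subsystem: "remote_tsdb", // assumed; the subsystem constant is not shown in the diff
			Name:      "sent_samples_total", // assumed name; only the labels appear in the diff
			Help:      "Total number of samples sent to the remote TSDB, partitioned by result.",
		},
		[]string{"result"},
	)
	// Latency and errors are no longer partitioned by outcome.
	sendLatency = prometheus.NewSummary(prometheus.SummaryOpts{
		Namespace: "prometheus",
		Subsystem: "remote_tsdb",
		Name:      "sent_latency_milliseconds",
		Help:      "Latency quantiles for sending sample batches to the remote TSDB.",
	})
	sendErrors = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "prometheus",
		Subsystem: "remote_tsdb",
		Name:      "sent_errors_total",
		Help:      "Total number of errors sending sample batches to the remote TSDB.",
	})
)

// sendSamples mirrors the post-change control flow: count samples by result,
// increment the error counter on failure, observe latency once per batch.
func sendSamples(n int, store func() error) {
	begin := time.Now()
	err := store()

	labelValue := "success"
	if err != nil {
		labelValue = "failure"
		sendErrors.Inc()
	}
	samplesCount.WithLabelValues(labelValue).Add(float64(n))
	sendLatency.Observe(float64(time.Since(begin) / time.Millisecond))
}

func main() {
	prometheus.MustRegister(samplesCount, sendLatency, sendErrors)

	sendSamples(100, func() error { return nil })                         // success path
	sendSamples(100, func() error { return errors.New("tsdb timeout") }) // failure path
}

Counters partition cleanly by outcome, since rates can be summed or compared per label value; summary quantiles computed over a thin "failure" population are much harder to interpret, which is presumably why only the latency metric loses its partitioning here.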