From e01c5cefac73db460a5c8fec53020a42a9b6bfbe Mon Sep 17 00:00:00 2001
From: beorn7
Date: Wed, 20 Nov 2024 12:58:03 +0100
Subject: [PATCH] notifier: fix increment of metric prometheus_notifications_errors_total

Previously, prometheus_notifications_errors_total was incremented by one
whenever a batch of alerts was affected by an error during sending to a
specific alertmanager. However, the corresponding metric
prometheus_notifications_sent_total, which counts all alerts that were
sent (including those where the send ended in an error), is incremented
by the batch size, i.e. the number of alerts. Therefore, the ratio used
in the mixin for the PrometheusErrorSendingAlertsToSomeAlertmanagers
alert is inconsistent.

This commit changes the increment of prometheus_notifications_errors_total
to the number of alerts that were sent in the attempt that ended in an
error. It also adjusts the metric's help string accordingly and makes the
wording of the alert in the mixin more precise.

Signed-off-by: beorn7
---
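For context, the PrometheusErrorSendingAlertsToSomeAlertmanagers alert
compares the two counters as a percentage, roughly like this (a sketch;
label selectors omitted and the rate window may differ, see the
alerts.libsonnet change below for the real expression):

    (
        rate(prometheus_notifications_errors_total[5m])
      /
        rate(prometheus_notifications_sent_total[5m])
    ) * 100 > 1

With the old behavior, a failed send of a batch of 100 alerts added 1 to
the numerator but 100 to the denominator, so the alert saw 1% instead of
100%. Incrementing errors_total by the batch size makes both counters
count individual alerts.
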
 CHANGELOG.md                                    | 1 +
 documentation/prometheus-mixin/alerts.libsonnet | 4 ++--
 notifier/notifier.go                            | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f9050cc9..41d2f9b92 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## unreleased
 
+* [CHANGE] Notifier: Increment the prometheus_notifications_errors_total metric by the number of affected alerts rather than by one per batch of affected alerts. #15428
 * [ENHANCEMENT] OTLP receiver: Convert also metric metadata. #15416
 
 ## 3.0.0 / 2024-11-14
diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index 563daab80..9a6de90d8 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -84,8 +84,8 @@
           severity: 'warning',
         },
         annotations: {
-          summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
-          description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
+          summary: 'More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.',
+          description: '{{ printf "%%.1f" $value }}%% of alerts sent by Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}} were affected by errors.' % $._config,
         },
       },
       {
diff --git a/notifier/notifier.go b/notifier/notifier.go
index 09a2005a3..956fd4652 100644
--- a/notifier/notifier.go
+++ b/notifier/notifier.go
@@ -160,7 +160,7 @@ func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanag
 				Namespace: namespace,
 				Subsystem: subsystem,
 				Name:      "errors_total",
-				Help:      "Total number of errors sending alert notifications.",
+				Help:      "Total number of sent alerts affected by errors.",
 			},
 			[]string{alertmanagerLabel},
 		),
@@ -619,13 +619,13 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 
 			go func(ctx context.Context, client *http.Client, url string, payload []byte, count int) {
 				if err := n.sendOne(ctx, client, url, payload); err != nil {
-					n.logger.Error("Error sending alert", "alertmanager", url, "count", count, "err", err)
-					n.metrics.errors.WithLabelValues(url).Inc()
+					n.logger.Error("Error sending alerts", "alertmanager", url, "count", count, "err", err)
+					n.metrics.errors.WithLabelValues(url).Add(float64(count))
 				} else {
 					numSuccess.Inc()
 				}
 				n.metrics.latency.WithLabelValues(url).Observe(time.Since(begin).Seconds())
-				n.metrics.sent.WithLabelValues(url).Add(float64(len(amAlerts)))
+				n.metrics.sent.WithLabelValues(url).Add(float64(count))
 				wg.Done()
 			}(ctx, ams.client, am.url().String(), payload, len(amAlerts))