prometheus-mixin: add HA-group aware alerts
There is certainly potential to add more of these. This commit is mostly meant to introduce the concept and to cover a few critical parts.

Signed-off-by: beorn7 <beorn@grafana.com>
parent cda52234eb
commit 371ca9ff46
@@ -60,26 +60,6 @@
              description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
            },
          },
          {
            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
            expr: |||
              min without(alertmanager) (
                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
              )
              * 100
              > 3
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
            },
          },
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            expr: |||
@@ -281,6 +261,123 @@
              description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' % $._config,
            },
          },
        ] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
        rulesWithoutHA:: [
          {
            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
            expr: |||
              min without (alertmanager) (
                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
              )
              * 100
              > 3
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
            },
          },
        ],
        rulesWithHA:: [
          {
            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
            expr: |||
              min by (%(prometheusHAGroupLabels)s) (
                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
              )
              * 100
              > 3
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Each Prometheus server in an HA group encounters more than 3% errors sending alerts to any Alertmanager.',
              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from any Prometheus server in HA group %(prometheusHAGroupName)s to any Alertmanager.' % $._config,
            },
          },
          {
            alert: 'PrometheusHAGroupNotIngestingSamples',
            expr: |||
              max by (%(prometheusHAGroupLabels)s) (
                rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m])
                and
                (
                  sum without(scrape_job) (prometheus_target_metadata_cache_entries{%(prometheusSelector)s}) > 0
                  or
                  sum without(rule_group) (prometheus_rule_group_rules{%(prometheusSelector)s}) > 0
                )
              )
              <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'A whole Prometheus HA group is not ingesting samples.',
              description: 'None of the Prometheus instances in HA group %(prometheusHAGroupName)s is ingesting any samples.' % $._config,
            },
          },
          // Both the following critical alerts, PrometheusHAGroupDown and
          // PrometheusHAGroupCrashlooping, fire if a whole HA group is
          // unhealthy. It is implied that a generic warning alert is in place
          // for individual instances being down or crashlooping.
          {
            alert: 'PrometheusHAGroupDown',
            expr: |||
              (
                count by (%(prometheusHAGroupLabels)s) (
                  avg_over_time(up{%(prometheusSelector)s}[5m]) < 0.5
                )
              /
                count by (%(prometheusHAGroupLabels)s) (
                  up{%(prometheusSelector)s}
                )
              )
              > 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'More than half of the Prometheus instances within the same HA group are down.',
              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have been up for less than half of the last 5m.' % $._config,
            },
          },
          {
            alert: 'PrometheusHAGroupCrashlooping',
            expr: |||
              (
                count by (%(prometheusHAGroupLabels)s) (
                  changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
                )
              /
                count by (%(prometheusHAGroupLabels)s) (
                  up{%(prometheusSelector)s}
                )
              )
              > 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
            },
          },
        ],
      },
    ],
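The switch between the two rule sets hinges on the "] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA" line above: the alternatives live in hidden fields (declared with ::), so they never appear in the rendered output on their own, and the visible rules list appends exactly one of them. Below is a minimal, self-contained sketch of that pattern; the field values are placeholders, not taken from the mixin.

// example.jsonnet, illustration only; evaluate with: jsonnet example.jsonnet
{
  // Set to e.g. 'cluster,job' to switch to the HA-group rule set.
  _config:: { prometheusHAGroupLabels: '' },
  rules: [
    'rule-that-is-always-present',
  ] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
  rulesWithoutHA:: ['per-instance-rule'],
  rulesWithHA:: ['ha-group-rule'],
}

With the empty default, the output contains only the per-instance entry; any non-empty prometheusHAGroupLabels swaps in the HA-group entry, which mirrors how the mixin selects between rulesWithoutHA and rulesWithHA.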
@@ -5,6 +5,16 @@
    // servers.
    prometheusSelector: 'job="prometheus"',

    // prometheusHAGroupLabels is a string with comma-separated labels
    // that are common labels of instances belonging to the same
    // high-availability group of Prometheus servers, i.e. identically
    // configured Prometheus servers. Include not only enough labels
    // to identify the members of the HA group, but also all common
    // labels you want to keep for resulting HA-group-level alerts.
    //
    // If this is set to an empty string, no HA-related alerts are applied.
    prometheusHAGroupLabels: '',

    // prometheusName is inserted into annotations to name the Prometheus
    // instance affected by the alert.
    prometheusName: '{{$labels.instance}}',
@@ -12,5 +22,10 @@
    // Operator, you can make use of the configured target labels for
    // nicer naming:
    // prometheusNameTemplate: '{{$labels.namespace}}/{{$labels.pod}}'

    // prometheusHAGroupName is inserted into annotations to name an
    // HA group. All labels used here must also be present in
    // prometheusHAGroupLabels above.
    prometheusHAGroupName: '{{$labels.job}}',
  },
}
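To turn the HA-group alerts on, a downstream consumer of this mixin overrides _config. The following is only a sketch: the import path, the 'cluster,job' label set, and the name template are assumptions for illustration, while the empty prometheusHAGroupLabels default and '{{$labels.job}}' above come from the mixin itself.

// Hypothetical downstream file. The import path depends on how the
// prometheus-mixin is vendored and is only an assumption here.
(import 'prometheus-mixin/mixin.libsonnet') + {
  _config+:: {
    // All labels referenced in prometheusHAGroupName must also be
    // listed in prometheusHAGroupLabels.
    prometheusHAGroupLabels: 'cluster,job',
    prometheusHAGroupName: '{{$labels.cluster}}/{{$labels.job}}',
  },
}

Leaving prometheusHAGroupLabels at its empty default keeps the previous behaviour: only rulesWithoutHA is rendered and none of the HA-group alerts are generated.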