mirror of
https://github.com/prometheus/prometheus
synced 2025-01-27 18:02:57 +00:00
Add Prometheus alerts from kube-prometheus, remove the alertmanager alerts.
Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
parent
dfbdf8d3bb
commit
8f42192e52
@ -5,7 +5,7 @@
|
|||||||
name: 'prometheus',
|
name: 'prometheus',
|
||||||
rules: [
|
rules: [
|
||||||
{
|
{
|
||||||
alert: 'PromBadConfig',
|
alert: 'PrometheusBadConfig',
|
||||||
expr: |||
|
expr: |||
|
||||||
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
|
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
@ -14,37 +14,134 @@
|
|||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
mesage: 'Prometheus failed to reload config, see container logs',
|
message: 'Prometheus failed to reload config, see container logs',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'PromAlertmanagerBadConfig',
|
alert: 'PrometheusNotificationQueueRunningFull',
|
||||||
expr: |||
|
expr: |||
|
||||||
alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
|
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
|
||||||
|
>
|
||||||
|
prometheus_notifications_queue_capacity{%(prometheusSelector)s}
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusErrorSendingAlerts',
|
||||||
|
expr: |||
|
||||||
|
100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusErrorSendingAlerts',
|
||||||
|
expr: |||
|
||||||
|
100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 3
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusNotConnectedToAlertmanagers',
|
||||||
|
expr: |||
|
||||||
|
prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '10m',
|
'for': '10m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Alertmanager failed to reload config, see container logs',
|
message: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'PromAlertsFailed',
|
alert: 'PrometheusTSDBReloadsFailing',
|
||||||
expr: |||
|
expr: |||
|
||||||
100 * rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) / rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) > 1
|
increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '5m',
|
'for': '12h',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Alertmanager failed to send {{ printf "%.1f" $value }}% alerts to {{ $labels.integration }}.',
|
message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'PromRemoteStorageFailures',
|
alert: 'PrometheusTSDBCompactionsFailing',
|
||||||
|
expr: |||
|
||||||
|
increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '12h',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusTSDBWALCorruptions',
|
||||||
|
expr: |||
|
||||||
|
prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '4h',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusNotIngestingSamples',
|
||||||
|
expr: |||
|
||||||
|
rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '10m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusTargetScrapesDuplicate',
|
||||||
|
expr: |||
|
||||||
|
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '10m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
message: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusRemoteStorageFailures',
|
||||||
expr: |||
|
expr: |||
|
||||||
(rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
|
(rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
|
||||||
/
|
/
|
||||||
@ -60,7 +157,7 @@
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'PromRuleFailures',
|
alert: 'PrometheusRuleFailures',
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
|
rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
|
||||||
|
Loading…
Reference in New Issue
Block a user