postgres_mixin: jsonnify alerts file

Signed-off-by: Paweł Krupa (paulfantom) <pawel@krupa.net.pl>
This commit is contained in:
Paweł Krupa (paulfantom) 2022-04-10 15:36:24 +02:00
parent 7e02b9bd9b
commit 4e78805496
5 changed files with 91 additions and 70 deletions

View File

@ -0,0 +1 @@
// This file evaluates to whatever postgres.libsonnet evaluates to; it adds
// nothing of its own.
import 'postgres.libsonnet'

View File

@ -1,57 +0,0 @@
---
# Prometheus alerting rules for PostgreSQL. Metric names (pg_up, pg_stat_*,
# pg_settings_*) are the ones referenced by the expressions below.
#
# Every rule carries `severity: email`.
groups:
  - name: PostgreSQL
    rules:
      # Connection count per instance has reached max_connections minus the
      # superuser-reserved slots. $value is the left-hand side of the
      # comparison, i.e. a connection count (no unit suffix).
      - alert: PostgreSQLMaxConnectionsReached
        expr: sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)
        for: 1m
        labels:
          severity: email
        annotations:
          summary: "{{ $labels.instance }} has maxed out Postgres connections."
          description: "{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy)."

      # Same quantities as above, but warns at 80% of the usable limit.
      - alert: PostgreSQLHighConnections
        expr: sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8
        for: 10m
        labels:
          severity: email
        annotations:
          summary: "{{ $labels.instance }} is over 80% of max Postgres connections."
          description: "{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely)."

      # pg_up is anything other than 1 for a minute.
      # NOTE(review): the DNS wording in the description looks specific to the
      # deployment this rule was written for - confirm it still applies.
      - alert: PostgreSQLDown
        expr: pg_up != 1
        for: 1m
        labels:
          severity: email
        annotations:
          summary: "PostgreSQL is not processing queries: {{ $labels.instance }}"
          description: "{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be affected provided at least 1 node is still alive."

      # Aggregated by (cluster, datname) so that the `cluster` label used in
      # the annotations survives the aggregation; with `by (datname)` alone it
      # would always render as an empty string.
      # NOTE(review): rate() is applied to pg_stat_activity_max_tx_duration;
      # confirm that metric is a counter and not a gauge.
      - alert: PostgreSQLSlowQueries
        expr: avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (cluster, datname) > 2 * 60
        for: 2m
        labels:
          severity: email
        annotations:
          summary: "PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }}"
          description: "PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"

      # Commit + rollback rate per database above 10000/s.
      - alert: PostgreSQLQPS
        expr: avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (cluster, datname) > 10000
        for: 5m
        labels:
          severity: email
        annotations:
          summary: "PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }}"
          description: "PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"

      # blks_hit / (blks_hit + blks_read) per database below 0.98.
      - alert: PostgreSQLCacheHitRatio
        expr: avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (cluster, datname) < 0.98
        for: 5m
        labels:
          severity: email
        annotations:
          summary: "PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}"
          description: "PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"

View File

@ -0,0 +1,83 @@
// PostgreSQL alerting rules, exposed as the hidden `prometheusAlerts` field.
//
// `groups+:` appends to any groups contributed by objects this one is mixed
// into, rather than replacing them. Every rule carries `severity: 'email'`.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'PostgreSQL',
        rules: [
          // Connection count per instance has reached max_connections minus
          // the superuser-reserved slots. $value is the left-hand side of the
          // comparison, i.e. a connection count (no unit suffix).
          {
            alert: 'PostgreSQLMaxConnectionsReached',
            annotations: {
              description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).',
              summary: '{{ $labels.instance }} has maxed out Postgres connections.',
            },
            expr: 'sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)',
            'for': '1m',
            labels: {
              severity: 'email',
            },
          },
          // Same quantities as above, but warns at 80% of the usable limit.
          {
            alert: 'PostgreSQLHighConnections',
            annotations: {
              description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely).',
              summary: '{{ $labels.instance }} is over 80% of max Postgres connections.',
            },
            expr: 'sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8',
            'for': '10m',
            labels: {
              severity: 'email',
            },
          },
          // pg_up is anything other than 1 for a minute.
          // NOTE(review): the DNS wording in the description looks specific
          // to the deployment this rule was written for - confirm it still
          // applies.
          {
            alert: 'PostgreSQLDown',
            annotations: {
              description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be affected provided at least 1 node is still alive.',
              summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}',
            },
            expr: 'pg_up != 1',
            'for': '1m',
            labels: {
              severity: 'email',
            },
          },
          // Aggregated by (cluster, datname) so that the `cluster` label used
          // in the annotations survives the aggregation; with `by (datname)`
          // alone it would always render as an empty string.
          // NOTE(review): rate() is applied to
          // pg_stat_activity_max_tx_duration; confirm that metric is a
          // counter and not a gauge.
          {
            alert: 'PostgreSQLSlowQueries',
            annotations: {
              description: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
              summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ $labels.datname }}',
            },
            expr: 'avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (cluster, datname) > 2 * 60',
            'for': '2m',
            labels: {
              severity: 'email',
            },
          },
          // Commit + rollback rate per database above 10000/s.
          {
            alert: 'PostgreSQLQPS',
            annotations: {
              description: 'PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
              summary: 'PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }}',
            },
            expr: 'avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (cluster, datname) > 10000',
            'for': '5m',
            labels: {
              severity: 'email',
            },
          },
          // blks_hit / (blks_hit + blks_read) per database below 0.98.
          {
            alert: 'PostgreSQLCacheHitRatio',
            annotations: {
              description: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
              summary: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}',
            },
            expr: 'avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (cluster, datname) < 0.98',
            'for': '5m',
            labels: {
              severity: 'email',
            },
          },
        ],
      },
    ],
  },
}

View File

@ -0,0 +1,5 @@
// Grafana dashboards, exposed as the hidden `grafanaDashboards` field.
// `+::` merges into any dashboards contributed by objects this one is mixed
// into. The import path is resolved relative to this file.
local overviewDashboard = import 'postgres-overview.json';

{
  grafanaDashboards+:: {
    'postgres-overview.json': overviewDashboard,
  },
}

View File

@ -1,13 +1,2 @@
// Pre-jsonnet form of the mixin: the dashboard is imported as JSON and the
// alert rules are read from a YAML file through the `parseYaml` native
// function.
local parseYaml = std.native('parseYaml');
local alertRulesYaml = importstr 'alerts/alerts.yaml';

{
  grafanaDashboards: {
    'postgres-overview.json': import 'dashboards/postgres-overview.json',
  },
  // Patch only the `groups` list (via `groups+:`) so that rules contributed
  // by other objects are extended and not overridden.
  prometheusAlerts+: {
    groups+: parseYaml(alertRulesYaml)[0].groups,
  },
}
// The mixin is the object sum of its two parts: alert rules and dashboards.
local alerts = import 'alerts/alerts.libsonnet';
local dashboards = import 'dashboards/dashboards.libsonnet';

alerts + dashboards