From 4e78805496b095818a98ea19ee197321290b7fee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Krupa=20=28paulfantom=29?=
Date: Sun, 10 Apr 2022 15:36:24 +0200
Subject: [PATCH 1/3] postgres_mixin: jsonnify alerts file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Paweł Krupa (paulfantom)
---
 postgres_mixin/alerts/alerts.libsonnet        |  1 +
 postgres_mixin/alerts/alerts.yaml             | 57 -------------
 postgres_mixin/alerts/postgres.libsonnet      | 83 +++++++++++++++++++
 .../dashboards/dashboards.libsonnet           |  5 ++
 postgres_mixin/mixin.libsonnet                | 15 +---
 5 files changed, 91 insertions(+), 70 deletions(-)
 create mode 100644 postgres_mixin/alerts/alerts.libsonnet
 delete mode 100644 postgres_mixin/alerts/alerts.yaml
 create mode 100644 postgres_mixin/alerts/postgres.libsonnet
 create mode 100644 postgres_mixin/dashboards/dashboards.libsonnet

diff --git a/postgres_mixin/alerts/alerts.libsonnet b/postgres_mixin/alerts/alerts.libsonnet
new file mode 100644
index 00000000..7f70d8a8
--- /dev/null
+++ b/postgres_mixin/alerts/alerts.libsonnet
@@ -0,0 +1 @@
+(import 'postgres.libsonnet')
diff --git a/postgres_mixin/alerts/alerts.yaml b/postgres_mixin/alerts/alerts.yaml
deleted file mode 100644
index ff3d75a7..00000000
--- a/postgres_mixin/alerts/alerts.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
----
-groups:
-  - name: PostgreSQL
-    rules:
-      - alert: PostgreSQLMaxConnectionsReached
-        expr: sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)
-        for: 1m
-        labels:
-          severity: email
-        annotations:
-          summary: "{{ $labels.instance }} has maxed out Postgres connections."
-          description: "{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy."
-
-      - alert: PostgreSQLHighConnections
-        expr: sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8
-        for: 10m
-        labels:
-          severity: email
-        annotations:
-          summary: "{{ $labels.instance }} is over 80% of max Postgres connections."
-          description: "{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is mostly likely)."
-
-      - alert: PostgreSQLDown
-        expr: pg_up != 1
-        for: 1m
-        labels:
-          severity: email
-        annotations:
-          summary: "PostgreSQL is not processing queries: {{ $labels.instance }}"
-          description: "{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be effected provided at least 1 node is still alive."
-
-      - alert: PostgreSQLSlowQueries
-        expr: avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60
-        for: 2m
-        labels:
-          severity: email
-        annotations:
-          summary: "PostgreSQL high number of slow on {{ $labels.cluster }} for database {{ $labels.datname }} "
-          description: "PostgreSQL high number of slow queries {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} "
-
-      - alert: PostgreSQLQPS
-        expr: avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000
-        for: 5m
-        labels:
-          severity: email
-        annotations:
-          summary: "PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}"
-          description: "PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"
-
-      - alert: PostgreSQLCacheHitRatio
-        expr: avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98
-        for: 5m
-        labels:
-          severity: email
-        annotations:
-          summary: "PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}"
-          description: "PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"
diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet
new file mode 100644
index 00000000..29e33923
--- /dev/null
+++ b/postgres_mixin/alerts/postgres.libsonnet
@@ -0,0 +1,83 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'PostgreSQL',
+        rules: [
+          {
+            alert: 'PostgreSQLMaxConnectionsReached',
+            annotations: {
+              description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.',
+              summary: '{{ $labels.instance }} has maxed out Postgres connections.',
+            },
+            expr: 'sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)',
+            'for': '1m',
+            labels: {
+              severity: 'email',
+            },
+          },
+          {
+            alert: 'PostgreSQLHighConnections',
+            annotations: {
+              description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is mostly likely).',
+              summary: '{{ $labels.instance }} is over 80% of max Postgres connections.',
+            },
+            expr: 'sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8',
+            'for': '10m',
+            labels: {
+              severity: 'email',
+            },
+          },
+          {
+            alert: 'PostgreSQLDown',
+            annotations: {
+              description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be effected provided at least 1 node is still alive.',
+              summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}',
+            },
+            expr: 'pg_up != 1',
+            'for': '1m',
+            labels: {
+              severity: 'email',
+            },
+          },
+          {
+            alert: 'PostgreSQLSlowQueries',
+            annotations: {
+              description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} ',
+              summary: 'PostgreSQL high number of slow on {{ $labels.cluster }} for database {{ $labels.datname }} ',
+            },
+            expr: 'avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60',
+            'for': '2m',
+            labels: {
+              severity: 'email',
+            },
+          },
+          {
+            alert: 'PostgreSQLQPS',
+            annotations: {
+              description: 'PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
+              summary: 'PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}',
+            },
+            expr: 'avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000',
+            'for': '5m',
+            labels: {
+              severity: 'email',
+            },
+          },
+          {
+            alert: 'PostgreSQLCacheHitRatio',
+            annotations: {
+              description: 'PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
+              summary: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}',
+            },
+            expr: 'avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98',
+            'for': '5m',
+            labels: {
+              severity: 'email',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/postgres_mixin/dashboards/dashboards.libsonnet b/postgres_mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 00000000..d55f1ef5
--- /dev/null
+++ b/postgres_mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,5 @@
+{
+  grafanaDashboards+:: {
+    'postgres-overview.json': (import 'postgres-overview.json'),
+  },
+}
diff --git a/postgres_mixin/mixin.libsonnet b/postgres_mixin/mixin.libsonnet
index 98c7ea7d..6dac631c 100644
--- a/postgres_mixin/mixin.libsonnet
+++ b/postgres_mixin/mixin.libsonnet
@@ -1,13 +1,2 @@
-{
-  grafanaDashboards: {
-    'postgres-overview.json': (import 'dashboards/postgres-overview.json'),
-  },
-
-  // Helper function to ensure that we don't override other rules, by forcing
-  // the patching of the groups list, and not the overall rules object.
-  local importRules(rules) = {
-    groups+: std.native('parseYaml')(rules)[0].groups,
-  },
-
-  prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
-}
+(import 'alerts/alerts.libsonnet') +
+(import 'dashboards/dashboards.libsonnet')
\ No newline at end of file

From aa38fa6ba641d1656b3a62121bd72b6c5e93d153 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Krupa=20=28paulfantom=29?=
Date: Sun, 10 Apr 2022 16:22:25 +0200
Subject: [PATCH 2/3] postgres_mixin/alerts: change severity to "warning"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changing severity to align with mixin recommendations from
https://github.com/monitoring-mixins/docs#guidelines-for-alert-names-labels-and-annotations

Alert expressions are also rewritten as multi-line PromQL, and every
metric selector gains %(postgresExporterSelector)s, which is filled in
from $._config (config.libsonnet is added by PATCH 3/3).

Signed-off-by: Paweł Krupa (paulfantom)
---
 postgres_mixin/alerts/postgres.libsonnet | 68 +++++++++++++++++++-----
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/postgres_mixin/alerts/postgres.libsonnet b/postgres_mixin/alerts/postgres.libsonnet
index 29e33923..d70f669d 100644
--- a/postgres_mixin/alerts/postgres.libsonnet
+++ b/postgres_mixin/alerts/postgres.libsonnet
@@ -10,10 +10,16 @@
               description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.',
               summary: '{{ $labels.instance }} has maxed out Postgres connections.',
             },
-            expr: 'sum(pg_stat_activity_count) by (instance) >= sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)',
+            expr: |||
+              sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
+              >=
+              sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
+              -
+              sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
+            ||| % $._config,
             'for': '1m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
           {
@@ -22,10 +28,18 @@
               description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is mostly likely).',
               summary: '{{ $labels.instance }} is over 80% of max Postgres connections.',
             },
-            expr: 'sum(pg_stat_activity_count) by (instance) > (sum(pg_settings_max_connections) by (instance) - sum(pg_settings_superuser_reserved_connections) by (instance)) * 0.8',
+            expr: |||
+              sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
+              >
+              (
+                sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
+                -
+                sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
+              ) * 0.8
+            ||| % $._config,
             'for': '10m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
           {
@@ -34,10 +48,10 @@
               description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be effected provided at least 1 node is still alive.',
               summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}',
             },
-            expr: 'pg_up != 1',
+            expr: 'pg_up{%(postgresExporterSelector)s} != 1' % $._config,
             'for': '1m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
           {
@@ -46,10 +60,16 @@
               description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} ',
               summary: 'PostgreSQL high number of slow on {{ $labels.cluster }} for database {{ $labels.datname }} ',
             },
-            expr: 'avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60',
+            expr: |||
+              avg by (datname) (
+                rate(
+                  pg_stat_activity_max_tx_duration{datname!~"template.*",%(postgresExporterSelector)s}[2m]
+                )
+              ) > 2 * 60
+            ||| % $._config,
             'for': '2m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
           {
@@ -58,10 +78,20 @@
               description: 'PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
               summary: 'PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}',
             },
-            expr: 'avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000',
+            expr: |||
+              avg by (datname) (
+                irate(
+                  pg_stat_database_xact_commit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
+                )
+                +
+                irate(
+                  pg_stat_database_xact_rollback{datname!~"template.*",%(postgresExporterSelector)s}[5m]
+                )
+              ) > 10000
+            ||| % $._config,
             'for': '5m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
           {
@@ -70,10 +100,24 @@
               description: 'PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
               summary: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}',
             },
-            expr: 'avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98',
+            expr: |||
+              avg by (datname) (
+                rate(pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m])
+                /
+                (
+                  rate(
+                    pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
+                  )
+                  +
+                  rate(
+                    pg_stat_database_blks_read{datname!~"template.*",%(postgresExporterSelector)s}[5m]
+                  )
+                )
+              ) < 0.98
+            ||| % $._config,
             'for': '5m',
             labels: {
-              severity: 'email',
+              severity: 'warning',
             },
           },
         ],

From c95fd0d482d359844c8ef081c7e3da86c6fe2f74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Krupa=20=28paulfantom=29?=
Date: Sun, 10 Apr 2022 16:23:24 +0200
Subject: [PATCH 3/3] postgres_mixin: allow parametrization of mixin by using
 _config object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Paweł Krupa (paulfantom)
---
 postgres_mixin/config.libsonnet | 5 +++++
 postgres_mixin/mixin.libsonnet  | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 postgres_mixin/config.libsonnet

diff --git a/postgres_mixin/config.libsonnet b/postgres_mixin/config.libsonnet
new file mode 100644
index 00000000..d7bd7ac1
--- /dev/null
+++ b/postgres_mixin/config.libsonnet
@@ -0,0 +1,5 @@
+{
+  _config+:: {
+    postgresExporterSelector: '',
+  },
+}
diff --git a/postgres_mixin/mixin.libsonnet b/postgres_mixin/mixin.libsonnet
index 6dac631c..119d2cdd 100644
--- a/postgres_mixin/mixin.libsonnet
+++ b/postgres_mixin/mixin.libsonnet
@@ -1,2 +1,3 @@
 (import 'alerts/alerts.libsonnet') +
-(import 'dashboards/dashboards.libsonnet')
\ No newline at end of file
+(import 'dashboards/dashboards.libsonnet') +
+(import 'config.libsonnet')