Merge pull request #1429 from prometheus/beorn7/mixin

First iteration for the node mixin, 2nd attempt.
Björn Rabenstein 2019-07-23 23:14:15 +02:00 committed by GitHub
commit 106b09b4ed
14 changed files with 867 additions and 0 deletions

docs/node-mixin/.gitignore (new file)

@@ -0,0 +1,4 @@
jsonnetfile.lock.json
vendor
*.yaml
dashboards_out

docs/node-mixin/Makefile (new file)

@@ -0,0 +1,28 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

all: fmt node_alerts.yaml node_rules.yaml dashboards_out lint

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

node_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*)
	jsonnet -S alerts.jsonnet > $@

node_rules.yaml: mixin.libsonnet config.libsonnet $(wildcard rules/*)
	jsonnet -S rules.jsonnet > $@

dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
	@mkdir -p dashboards_out
	jsonnet -J vendor -m dashboards_out dashboards.jsonnet

lint: node_alerts.yaml node_rules.yaml
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done
	promtool check rules node_alerts.yaml node_rules.yaml

clean:
	rm -rf dashboards_out node_alerts.yaml node_rules.yaml

docs/node-mixin/README.md (new file)

@@ -0,0 +1,44 @@
# Node Mixin
_This is work in progress. We aim for it to become a good role model for alerts
and dashboards eventually, but it is not quite there yet._
The Node Mixin is a set of configurable, reusable, and extensible alerts and
dashboards based on the metrics exported by the Node Exporter. The mixin creates
recording and alerting rules for Prometheus and suitable dashboard descriptions
for Grafana.
To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
have a working Go development environment, it's easiest to run the following:
```bash
$ go get github.com/google/go-jsonnet/cmd/jsonnet
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
```
_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
currently not included in the Go implementation of `jsonnet`. For the time
being, you have to install the [C++ version of
jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
or `make fmt`._
Next, install the dependencies by running the following command in this
directory:
```bash
$ jb install
```
You can then build the Prometheus rules files `node_alerts.yaml` and
`node_rules.yaml`:
```bash
$ make node_alerts.yaml node_rules.yaml
```
You can also build a directory `dashboards_out` with the JSON dashboard files
for Grafana:
```bash
$ make dashboards_out
```
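If you want to adapt the mixin to your environment without editing
`config.libsonnet` in place, one common pattern is to create your own jsonnet
file that imports the mixin and overrides its config object. A minimal sketch
(the file name `custom.libsonnet` and the selector values below are only
examples, not part of this repository):

```jsonnet
// custom.libsonnet (hypothetical): import the mixin and override its _config.
(import 'mixin.libsonnet') + {
  _config+:: {
    // Match node exporter targets scraped under a different job name.
    nodeExporterSelector: 'job="node-exporter"',
    // Ignore tmpfs in filesystem alerts and dashboards.
    fsSelector: 'fstype!="tmpfs"',
  },
}
```

Rules and dashboards generated from such a file would then use the overridden
selectors; you would build from your own file instead of the stock
`mixin.libsonnet`.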
For more advanced uses of mixins, see
https://github.com/monitoring-mixins/docs.

docs/node-mixin/alerts.jsonnet (new file)

@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)

docs/node-mixin/alerts/alerts.libsonnet (new file)

@@ -0,0 +1,191 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'node-exporter',
rules: [
{
alert: 'NodeFilesystemSpaceFillingUp',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
},
},
{
alert: 'NodeFilesystemSpaceFillingUp',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
and
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
},
},
{
alert: 'NodeFilesystemAlmostOutOfSpace',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Filesystem has less than 5% space left.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
},
},
{
alert: 'NodeFilesystemAlmostOutOfSpace',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Filesystem has less than 3% space left.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
},
},
{
alert: 'NodeFilesystemFilesFillingUp',
expr: |||
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
and
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
},
},
{
alert: 'NodeFilesystemFilesFillingUp',
expr: |||
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
and
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
},
},
{
alert: 'NodeFilesystemAlmostOutOfFiles',
expr: |||
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Filesystem has less than 5% inodes left.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
},
},
{
alert: 'NodeFilesystemAlmostOutOfFiles',
expr: |||
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % $._config,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Filesystem has less than 3% inodes left.',
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
},
},
{
alert: 'NodeNetworkReceiveErrs',
expr: |||
increase(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) > 10
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Network interface is reporting many receive errors.',
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.',
},
},
{
alert: 'NodeNetworkTransmitErrs',
expr: |||
increase(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) > 10
||| % $._config,
'for': '1h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Network interface is reporting many transmit errors.',
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.',
},
},
],
},
],
},
}

docs/node-mixin/config.libsonnet (new file)

@@ -0,0 +1,22 @@
{
_config+:: {
// Selectors are inserted between {} in Prometheus queries.
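// For example, with the default below, a query template like
// node_load1{%(nodeExporterSelector)s} renders as node_load1{job="node"}.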
// Select the metrics coming from the node exporter.
nodeExporterSelector: 'job="node"',
// Select the fstype for filesystem-related queries. If left
// empty, all filesystems are selected. If you have unusual
// filesystems you don't want to include in dashboards and
// alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
fsSelector: '',
// Select the device for disk-related queries. If left empty, all
// devices are selected. If you have unusual devices you don't
// want to include in dashboards and alerting, you can exclude
// them here, e.g. 'device!="tmpfs"'.
diskDeviceSelector: '',
grafana_prefix: '',
},
}

docs/node-mixin/dashboards.jsonnet (new file)

@@ -0,0 +1,6 @@
local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
{
[name]: dashboards[name]
for name in std.objectFields(dashboards)
}

docs/node-mixin/dashboards/dashboards.libsonnet (new file)

@@ -0,0 +1,2 @@
(import 'node.libsonnet') +
(import 'use.libsonnet')

docs/node-mixin/dashboards/node.libsonnet (new file)

@@ -0,0 +1,192 @@
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local promgrafonnet = import 'promgrafonnet/promgrafonnet.libsonnet';
local gauge = promgrafonnet.gauge;
{
grafanaDashboards+:: {
'nodes.json':
local idleCPU =
graphPanel.new(
'Idle CPU',
datasource='$datasource',
span=6,
format='percentunit',
max=1,
min=0,
)
.addTarget(prometheus.target(
// TODO: Consider using `${__interval}` as range and a 1m min step.
|||
1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
||| % $._config,
legendFormat='{{cpu}}',
intervalFactor=10,
));
// TODO: Is this panel useful?
local systemLoad =
graphPanel.new(
'Load Average',
datasource='$datasource',
span=6,
format='short',
)
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
local memoryGraph =
graphPanel.new(
'Memory Usage',
datasource='$datasource',
span=9,
format='bytes',
)
.addTarget(prometheus.target(
|||
(
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
)
||| % $._config, legendFormat='memory used'
))
.addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
.addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
.addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
// This needs to be added upstream in the promgrafonnet library and then changed here.
local memoryGauge = gauge.new(
'Memory Usage',
|||
100 -
(
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
/
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
* 100
)
||| % $._config,
).withLowerBeingBetter();
local diskIO =
graphPanel.new(
'Disk I/O',
datasource='$datasource',
span=9,
)
// TODO: Does it make sense to have those three in the same panel?
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} read'))
.addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} written'))
.addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} io time')) +
{
seriesOverrides: [
{
alias: 'read',
yaxis: 1,
},
{
alias: 'io time',
yaxis: 2,
},
],
yaxes: [
self.yaxe(format='bytes'),
self.yaxe(format='s'),
],
};
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
// This needs to be added upstream in the promgrafonnet library and then changed here.
// TODO: Should this be partitioned by mountpoint?
local diskSpaceUsage = gauge.new(
'Disk Space Usage',
|||
100 -
(
sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
/
sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
* 100
)
||| % $._config,
).withLowerBeingBetter();
local networkReceived =
graphPanel.new(
'Network Received',
datasource='$datasource',
span=6,
format='bytes',
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
local networkTransmitted =
graphPanel.new(
'Network Transmitted',
datasource='$datasource',
span=6,
format='bytes',
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
dashboard.new('Nodes', time_from='now-1h')
.addTemplate(
{
current: {
text: 'Prometheus',
value: 'Prometheus',
},
hide: 0,
label: null,
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
},
)
.addTemplate(
template.new(
'instance',
'$datasource',
'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config,
refresh='time',
)
)
.addRow(
row.new()
.addPanel(idleCPU)
.addPanel(systemLoad)
)
.addRow(
row.new()
.addPanel(memoryGraph)
.addPanel(memoryGauge)
)
.addRow(
row.new()
.addPanel(diskIO)
.addPanel(diskSpaceUsage)
)
.addRow(
row.new()
.addPanel(networkReceived)
.addPanel(networkTransmitted)
),
},
}

docs/node-mixin/dashboards/use.libsonnet (new file)

@@ -0,0 +1,225 @@
local g = import 'grafana-builder/grafana.libsonnet';
{
grafanaDashboards+:: {
'node-cluster-rsrc-use.json':
local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
g.dashboard('USE Method / Cluster')
.addRow(
g.row('CPU')
.addPanel(
g.panel('CPU Utilisation') +
g.queryPanel(|||
(
instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
g.panel('CPU Saturation (load1 per CPU)') +
g.queryPanel(|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
)
.addRow(
g.row('Memory')
.addPanel(
g.panel('Memory Utilisation') +
g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
g.panel('Memory Saturation (Swapped Pages)') +
g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes('rps') },
)
)
.addRow(
g.row('Disk')
.addPanel(
g.panel('Disk IO Utilisation') +
// Full utilisation would be all disks on each node spending an average of
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
// TODO: Does the partition by device make sense? Using the most utilized device per
// instance might make more sense.
g.queryPanel(|||
(
instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel(|||
(
instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
)
.addRow(
g.row('Network')
.addPanel(
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
'-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
],
['{{instance}} Receive', '{{instance}} Transmit'],
legendLink,
) +
g.stack +
{ yaxes: g.yaxes('Bps') },
)
.addPanel(
g.panel('Net Saturation (Drops Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
'-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
],
['{{instance}} Receive', '{{instance}} Transmit'],
legendLink,
) +
g.stack +
{ yaxes: g.yaxes('rps') },
)
)
.addRow(
g.row('Storage')
.addPanel(
g.panel('Disk Space Utilisation') +
g.queryPanel(|||
(
sum without (device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
/ ignoring (instance) group_left
sum without (instance, device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
),
),
'node-rsrc-use.json':
g.dashboard('USE Method / Node')
.addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
.addRow(
g.row('CPU')
.addPanel(
g.panel('CPU Utilisation') +
g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
g.panel('CPU Saturation (Load1)') +
g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') +
{ yaxes: g.yaxes('percentunit') },
)
)
.addRow(
g.row('Memory')
.addPanel(
g.panel('Memory Utilisation') +
g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
g.panel('Memory Saturation (pages swapped per second)') +
g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') +
{ yaxes: g.yaxes('short') },
)
)
.addRow(
g.row('Disk')
.addPanel(
g.panel('Disk IO Utilisation') +
g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation {{device}}') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation {{device}}') +
{ yaxes: g.yaxes('percentunit') },
)
)
.addRow(
g.row('Net')
.addPanel(
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
'-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
],
['Receive', 'Transmit'],
) +
{ yaxes: g.yaxes('Bps') },
)
.addPanel(
g.panel('Net Saturation (Drops Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
'-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
],
['Receive drops', 'Transmit drops'],
) +
{ yaxes: g.yaxes('rps') },
)
)
.addRow(
g.row('Disk')
.addPanel(
g.panel('Disk Utilisation') +
g.queryPanel(|||
1 -
(
sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))
/
sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))
)
||| % $._config, 'Disk') +
{ yaxes: g.yaxes('percentunit') },
),
),
},
}

docs/node-mixin/jsonnetfile.json (new file)

@@ -0,0 +1,34 @@
{
"dependencies": [
{
"name": "grafonnet",
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib",
"subdir": "grafonnet"
}
},
"version": "master"
},
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/grafana/jsonnet-libs",
"subdir": "grafana-builder"
}
},
"version": "master"
},
{
"name": "promgrafonnet",
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": "lib/promgrafonnet"
}
},
"version": "master"
}
]
}

docs/node-mixin/mixin.libsonnet (new file)

@@ -0,0 +1,4 @@
(import 'config.libsonnet') +
(import 'alerts/alerts.libsonnet') +
(import 'dashboards/dashboards.libsonnet') +
(import 'rules/rules.libsonnet')

docs/node-mixin/rules.jsonnet (new file)

@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules)

docs/node-mixin/rules/rules.libsonnet (new file)

@@ -0,0 +1,113 @@
{
prometheusRules+:: {
groups+: [
{
name: 'node-exporter.rules',
rules: [
{
// This rule gives the number of CPUs per node.
record: 'instance:node_num_cpu:sum',
expr: |||
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{%(nodeExporterSelector)s}
)
)
||| % $._config,
},
{
// CPU utilisation is the share of CPU time spent in non-idle modes.
record: 'instance:node_cpu_utilisation:rate1m',
expr: |||
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
)
||| % $._config,
},
{
// This is CPU saturation: 1min avg run queue length / number of CPUs.
// Can go over 1.
// TODO: There are situations where a run queue >1/core is just normal and fine.
// We need to clarify how to read this metric and if its usage is helpful at all.
record: 'instance:node_load1_per_cpu:ratio',
expr: |||
(
node_load1{%(nodeExporterSelector)s}
/
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
)
||| % $._config,
},
{
// Memory utilisation (ratio of used memory per instance).
record: 'instance:node_memory_utilisation:ratio',
expr: |||
1 - (
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s}
/
node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
)
||| % $._config,
},
{
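// Memory saturation proxy: combined rate of page-in and page-out activity
// reported by vmstat.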
record: 'instance:node_memory_swap_io_pages:rate1m',
expr: |||
(
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
+
rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
)
||| % $._config,
},
{
// Disk utilisation (seconds spent, 1 second rate).
record: 'instance_device:node_disk_io_time_seconds:rate1m',
expr: |||
rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
||| % $._config,
},
{
// Disk saturation (weighted seconds spent, 1 second rate).
record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m',
expr: |||
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
||| % $._config,
},
{
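// Network utilisation: bytes received per second, summed per instance,
// excluding the loopback device.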
record: 'instance:node_network_receive_bytes_excluding_lo:rate1m',
expr: |||
sum without (device) (
rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},
{
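// Network utilisation: bytes transmitted per second, summed per instance,
// excluding the loopback device.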
record: 'instance:node_network_transmit_bytes_excluding_lo:rate1m',
expr: |||
sum without (device) (
rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},
// TODO: Find out if those drops ever happen on modern switched networks.
{
record: 'instance:node_network_receive_drop_excluding_lo:rate1m',
expr: |||
sum without (device) (
rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},
{
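// Transmit counterpart of the receive-drop rule above.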
record: 'instance:node_network_transmit_drop_excluding_lo:rate1m',
expr: |||
sum without (device) (
rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},
],
},
],
},
}