Merge pull request #1429 from prometheus/beorn7/mixin
First iteration for the node mixin, 2nd attempt.
This commit is contained in:
commit
106b09b4ed
|
@ -0,0 +1,4 @@
|
|||
jsonnetfile.lock.json
|
||||
vendor
|
||||
*.yaml
|
||||
dashboards_out
|
|
@ -0,0 +1,28 @@
|
|||
# Builds the Node Mixin artefacts: Prometheus rule files (node_alerts.yaml,
# node_rules.yaml) and the Grafana dashboards directory (dashboards_out).

# Formatter invocation shared by `fmt` (rewrites in place) and `lint` (diff only).
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

# Lists every Jsonnet source in the tree, skipping vendored dependencies.
FIND_JSONNET := find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print

# The rule files are written via shell redirection; without this a failed
# jsonnet run would leave an empty but "up to date" target behind.
.DELETE_ON_ERROR:

# These names are commands, not files; a stray file called e.g. `lint` must
# not turn them into no-ops.
.PHONY: all fmt lint clean

all: fmt node_alerts.yaml node_rules.yaml dashboards_out lint

# Reformat all Jsonnet sources in place.
fmt:
	$(FIND_JSONNET) | \
		xargs -n 1 -- $(JSONNET_FMT) -i

node_alerts.yaml: alerts.jsonnet mixin.libsonnet config.libsonnet $(wildcard alerts/*)
	jsonnet -S alerts.jsonnet > $@

node_rules.yaml: rules.jsonnet mixin.libsonnet config.libsonnet $(wildcard rules/*)
	jsonnet -S rules.jsonnet > $@

dashboards_out: dashboards.jsonnet mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
	@mkdir -p dashboards_out
	jsonnet -J vendor -m dashboards_out dashboards.jsonnet
	# Overwriting existing files does not bump the directory mtime, so refresh
	# it explicitly; otherwise the target stays out of date after every edit.
	@touch $@

# Check formatting of every Jsonnet source and validate the generated rules.
# The loop remembers any failing diff: the exit status of a plain `while`
# would only reflect the last file checked.
lint: node_alerts.yaml node_rules.yaml
	$(FIND_JSONNET) | { \
		status=0; \
		while read -r f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || status=1; \
		done; \
		exit $$status; \
	}

	promtool check rules node_alerts.yaml node_rules.yaml

clean:
	rm -rf dashboards_out node_alerts.yaml node_rules.yaml
|
|
@ -0,0 +1,44 @@
|
|||
# Node Mixin
|
||||
|
||||
_This is work in progress. We aim for it to become a good role model for alerts
|
||||
and dashboards eventually, but it is not quite there yet._
|
||||
|
||||
The Node Mixin is a set of configurable, reusable, and extensible alerts and
|
||||
dashboards based on the metrics exported by the Node Exporter. The mixin creates
|
||||
recording and alerting rules for Prometheus and suitable dashboard descriptions
|
||||
for Grafana.
|
||||
|
||||
To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
|
||||
have a working Go development environment, it's easiest to run the following:
|
||||
```bash
|
||||
$ go get github.com/google/go-jsonnet/cmd/jsonnet
|
||||
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
|
||||
```
|
||||
|
||||
_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
|
||||
currently not included in the Go implementation of `jsonnet`. For the time
|
||||
being, you have to install the [C++ version of
|
||||
jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
|
||||
or `make fmt`._
|
||||
|
||||
Next, install the dependencies by running the following command in this
|
||||
directory:
|
||||
```bash
|
||||
$ jb install
|
||||
```
|
||||
|
||||
You can then build the Prometheus rules files `node_alerts.yaml` and
|
||||
`node_rules.yaml`:
|
||||
```bash
|
||||
$ make node_alerts.yaml node_rules.yaml
|
||||
```
|
||||
|
||||
You can also build a directory `dashboards_out` with the JSON dashboard files
|
||||
for Grafana:
|
||||
```bash
|
||||
$ make dashboards_out
|
||||
```
|
||||
|
||||
For more advanced uses of mixins, see
|
||||
https://github.com/monitoring-mixins/docs.
|
||||
|
|
@ -0,0 +1 @@
|
|||
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
|
|
@ -0,0 +1,191 @@
|
|||
{
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node-exporter',
|
||||
rules: [
|
||||
{
  // Warning: less than 40% space available, the 6h trend predicts the
  // filesystem to be full within 24h, and the filesystem is writable.
  alert: 'NodeFilesystemSpaceFillingUp',
  // The ratio is scaled to a percentage so that $value matches the "%"
  // printed in the description below.
  expr: |||
    (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
    and
      predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
    )
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
  },
},
|
||||
{
  // Critical: less than 20% space available, the 6h trend predicts the
  // filesystem to be full within 4h, and the filesystem is writable.
  alert: 'NodeFilesystemSpaceFillingUp',
  // The ratio is scaled to a percentage so that $value matches the "%"
  // printed in the description below.
  expr: |||
    (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
    and
      predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
    )
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'critical',
  },
  annotations: {
    summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
  },
},
|
||||
{
|
||||
alert: 'NodeFilesystemAlmostOutOfSpace',
|
||||
expr: |||
|
||||
(
|
||||
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
|
||||
)
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Filesystem has less than 5% space left.',
|
||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'NodeFilesystemAlmostOutOfSpace',
|
||||
expr: |||
|
||||
(
|
||||
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
|
||||
)
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Filesystem has less than 3% space left.',
|
||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
||||
},
|
||||
},
|
||||
{
  // Warning: less than 40% inodes free, the 6h trend predicts the filesystem
  // to run out of inodes within 24h, and the filesystem is writable.
  alert: 'NodeFilesystemFilesFillingUp',
  // The ratio is scaled to a percentage so that $value matches the "%"
  // printed in the description below.
  expr: |||
    (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
    and
      predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
    )
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
  },
},
|
||||
{
  // Critical: less than 20% inodes free, the 6h trend predicts the filesystem
  // to run out of inodes within 4h, and the filesystem is writable.
  alert: 'NodeFilesystemFilesFillingUp',
  // The ratio is scaled to a percentage so that $value matches the "%"
  // printed in the description below.
  expr: |||
    (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
    and
      predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
    )
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'critical',
  },
  annotations: {
    summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
  },
},
|
||||
{
|
||||
alert: 'NodeFilesystemAlmostOutOfFiles',
|
||||
expr: |||
|
||||
(
|
||||
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
|
||||
)
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Filesystem has less than 5% inodes left.',
|
||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'NodeFilesystemAlmostOutOfFiles',
|
||||
expr: |||
|
||||
(
|
||||
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
|
||||
)
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Filesystem has less than 3% inodes left.',
|
||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'NodeNetworkReceiveErrs',
|
||||
expr: |||
|
||||
increase(node_network_receive_errs_total[2m]) > 10
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Network interface is reporting many receive errors.',
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'NodeNetworkTransmitErrs',
|
||||
expr: |||
|
||||
increase(node_network_transmit_errs_total[2m]) > 10
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Network interface is reporting many transmit errors.',
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
_config+:: {
|
||||
// Selectors are inserted between {} in Prometheus queries.
|
||||
|
||||
// Select the metrics coming from the node exporter.
|
||||
nodeExporterSelector: 'job="node"',
|
||||
|
||||
// Select the fstype for filesystem-related queries. If left
|
||||
// empty, all filesystems are selected. If you have unusual
|
||||
// filesystems you don't want to include in dashboards and
|
||||
// alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
|
||||
fsSelector: '',
|
||||
|
||||
// Select the device for disk-related queries. If left empty, all
|
||||
// devices are selected. If you have unusual devices you don't
|
||||
// want to include in dashboards and alerting, you can exclude
|
||||
// them here, e.g. 'device!="tmpfs"'.
|
||||
diskDeviceSelector: '',
|
||||
|
||||
grafana_prefix: '',
|
||||
},
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
|
||||
|
||||
{
|
||||
[name]: dashboards[name]
|
||||
for name in std.objectFields(dashboards)
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
(import 'node.libsonnet') +
|
||||
(import 'use.libsonnet')
|
|
@ -0,0 +1,192 @@
|
|||
local grafana = import 'grafonnet/grafana.libsonnet';
|
||||
local dashboard = grafana.dashboard;
|
||||
local row = grafana.row;
|
||||
local prometheus = grafana.prometheus;
|
||||
local template = grafana.template;
|
||||
local graphPanel = grafana.graphPanel;
|
||||
local promgrafonnet = import 'promgrafonnet/promgrafonnet.libsonnet';
|
||||
local gauge = promgrafonnet.gauge;
|
||||
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
'nodes.json':
|
||||
// Per-CPU usage panel for the selected instance. The query plots the
// non-idle fraction (1 - idle rate), i.e. a value in the 0-1 range, which
// the 'percentunit' format renders as 0%-100%; the axis maximum is
// therefore 1, not 100.
local idleCPU =
  graphPanel.new(
    'CPU Usage',
    datasource='$datasource',
    span=6,
    format='percentunit',
    max=1,
    min=0,
  )
  .addTarget(prometheus.target(
    // TODO: Consider using `${__interval}` as range and a 1m min step.
    |||
      1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
    ||| % $._config,
    legendFormat='{{cpu}}',
    intervalFactor=10,
  ));
|
||||
|
||||
// TODO: Is this panel useful?
|
||||
local systemLoad =
|
||||
graphPanel.new(
|
||||
'Load Average',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='short',
|
||||
)
|
||||
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
|
||||
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
|
||||
.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
|
||||
|
||||
local memoryGraph =
|
||||
graphPanel.new(
|
||||
'Memory Usage',
|
||||
datasource='$datasource',
|
||||
span=9,
|
||||
format='bytes',
|
||||
)
|
||||
.addTarget(prometheus.target(
|
||||
|||
|
||||
(
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
)
|
||||
||| % $._config, legendFormat='memory used'
|
||||
))
|
||||
.addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
|
||||
.addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
|
||||
.addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
|
||||
|
||||
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
|
||||
// This needs to be added upstream in the promgrafonnet library and then changed here.
|
||||
local memoryGauge = gauge.new(
|
||||
'Memory Usage',
|
||||
|||
|
||||
100 -
|
||||
(
|
||||
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
/
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
* 100
|
||||
)
|
||||
||| % $._config,
|
||||
).withLowerBeingBetter();
|
||||
|
||||
// Disk I/O panel for the selected instance: bytes read, bytes written and
// I/O time per device. The configurable device selector is placed last in
// each matcher list: it may be empty, and PromQL accepts a trailing comma
// but not an empty matcher between two commas.
local diskIO =
  graphPanel.new(
    'Disk I/O',
    datasource='$datasource',
    span=9,
  )
  // TODO: Does it make sense to have those three in the same panel?
  // TODO: Consider using `${__interval}` as range and a 1m min step.
  .addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} read'))
  .addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} written'))
  .addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} io time')) +
  {
    // Overrides assigning series to the left (bytes) or right (seconds) axis.
    seriesOverrides: [
      {
        alias: 'read',
        yaxis: 1,
      },
      {
        alias: 'io time',
        yaxis: 2,
      },
    ],
    yaxes: [
      self.yaxe(format='bytes'),
      self.yaxe(format='s'),
    ],
  };
|
||||
|
||||
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
|
||||
// This needs to be added upstream in the promgrafonnet library and then changed here.
|
||||
// TODO: Should this be partitioned by mountpoint?
|
||||
// Gauge showing the used share of filesystem space (in percent) summed over
// all selected filesystems of the instance. The configurable fstype selector
// is placed last in each matcher list: it may be empty, and PromQL accepts a
// trailing comma but not an empty matcher between two commas.
local diskSpaceUsage = gauge.new(
  'Disk Space Usage',
  |||
    100 -
    (
      sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s})
      /
      sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s})
      * 100
    )
  ||| % $._config,
).withLowerBeingBetter();
|
||||
|
||||
local networkReceived =
|
||||
graphPanel.new(
|
||||
'Network Received',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='bytes',
|
||||
)
|
||||
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||
.addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
|
||||
local networkTransmitted =
|
||||
graphPanel.new(
|
||||
'Network Transmitted',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='bytes',
|
||||
)
|
||||
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||
.addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
|
||||
dashboard.new('Nodes', time_from='now-1h')
|
||||
.addTemplate(
|
||||
{
|
||||
current: {
|
||||
text: 'Prometheus',
|
||||
value: 'Prometheus',
|
||||
},
|
||||
hide: 0,
|
||||
label: null,
|
||||
name: 'datasource',
|
||||
options: [],
|
||||
query: 'prometheus',
|
||||
refresh: 1,
|
||||
regex: '',
|
||||
type: 'datasource',
|
||||
},
|
||||
)
|
||||
.addTemplate(
|
||||
template.new(
|
||||
'instance',
|
||||
'$datasource',
|
||||
'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config,
|
||||
refresh='time',
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
row.new()
|
||||
.addPanel(idleCPU)
|
||||
.addPanel(systemLoad)
|
||||
)
|
||||
.addRow(
|
||||
row.new()
|
||||
.addPanel(memoryGraph)
|
||||
.addPanel(memoryGauge)
|
||||
)
|
||||
.addRow(
|
||||
row.new()
|
||||
.addPanel(diskIO)
|
||||
.addPanel(diskSpaceUsage)
|
||||
)
|
||||
.addRow(
|
||||
row.new()
|
||||
.addPanel(networkReceived)
|
||||
.addPanel(networkTransmitted)
|
||||
),
|
||||
},
|
||||
}
|
|
@ -0,0 +1,225 @@
|
|||
local g = import 'grafana-builder/grafana.libsonnet';
|
||||
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
'node-cluster-rsrc-use.json':
|
||||
// Link attached to legend entries; it points at the per-node USE dashboard,
// which this mixin emits under the name 'node-rsrc-use.json'.
local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
|
||||
|
||||
g.dashboard('USE Method / Cluster')
|
||||
.addRow(
|
||||
g.row('CPU')
|
||||
.addPanel(
|
||||
g.panel('CPU Utilisation') +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
|
||||
*
|
||||
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
|
||||
/ ignoring (instance) group_left
|
||||
sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
|
||||
)
|
||||
||| % $._config, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
.addPanel(
|
||||
// TODO: Is this a useful panel? At least there should be some explanation how load
|
||||
// average relates to the "CPU saturation" in the title.
|
||||
g.panel('CPU Saturation (load1 per CPU)') +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
|
||||
/ ignoring (instance) group_left
|
||||
count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
|
||||
)
|
||||
||| % $._config, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Memory')
|
||||
.addPanel(
|
||||
g.panel('Memory Utilisation') +
|
||||
g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
.addPanel(
  g.panel('Memory Saturation (Swapped Pages)') +
  // The series name has to match the recording rule
  // 'instance:node_memory_swap_io_pages:rate1m'.
  g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
  g.stack +
  { yaxes: g.yaxes('rps') },
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Disk')
|
||||
.addPanel(
|
||||
g.panel('Disk IO Utilisation') +
|
||||
// Full utilisation would be all disks on each node spending an average of
|
||||
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
|
||||
// TODO: Does the partition by device make sense? Using the most utilized device per
|
||||
// instance might make more sense.
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
|
||||
/ ignoring (instance, device) group_left
|
||||
count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
|
||||
)
|
||||
||| % $._config, '{{instance}} {{device}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Disk IO Saturation') +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
|
||||
/ ignoring (instance, device) group_left
|
||||
count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
|
||||
)
|
||||
||| % $._config, '{{instance}} {{device}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Network')
|
||||
.addPanel(
|
||||
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
|
||||
g.queryPanel(
|
||||
[
|
||||
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
|
||||
'-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
|
||||
],
|
||||
['{{instance}} Receive', '{{instance}} Transmit'],
|
||||
legendLink,
|
||||
) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Net Saturation (Drops Receive/Transmit)') +
|
||||
g.queryPanel(
|
||||
[
|
||||
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
|
||||
'-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
|
||||
],
|
||||
['{{instance}} Receive', '{{instance}} Transmit'],
|
||||
legendLink,
|
||||
) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('rps') },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Storage')
|
||||
.addPanel(
|
||||
g.panel('Disk Space Utilisation') +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
sum without (device) (
|
||||
max without (fstype, mountpoint) (
|
||||
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
|
||||
)
|
||||
)
|
||||
/ ignoring (instance) group_left
|
||||
sum without (instance, device) (
|
||||
max without (fstype, mountpoint) (
|
||||
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
|
||||
)
|
||||
)
|
||||
)
|
||||
||| % $._config, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
),
|
||||
),
|
||||
|
||||
'node-rsrc-use.json':
|
||||
g.dashboard('USE Method / Node')
|
||||
.addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
|
||||
.addRow(
|
||||
g.row('CPU')
|
||||
.addPanel(
|
||||
g.panel('CPU Utilisation') +
|
||||
g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') +
|
||||
{ yaxes: g.yaxes('percentunit') },
|
||||
)
|
||||
.addPanel(
  // TODO: Is this a useful panel? At least there should be some explanation how load
  // average relates to the "CPU saturation" in the title.
  g.panel('CPU Saturation (Load1 per CPU)') +
  // The series name has to match the recording rule
  // 'instance:node_load1_per_cpu:ratio' (load1 divided by the number of CPUs).
  g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') +
  { yaxes: g.yaxes('percentunit') },
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Memory')
|
||||
.addPanel(
  g.panel('Memory Utilisation') +
  // Memory utilisation ratio of the selected instance, taken from the
  // recording rule 'instance:node_memory_utilisation:ratio'.
  g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') +
  { yaxes: g.yaxes('percentunit') },
)
|
||||
.addPanel(
|
||||
g.panel('Memory Saturation (pages swapped per second)') +
|
||||
g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') +
|
||||
{ yaxes: g.yaxes('short') },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Disk')
|
||||
.addPanel(
|
||||
g.panel('Disk IO Utilisation') +
|
||||
g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation {{device}}') +
|
||||
{ yaxes: g.yaxes('percentunit') },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Disk IO Saturation') +
|
||||
g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation {{device}}') +
|
||||
{ yaxes: g.yaxes('percentunit') },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Net')
|
||||
.addPanel(
|
||||
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
|
||||
g.queryPanel(
|
||||
[
|
||||
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
|
||||
'-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
|
||||
],
|
||||
['Receive', 'Transmit'],
|
||||
) +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Net Saturation (Drops Receive/Transmit)') +
|
||||
g.queryPanel(
|
||||
[
|
||||
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
|
||||
'-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
|
||||
],
|
||||
['Receive drops', 'Transmit drops'],
|
||||
) +
|
||||
{ yaxes: g.yaxes('rps') },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
g.row('Disk')
|
||||
.addPanel(
  g.panel('Disk Utilisation') +
  // Used share of filesystem space on the selected instance. The instance
  // matcher restricts the sums to the node chosen in the dashboard; it is
  // placed before the (possibly empty) fstype selector so the matcher list
  // stays valid PromQL either way.
  g.queryPanel(|||
    1 -
    (
      sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s}))
      /
      sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s}))
    )
  ||| % $._config, 'Disk') +
  { yaxes: g.yaxes('percentunit') },
),
|
||||
),
|
||||
},
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "grafonnet",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
},
|
||||
{
|
||||
"name": "grafana-builder",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/jsonnet-libs",
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
},
|
||||
{
|
||||
"name": "promgrafonnet",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
|
||||
"subdir": "lib/promgrafonnet"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
(import 'config.libsonnet') +
|
||||
(import 'alerts/alerts.libsonnet') +
|
||||
(import 'dashboards/dashboards.libsonnet') +
|
||||
(import 'rules/rules.libsonnet')
|
|
@ -0,0 +1 @@
|
|||
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules)
|
|
@ -0,0 +1,113 @@
|
|||
{
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node-exporter.rules',
|
||||
rules: [
|
||||
{
|
||||
// This rule gives the number of CPUs per node.
|
||||
record: 'instance:node_num_cpu:sum',
|
||||
expr: |||
|
||||
count without (cpu) (
|
||||
count without (mode) (
|
||||
node_cpu_seconds_total{%(nodeExporterSelector)s}
|
||||
)
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// CPU utilisation is % CPU is not idle.
|
||||
record: 'instance:node_cpu_utilisation:rate1m',
|
||||
expr: |||
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// This is CPU saturation: 1min avg run queue length / number of CPUs.
|
||||
// Can go over 1.
|
||||
// TODO: There are situation where a run queue >1/core is just normal and fine.
|
||||
// We need to clarify how to read this metric and if its usage is helpful at all.
|
||||
record: 'instance:node_load1_per_cpu:ratio',
|
||||
expr: |||
|
||||
(
|
||||
node_load1{%(nodeExporterSelector)s}
|
||||
/
|
||||
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// Memory utilisation (ratio of used memory per instance).
|
||||
record: 'instance:node_memory_utilisation:ratio',
|
||||
expr: |||
|
||||
1 - (
|
||||
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s}
|
||||
/
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_memory_swap_io_pages:rate1m',
|
||||
expr: |||
|
||||
(
|
||||
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
|
||||
+
|
||||
rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// Disk utilisation (seconds spent, 1 second rate).
|
||||
record: 'instance_device:node_disk_io_time_seconds:rate1m',
|
||||
expr: |||
|
||||
rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// Disk saturation (weighted seconds spent, 1 second rate).
|
||||
record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m',
|
||||
expr: |||
|
||||
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_network_receive_bytes_excluding_lo:rate1m',
|
||||
expr: |||
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_network_transmit_bytes_excluding_lo:rate1m',
|
||||
expr: |||
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
// TODO: Find out if those drops ever happen on modern switched networks.
|
||||
{
|
||||
record: 'instance:node_network_receive_drop_excluding_lo:rate1m',
|
||||
expr: |||
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_network_transmit_drop_excluding_lo:rate1m',
|
||||
expr: |||
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
Loading…
Reference in New Issue