diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 57e585b8..898c912d 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -1,256 +1,7 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local promgrafonnet = import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/promgrafonnet/promgrafonnet.libsonnet'; -local gauge = promgrafonnet.gauge; - { + local nodemixin = import '../lib/prom-mixin.libsonnet', grafanaDashboards+:: { - 'nodes.json': - local idleCPU = - graphPanel.new( - 'CPU Usage', - datasource='$datasource', - span=6, - format='percentunit', - max=1, - min=0, - stack=true, - ) - .addTarget(prometheus.target( - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) - ) - ||| % $._config, - legendFormat='{{cpu}}', - intervalFactor=5, - )); - - local systemLoad = - graphPanel.new( - 'Load Average', - datasource='$datasource', - span=6, - format='short', - min=0, - fill=0, - ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average')) - .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % $._config, legendFormat='logical cores')); - - local memoryGraph = - graphPanel.new( - 'Memory Usage', - datasource='$datasource', - span=9, - format='bytes', - stack=true, - min=0, - ) - .addTarget(prometheus.target( - ||| - ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) - ||| % $._config, legendFormat='memory used' - )) - .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) - .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); - - // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. - // This needs to be added upstream in the promgrafonnet library and then changed here. - // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. - local memoryGauge = gauge.new( - 'Memory Usage', - ||| - 100 - - ( - avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) - / - avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - * 100 - ) - ||| % $._config, - ).withLowerBeingBetter(); - - local diskIO = - graphPanel.new( - 'Disk I/O', - datasource='$datasource', - span=6, - min=0, - fill=0, - ) - // TODO: Does it make sense to have those three in the same panel? - .addTarget(prometheus.target( - 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} read', - )) - .addTarget(prometheus.target( - 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} written', - )) - .addTarget(prometheus.target( - 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} io time', - )) + - { - seriesOverrides: [ - { - alias: '/ read| written/', - yaxis: 1, - }, - { - alias: '/ io time/', - yaxis: 2, - }, - ], - yaxes: [ - self.yaxe(format='bytes'), - self.yaxe(format='s'), - ], - }; - - // TODO: Somehow partition this by device while excluding read-only devices. - local diskSpaceUsage = - graphPanel.new( - 'Disk Space Usage', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=1, - stack=true, - ) - .addTarget(prometheus.target( - ||| - sum( - max by (device) ( - node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - - - node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - ) - ) - ||| % $._config, - legendFormat='used', - )) - .addTarget(prometheus.target( - ||| - sum( - max by (device) ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - ) - ) - ||| % $._config, - legendFormat='available', - )) + - { - seriesOverrides: [ - { - alias: 'used', - color: '#E0B400', - }, - { - alias: 'available', - color: '#73BF69', - }, - ], - }; - - local networkReceived = - graphPanel.new( - 'Network Received', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % $._config, - legendFormat='{{device}}', - )); - - local networkTransmitted = - graphPanel.new( - 'Network Transmitted', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % $._config, - legendFormat='{{device}}', - )); - - dashboard.new( - '%sNodes' % $._config.dashboardNamePrefix, - time_from='now-1h', - tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' - ) - .addTemplate( - { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addTemplate( - template.new( - 'instance', - '$datasource', - 'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config, - refresh='time', - ) - ) - .addRow( - row.new() - .addPanel(idleCPU) - .addPanel(systemLoad) - ) - .addRow( - row.new() - .addPanel(memoryGraph) - .addPanel(memoryGauge) - ) - .addRow( - row.new() - .addPanel(diskIO) - .addPanel(diskSpaceUsage) - ) - .addRow( - row.new() - .addPanel(networkReceived) - .addPanel(networkTransmitted) - ), + 'nodes.json': nodemixin.new(config=$._config, platform='Linux').dashboard, + 'nodes-darwin.json': nodemixin.new(config=$._config, platform='Darwin').dashboard, }, } diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 46ebffe4..721d4833 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -13,17 +13,8 @@ { "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" - } - }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", - "subdir": "lib/promgrafonnet" + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet-7.0" } }, "version": "master" diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet new file mode 100644 index 00000000..e545651d --- /dev/null +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -0,0 +1,353 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; +local gaugePanel = grafana70.panel.gauge; + +{ + + new(config=null, platform=null):: { + + local prometheusDatasourceTemplate = { + current: { + text: 'default', + value: 'default', + }, + hide: 0, + label: 'Data Source', + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + + local instanceTemplatePrototype = + template.new( + 'instance', + '$datasource', + '', + refresh='time', + label='Instance', + ), + local instanceTemplate = + if platform == 'Darwin' then + instanceTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, instance)' % config } + else + instanceTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, instance)' % config }, + + + local idleCPU = + graphPanel.new( + 'CPU Usage', + datasource='$datasource', + span=6, + format='percentunit', + max=1, + min=0, + stack=true, + ) + .addTarget(prometheus.target( + ||| + ( + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) + ) + ||| % config, + legendFormat='{{cpu}}', + intervalFactor=5, + )), + + local systemLoad = + graphPanel.new( + 'Load Average', + datasource='$datasource', + span=6, + format='short', + min=0, + fill=0, + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='1m load average')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='5m load average')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='15m load average')) + .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % config, legendFormat='logical cores')), + + local memoryGraphPanelPrototype = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', + span=9, + format='bytes', + min=0, + ), + local memoryGraph = + if platform == 'Linux' then + memoryGraphPanelPrototype { stack: true } + .addTarget(prometheus.target( + ||| + ( + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, + legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory free')) + else if platform == 'Darwin' then + // not useful to stack + memoryGraphPanelPrototype { stack: false } + .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Physical Memory')) + .addTarget(prometheus.target( + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + + node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"} + + node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, legendFormat='Memory Used' + )) + .addTarget(prometheus.target( + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, legendFormat='App Memory' + )) + .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Wired Memory')) + .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Compressed')), + + // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. + local memoryGaugePanelPrototype = + gaugePanel.new( + title='Memory Usage', + datasource='$datasource', + ) + .addThresholdStep('rgba(50, 172, 45, 0.97)') + .addThresholdStep('rgba(237, 129, 40, 0.89)', 80) + .addThresholdStep('rgba(245, 54, 54, 0.9)', 90) + .setFieldConfig(max=100, min=0, unit='percent') + + { + span: 3, + }, + + local memoryGauge = + if platform == 'Linux' then + memoryGaugePanelPrototype + + .addTarget(prometheus.target( + ||| + 100 - + ( + avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) / + avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) + * 100 + ) + ||| % config, + )) + + else if platform == 'Darwin' then + memoryGaugePanelPrototype + .addTarget(prometheus.target( + ||| + ( + ( + avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - + avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"}) + + avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}) + + avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}) + ) / + avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}) + ) + * + 100 + ||| % config + )), + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=6, + min=0, + fill=0, + ) + // TODO: Does it make sense to have those three in the same panel? + .addTarget(prometheus.target( + 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} read', + )) + .addTarget(prometheus.target( + 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} written', + )) + .addTarget(prometheus.target( + 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} io time', + )) + + { + seriesOverrides: [ + { + alias: '/ read| written/', + yaxis: 1, + }, + { + alias: '/ io time/', + yaxis: 2, + }, + ], + yaxes: [ + self.yaxe(format='bytes'), + self.yaxe(format='s'), + ], + }, + + // TODO: Somehow partition this by device while excluding read-only devices. + local diskSpaceUsage = + graphPanel.new( + 'Disk Space Usage', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=1, + stack=true, + ) + .addTarget(prometheus.target( + ||| + sum( + max by (device) ( + node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + - + node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + ) + ) + ||| % config, + legendFormat='used', + )) + .addTarget(prometheus.target( + ||| + sum( + max by (device) ( + node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + ) + ) + ||| % config, + legendFormat='available', + )) + + { + seriesOverrides: [ + { + alias: 'used', + color: '#E0B400', + }, + { + alias: 'available', + color: '#73BF69', + }, + ], + }, + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=0, + ) + .addTarget(prometheus.target( + 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % config, + legendFormat='{{device}}', + )), + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=0, + ) + .addTarget(prometheus.target( + 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % config, + legendFormat='{{device}}', + )), + + local cpuRow = + row.new('CPU') + .addPanel(idleCPU) + .addPanel(systemLoad), + + local memoryRow = + row.new('Memory') + .addPanel(memoryGraph) + .addPanel(memoryGauge), + + local diskRow = + row.new('Disk') + .addPanel(diskIO) + .addPanel(diskSpaceUsage), + + local networkRow = + row.new('Network') + .addPanel(networkReceived) + .addPanel(networkTransmitted), + + local rows = + [ + cpuRow, + memoryRow, + diskRow, + networkRow, + ], + + local templates = + [ + prometheusDatasourceTemplate, + instanceTemplate, + ], + + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNodes' % config.dashboardNamePrefix, + time_from='now-1h', + tags=(config.dashboardTags), + timezone='utc', + refresh='30s', + graphTooltip='shared_crosshair' + ) + .addTemplates(templates) + .addRows(rows) + else if platform == 'Darwin' then + dashboard.new( + '%sMacOS' % config.dashboardNamePrefix, + time_from='now-1h', + tags=(config.dashboardTags), + timezone='utc', + refresh='30s', + graphTooltip='shared_crosshair' + ) + .addTemplates(templates) + .addRows(rows), + + }, +}