{
  // Tunable configuration for the node-exporter mixin. All fields are
  // hidden (`::`) and merged (`+:`) so downstream users can override
  // individual values without redefining the whole object.
  _config+:: {
    // Selectors are inserted between {} in Prometheus queries.

    // Select the metrics coming from the node exporter. Note that all
    // the selected metrics are shown stacked on top of each other in
    // the 'USE Method / Cluster' dashboard. Consider disabling that
    // dashboard if mixing up all those metrics in the same dashboard
    // doesn't make sense (e.g. because they are coming from different
    // clusters).
    nodeExporterSelector: 'job="node"',

    // Select the fstype for filesystem-related queries. If left
    // empty, all filesystems are selected. If you have unusual
    // filesystem you don't want to include in dashboards and
    // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
    fsSelector: 'fstype!=""',

    // Select the mountpoint for filesystem-related queries. If left
    // empty, all mountpoints are selected. For example if you have a
    // special purpose tmpfs instance that has a fixed size and will
    // always be 100% full, but you still want alerts and dashboards for
    // other tmpfs instances, you can exclude those by mountpoint prefix
    // like so: 'mountpoint!~"/var/lib/foo.*"'.
    fsMountpointSelector: 'mountpoint!=""',

    // Select the device for disk-related queries. If left empty, all
    // devices are selected. If you have unusual devices you don't
    // want to include in dashboards and alerting, you can exclude
    // them here, e.g. 'device!="tmpfs"'.
    diskDeviceSelector: 'device!=""',

    // Some of the alerts are meant to fire if a critical failure of a
    // node is imminent (e.g. the disk is about to run full). In a
    // true “cloud native” setup, failures of a single node should be
    // tolerated. Hence, even imminent failure of a single node is no
    // reason to create a paging alert. However, in practice there are
    // still many situations where operators like to get paged in time
    // before a node runs out of disk space. nodeCriticalSeverity can
    // be set to the desired severity for this kind of alerts. This
    // can even be templated to depend on labels of the node, e.g. you
    // could make this critical for traditional database masters but
    // just a warning for K8s nodes.
    nodeCriticalSeverity: 'critical',

    // CPU utilization (%) on which to trigger the
    // 'NodeCPUHighUsage' alert.
    cpuHighUsageThreshold: 90,

    // Load average 1m (per core) on which to trigger the
    // 'NodeSystemSaturation' alert.
    systemSaturationPerCoreThreshold: 2,

    // Available disk space (%) thresholds on which to trigger the
    // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
    // usage grows in a way that it is predicted to run out in 4h or 1d
    // and if the provided thresholds have been reached right now.
    // In some cases you'll want to adjust these, e.g., by default, Kubernetes
    // runs the image garbage collection when the disk usage reaches 85%
    // of its available space. In that case, you'll want to reduce the
    // critical threshold below to something like 14 or 15, otherwise
    // the alert could fire under normal node usage.
    // Additionally, the prediction window for the alert can be configured
    // to account for environments where disk usage can fluctuate within
    // a short time frame. By extending the prediction window, you can
    // reduce false positives caused by temporary spikes, providing a
    // more accurate prediction of disk space issues.
    fsSpaceFillingUpWarningThreshold: 40,
    fsSpaceFillingUpCriticalThreshold: 20,
    fsSpaceFillingUpPredictionWindow: '6h',

    // Available disk space (%) thresholds on which to trigger the
    // 'NodeFilesystemAlmostOutOfSpace' alerts.
    fsSpaceAvailableWarningThreshold: 5,
    fsSpaceAvailableCriticalThreshold: 3,

    // Memory utilization (%) level on which to trigger the
    // 'NodeMemoryHighUtilization' alert.
    memoryHighUtilizationThreshold: 90,

    // Threshold for the rate of memory major page faults to trigger
    // 'NodeMemoryMajorPagesFaults' alert.
    memoryMajorPagesFaultsThreshold: 500,

    // Disk IO queue level above which to trigger
    // 'NodeDiskIOSaturation' alert.
    diskIOSaturationThreshold: 10,

    // Range interval used by rate()/irate() expressions in the dashboards.
    rateInterval: '5m',

    // Opt-in for multi-cluster support.
    showMultiCluster: false,
    clusterLabel: 'cluster',

    // Naming and tagging applied to the generated Grafana dashboards.
    dashboardNamePrefix: 'Node Exporter / ',
    dashboardTags: ['node-exporter-mixin'],
  },
}