Remove text_collector_examples/ (#1441)
* Remove text_collector_examples/ These have been moved to https://github.com/prometheus-community/node-exporter-textfile-collector-scripts This closes #1077 Signed-off-by: Johannes 'fish' Ziemke <github@freigeist.org>
This commit is contained in:
parent
0b710bb0c9
commit
fc73586c97
|
@ -1,16 +1,4 @@
|
||||||
# Text collector example scripts
|
# Text collector example scripts
|
||||||
|
|
||||||
These scripts are examples to be used with the Node Exporter Textfile
|
The scripts have been moved to
|
||||||
Collector.
|
https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
|
||||||
|
|
||||||
To use these scripts, we recommend using a `sponge` to atomically write the output.
|
|
||||||
|
|
||||||
<collector_script> | sponge <output_file>
|
|
||||||
|
|
||||||
Sponge comes from [moreutils](https://joeyh.name/code/moreutils/)
|
|
||||||
* [brew install moreutils](http://brewformulas.org/Moreutil)
|
|
||||||
* [apt install moreutils](https://packages.debian.org/search?keywords=moreutils)
|
|
||||||
* [pkg install moreutils](https://www.freshports.org/sysutils/moreutils/)
|
|
||||||
|
|
||||||
For more information see:
|
|
||||||
https://github.com/prometheus/node_exporter#textfile-collector
|
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
#
|
|
||||||
# Description: Expose metrics from apt updates.
|
|
||||||
#
|
|
||||||
# Author: Ben Kochie <superq@gmail.com>
|
|
||||||
|
|
||||||
upgrades="$(/usr/bin/apt-get --just-print upgrade \
|
|
||||||
| /usr/bin/awk -F'[()]' \
|
|
||||||
'/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2);
|
|
||||||
sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \
|
|
||||||
| /usr/bin/sort \
|
|
||||||
| /usr/bin/uniq -c \
|
|
||||||
| awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/\"/, "\\\"", $2);
|
|
||||||
gsub(/\[/, "", $3); gsub(/\]/, "", $3);
|
|
||||||
print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $3 "\"} " $1}'
|
|
||||||
)"
|
|
||||||
|
|
||||||
echo '# HELP apt_upgrades_pending Apt package pending updates by origin.'
|
|
||||||
echo '# TYPE apt_upgrades_pending gauge'
|
|
||||||
if [[ -n "${upgrades}" ]] ; then
|
|
||||||
echo "${upgrades}"
|
|
||||||
else
|
|
||||||
echo 'apt_upgrades_pending{origin="",arch=""} 0'
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo '# HELP node_reboot_required Node reboot is required for software updates.'
|
|
||||||
echo '# TYPE node_reboot_required gauge'
|
|
||||||
if [[ -f '/run/reboot-required' ]] ; then
|
|
||||||
echo 'node_reboot_required 1'
|
|
||||||
else
|
|
||||||
echo 'node_reboot_required 0'
|
|
||||||
fi
|
|
|
@ -1,112 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
# Collect per-device btrfs filesystem errors.
|
|
||||||
# Designed to work on Debian and Centos 6 (with python2.6).
|
|
||||||
|
|
||||||
import collections
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
def get_btrfs_mount_points(mounts_path="/proc/mounts"):
    """List all btrfs mount points.

    Args:
        mounts_path: (string) mount table to read; defaults to /proc/mounts.

    Yields:
        (string) filesystem mount points, with the kernel's octal escapes
        (e.g. ``\\040`` for a space) decoded back to real characters.
    """
    with open(mounts_path) as f:
        for line in f:
            parts = line.split()
            # Blank or truncated lines carry no filesystem type; skip them
            # instead of failing with IndexError.
            if len(parts) < 3 or parts[2] != "btrfs":
                continue
            # The mount table escapes whitespace and backslashes as \NNN
            # (octal); decode so the path can be handed to other tools.
            yield re.sub(r"\\([0-7]{3})",
                         lambda m: chr(int(m.group(1), 8)),
                         parts[1])
|
|
||||||
|
|
||||||
def get_btrfs_errors(mountpoint):
    """Get per-device errors for a btrfs mount point.

    Runs ``btrfs device stats <mountpoint>`` and parses its output.

    Args:
        mountpoint: (string) path to a mount point.

    Yields:
        (device, error_type, error_count) tuples, where:
            device: (string) path to block device.
            error_type: (string) type of btrfs error.
            error_count: (int) number of btrfs errors of a given type.

    Raises:
        RuntimeError: if btrfs exits non-zero or prints a line that does
            not look like ``[<device>].<error_type> <count>``.
    """
    p = subprocess.Popen(["btrfs", "device", "stats", mountpoint],
                         stdout=subprocess.PIPE)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        raise RuntimeError("btrfs returned exit code %d" % p.returncode)
    for raw_line in stdout.splitlines():
        # Decode first: comparing the raw bytes against '' is never true on
        # Python 3, so blank lines used to fall through to the parser.
        line = raw_line.decode("utf-8")
        if not line.strip():
            continue
        # Sample line:
        # [/dev/vdb1].flush_io_errs   0
        m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line)
        if not m:
            raise RuntimeError("unexpected output from btrfs: '%s'" % line)
        yield m.group(1), m.group(2), int(m.group(3))
|
|
||||||
|
|
||||||
def btrfs_error_metrics():
    """Collect btrfs error metrics.

    Returns:
        a list of strings to be exposed as Prometheus metrics, or None when
        no btrfs filesystem produced any sample.
    """
    def escape(value):
        # Label values must escape backslash, double quote and newline;
        # backslash goes first so the other escapes are not doubled.
        return (value.replace("\\", "\\\\")
                .replace('"', '\\"')
                .replace("\n", "\\n"))

    metric = "node_btrfs_errors_total"
    contents = [
        "# TYPE %s counter" % metric,
        "# HELP %s number of btrfs errors" % metric,
    ]
    for mountpoint in get_btrfs_mount_points():
        for device, error_type, error_count in get_btrfs_errors(mountpoint):
            contents.append(
                '%s{mountpoint="%s",device="%s",type="%s"} %d' %
                (metric, escape(mountpoint), escape(device),
                 escape(error_type), error_count))

    if len(contents) > 2:
        # return metrics if there are actual btrfs filesystems found
        # (i.e. `contents` contains more than just TYPE and HELP).
        return contents
    return None
|
|
||||||
|
|
||||||
def btrfs_allocation_metrics(sysfs_root="/sys/fs/btrfs"):
    """Collect btrfs allocation metrics.

    Args:
        sysfs_root: (string) directory holding one sub-directory per btrfs
            filesystem; defaults to /sys/fs/btrfs.

    Returns:
        a list of strings to be exposed as Prometheus metrics, or None when
        no filesystem was found under `sysfs_root`.
    """
    prefix = 'node_btrfs_allocation'
    metric_to_filename = {
        'size_bytes': 'total_bytes',
        'used_bytes': 'bytes_used',
        'reserved_bytes': 'bytes_reserved',
        'pinned_bytes': 'bytes_pinned',
        'disk_size_bytes': 'disk_total',
        'disk_used_bytes': 'disk_used',
    }
    contents = []
    for metric, source in metric_to_filename.items():
        contents += [
            "# TYPE %s_%s gauge" % (prefix, metric),
            "# HELP %s_%s btrfs allocation data (%s)" % (prefix, metric, source),
        ]
    header_len = len(contents)

    for alloc in glob.glob(os.path.join(sysfs_root, "*", "allocation")):
        # The directory above "allocation" names the filesystem.
        fs = os.path.basename(os.path.dirname(alloc))
        for type_ in ('data', 'metadata', 'system'):
            for metric, source in metric_to_filename.items():
                filename = os.path.join(alloc, type_, source)
                # Use a distinct name for the handle so it cannot shadow
                # the loop variables.
                with open(filename) as handle:
                    value = int(handle.read().strip())
                contents.append('%s_%s{fs="%s",type="%s"} %d' % (
                    prefix, metric, fs, type_, value))

    # Only report when at least one sample follows the TYPE/HELP header.
    if len(contents) > header_len:
        return contents
    return None
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Either collector may return None when it has nothing to report.
    lines = []
    for collect in (btrfs_error_metrics, btrfs_allocation_metrics):
        lines.extend(collect() or [])

    print("\n".join(lines))
|
|
|
@ -1,70 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Script to count the number of deleted libraries that are linked by running
|
|
||||||
processes and expose a summary as Prometheus metrics.
|
|
||||||
|
|
||||||
The aim is to discover processes that are still using libraries that have since
|
|
||||||
been updated, perhaps due security vulnerabilities.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import errno
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
def main(maps_glob='/proc/*/maps'):
    """Print how many processes still map each deleted library.

    Every file matching `maps_glob` is read as a /proc/<pid>/maps table.
    A mapping counts when its line has exactly seven whitespace-separated
    fields, the path contains '/lib/' and the trailing field contains
    '(deleted)'. One gauge sample is printed per library, giving the number
    of maps files that reference it.

    Args:
        maps_glob: glob pattern selecting the maps files to inspect.

    Exits with an error message if a maps file cannot be read for a reason
    other than the process having gone away.
    """
    # maps file -> insertion-ordered set of deleted libraries it references
    libraries_by_process = {}

    for path in glob.glob(maps_glob):
        try:
            with open(path, 'rb') as file:
                for line in file:
                    # Paths are arbitrary bytes; never fail on decoding.
                    part = line.decode(errors='replace').strip().split()
                    if len(part) != 7:
                        continue

                    library, comment = part[5], part[6]
                    if '/lib/' in library and '(deleted)' in comment:
                        libraries_by_process.setdefault(path, {})[library] = True
        except EnvironmentError as e:
            # Ignore files of processes that vanished since we globbed:
            # ENOENT when the file is gone, ESRCH when the process exits
            # while its maps file is being read.
            if e.errno not in (errno.ENOENT, errno.ESRCH):
                sys.exit('Failed to open file: {0}'.format(path))

    num_processes_per_library = {}
    for libraries in libraries_by_process.values():
        for library in libraries:
            num_processes_per_library[library] = (
                num_processes_per_library.get(library, 0) + 1)

    def escape(value):
        # Backslash first, otherwise the backslash added for '"' is doubled.
        return value.replace('\\', '\\\\').replace('"', '\\"')

    metric_name = 'node_processes_linking_deleted_libraries'
    description = 'Count of running processes that link a deleted library'
    print('# HELP {0} {1}'.format(metric_name, description))
    print('# TYPE {0} gauge'.format(metric_name))

    for library, count in num_processes_per_library.items():
        dir_path, basename = os.path.split(library)
        print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format(
            metric_name, escape(dir_path), escape(basename), count))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,15 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
#
|
|
||||||
# Expose directory usage metrics, passed as an argument.
|
|
||||||
#
|
|
||||||
# Usage: add this to crontab:
|
|
||||||
#
|
|
||||||
# */5 * * * * prometheus directory-size.sh /var/lib/prometheus | sponge /var/lib/node_exporter/directory_size.prom
|
|
||||||
#
|
|
||||||
# sed pattern taken from https://www.robustperception.io/monitoring-directory-sizes-with-the-textfile-collector/
|
|
||||||
#
|
|
||||||
# Author: Antoine Beaupré <anarcat@debian.org>
|
|
||||||
echo "# HELP node_directory_size_bytes Disk space used by some directories"
|
|
||||||
echo "# TYPE node_directory_size_bytes gauge"
|
|
||||||
# Escape every backslash and double quote in the path (both with the `g`
# flag) before wrapping it in a label, then turn "<bytes>\t<path>" into a
# metric sample.
du --block-size=1 --summarize "$@" \
  | sed -ne 's/\\/\\\\/g;s/"/\\"/g;s/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p'
|
|
|
@ -1,141 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
"""
|
|
||||||
Expose Linux inotify(7) instance resource consumption.
|
|
||||||
|
|
||||||
Operational properties:
|
|
||||||
|
|
||||||
- This script may be invoked as an unprivileged user; in this case, metrics
|
|
||||||
will only be exposed for processes owned by that unprivileged user.
|
|
||||||
|
|
||||||
- No metrics will be exposed for processes that do not hold any inotify fds.
|
|
||||||
|
|
||||||
Requires Python 3.5 or later.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import collections
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
class Error(Exception):
    """Base class for the exceptions defined by this script."""
|
|
||||||
|
|
||||||
|
|
||||||
class _PIDGoneError(Error):
    """Raised when a process's /proc entries disappear while being read."""
|
|
||||||
|
|
||||||
|
|
||||||
# One record per inspected process: its PID, owning UID, command name and
# the number of inotify file descriptors it holds.
_Process = collections.namedtuple(
    "Process", "pid uid command inotify_instances")
|
|
||||||
|
|
||||||
|
|
||||||
def _read_bytes(name):
    """Return the entire contents of the file `name` as bytes."""
    with open(name, "rb") as stream:
        payload = stream.read()
    return payload
|
|
||||||
|
|
||||||
|
|
||||||
def _pids():
    """Yield, as ints, the /proc entries whose names consist only of digits."""
    for entry in os.listdir("/proc"):
        if entry.isdigit():
            yield int(entry)
|
|
||||||
|
|
||||||
|
|
||||||
def _pid_uid(pid):
    """Return the UID owning /proc/<pid>.

    Raises:
        _PIDGoneError: if the /proc/<pid> directory no longer exists.
    """
    proc_dir = "/proc/{}".format(pid)
    try:
        return os.stat(proc_dir).st_uid
    except FileNotFoundError:
        raise _PIDGoneError()
|
|
||||||
|
|
||||||
|
|
||||||
def _pid_command(pid):
    """Return the basename of argv[0] for process `pid`.

    Returns "<zombie>" when the process has an empty command line.

    Raises:
        _PIDGoneError: if /proc/<pid>/cmdline no longer exists.
    """
    # Avoid GNU ps(1) for it truncates comm.
    # https://bugs.launchpad.net/ubuntu/+source/procps/+bug/295876/comments/3
    try:
        raw = _read_bytes("/proc/{}/cmdline".format(pid))
    except FileNotFoundError:
        raise _PIDGoneError()

    if not raw:
        return "<zombie>"

    # Arguments are NUL-separated; without a NUL the whole buffer is argv[0].
    argv0 = raw.split(b"\x00", 1)[0]
    return os.path.basename(argv0).decode(encoding="ascii",
                                          errors="surrogateescape")
|
|
||||||
|
|
||||||
|
|
||||||
def _pid_inotify_instances(pid):
    """Count the file descriptors of `pid` that point at an inotify instance.

    Raises:
        _PIDGoneError: if /proc/<pid>/fd no longer exists.
    """
    try:
        fd_names = os.listdir("/proc/{}/fd".format(pid))
    except FileNotFoundError:
        raise _PIDGoneError()

    count = 0
    for fd_name in fd_names:
        try:
            link = os.readlink("/proc/{}/fd/{}".format(pid, fd_name))
        except FileNotFoundError:
            # The descriptor was closed between listing and reading it.
            continue
        if link == "anon_inode:inotify":
            count += 1
    return count
|
|
||||||
|
|
||||||
|
|
||||||
def _get_processes():
    """Yield a _Process record for every PID that can be fully inspected.

    PIDs that vanish mid-inspection, or whose /proc entries are not
    readable by the current user, are skipped.
    """
    for pid in _pids():
        try:
            uid = _pid_uid(pid)
            command = _pid_command(pid)
            instances = _pid_inotify_instances(pid)
        except (PermissionError, _PIDGoneError):
            continue
        yield _Process(pid, uid, command, instances)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_processes_nontrivial():
    """Yield only the processes that hold at least one inotify instance."""
    for proc in _get_processes():
        if proc.inotify_instances > 0:
            yield proc
|
|
||||||
|
|
||||||
|
|
||||||
def _format_gauge_metric(metric_name, metric_help, samples,
                         value_func, tags_func=None, stream=sys.stdout):
    """Write one gauge metric in the Prometheus text format.

    Args:
        metric_name: name of the metric.
        metric_help: text for the ``# HELP`` line.
        samples: iterable of arbitrary sample objects.
        value_func: callable mapping a sample to its value.
        tags_func: optional callable mapping a sample to an iterable of
            (label_name, label_value) pairs; a falsy result means the
            sample is written without labels.
        stream: text stream to write to.
    """
    def _escape(value):
        # Label values must escape backslash, double quote and newline;
        # backslash goes first so the other escapes are not doubled.
        return (str(value).replace("\\", "\\\\")
                .replace("\"", "\\\"")
                .replace("\n", "\\n"))

    print("# HELP {} {}".format(metric_name, metric_help), file=stream)
    print("# TYPE {} gauge".format(metric_name), file=stream)

    for s in samples:
        value = value_func(s)
        tags = tags_func(s) if tags_func else None

        labels = ""
        if tags:
            labels = "{" + ",".join(
                "{}=\"{}\"".format(k, _escape(v)) for k, v in tags) + "}"
        print("{}{} {}".format(metric_name, labels, value), file=stream)
|
|
||||||
|
|
||||||
|
|
||||||
def main(args_unused=None):
    """Print the inotify_instances gauge for every process holding one."""
    def _value(proc):
        return proc.inotify_instances

    def _labels(proc):
        return [("pid", proc.pid), ("uid", proc.uid),
                ("command", proc.command)]

    _format_gauge_metric(
        "inotify_instances",
        "Total number of inotify instances held open by a process.",
        _get_processes_nontrivial(),
        _value,
        _labels)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main(sys.argv))
|
|
|
@ -1,89 +0,0 @@
|
||||||
#!/usr/bin/awk -f
|
|
||||||
|
|
||||||
#
|
|
||||||
# Converts output of `ipmitool sensor` to prometheus format.
|
|
||||||
#
|
|
||||||
# With GNU awk:
|
|
||||||
# ipmitool sensor | ./ipmitool > ipmitool.prom
|
|
||||||
#
|
|
||||||
# With BSD awk:
|
|
||||||
# ipmitool sensor | awk -f ./ipmitool > ipmitool.prom
|
|
||||||
#
|
|
||||||
|
|
||||||
function export(values, name) {
|
|
||||||
if (values["metric_count"] < 1) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
delete values["metric_count"]
|
|
||||||
|
|
||||||
printf("# HELP %s%s %s sensor reading from ipmitool\n", namespace, name, help[name]);
|
|
||||||
printf("# TYPE %s%s gauge\n", namespace, name);
|
|
||||||
for (sensor in values) {
|
|
||||||
printf("%s%s{sensor=\"%s\"} %f\n", namespace, name, sensor, values[sensor]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Fields are Bar separated, with space padding.
|
|
||||||
BEGIN {
|
|
||||||
FS = "[ ]*[|][ ]*";
|
|
||||||
namespace = "node_ipmi_";
|
|
||||||
|
|
||||||
# Friendly description of the type of sensor for HELP.
|
|
||||||
help["temperature_celsius"] = "Temperature";
|
|
||||||
help["volts"] = "Voltage";
|
|
||||||
help["power_watts"] = "Power";
|
|
||||||
help["speed_rpm"] = "Fan";
|
|
||||||
help["status"] = "Chassis status";
|
|
||||||
|
|
||||||
temperature_celsius["metric_count"] = 0;
|
|
||||||
volts["metric_count"] = 0;
|
|
||||||
power_watts["metric_count"] = 0;
|
|
||||||
speed_rpm["metric_count"] = 0;
|
|
||||||
status["metric_count"] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
# Not a valid line.
|
|
||||||
{
|
|
||||||
if (NF < 3) {
|
|
||||||
next
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# $2 is value field.
|
|
||||||
$2 ~ /na/ {
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
# $3 is type field.
|
|
||||||
$3 ~ /degrees C/ {
|
|
||||||
temperature_celsius[$1] = $2;
|
|
||||||
temperature_celsius["metric_count"]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
$3 ~ /Volts/ {
|
|
||||||
volts[$1] = $2;
|
|
||||||
volts["metric_count"]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
$3 ~ /Watts/ {
|
|
||||||
power_watts[$1] = $2;
|
|
||||||
power_watts["metric_count"]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
$3 ~ /RPM/ {
|
|
||||||
speed_rpm[$1] = $2;
|
|
||||||
speed_rpm["metric_count"]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
$3 ~ /discrete/ {
|
|
||||||
status[$1] = sprintf("%d", substr($2,3,2));
|
|
||||||
status["metric_count"]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
END {
|
|
||||||
export(temperature_celsius, "temperature_celsius");
|
|
||||||
export(volts, "volts");
|
|
||||||
export(power_watts, "power_watts");
|
|
||||||
export(speed_rpm, "speed_rpm");
|
|
||||||
export(status, "status");
|
|
||||||
}
|
|
|
@ -1,56 +0,0 @@
|
||||||
#!/usr/bin/env bash
set -eu

for MD_DEVICE in /dev/md/*; do
  # With no arrays present the glob stays a literal '/dev/md/*'; skip it.
  [[ -e ${MD_DEVICE} ]] || continue

  # Subshell to avoid eval'd variables from leaking between iterations
  (
    # Resolve symlink to discover device, e.g. /dev/md127
    MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}")

    # Remove /dev/ prefix
    MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/}
    MD_DEVICE=${MD_DEVICE#/dev/md/}

    # Query sysfs for info about md device
    SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md"
    MD_LAYOUT=$(cat "${SYSFS_BASE}/layout")
    MD_LEVEL=$(cat "${SYSFS_BASE}/level")
    MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version")
    MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks")

    # Remove 'raid' prefix from RAID level
    MD_LEVEL=${MD_LEVEL#raid}

    # Output disk metrics
    for RAID_DISK in "${SYSFS_BASE}"/rd[0-9]*; do
      # Unmatched glob: the array has no member disks to report.
      [[ -e ${RAID_DISK} ]] || continue

      DISK=$(readlink -f "${RAID_DISK}/block")
      DISK_DEVICE=$(basename "${DISK}")
      RAID_DISK_DEVICE=$(basename "${RAID_DISK}")
      RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd}
      RAID_DISK_STATE=$(cat "${RAID_DISK}/state")

      DISK_SET=""
      # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b
      # The numeric checks must run in (( )): inside [[ ]] a $((...))
      # expansion is only a non-empty-string test and is therefore always true.
      if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 ]] && (( (MD_LAYOUT & ~0x1ffff) == 0 )); then
        NEAR_COPIES=$((MD_LAYOUT & 0xff))
        FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff))
        COPIES=$((NEAR_COPIES * FAR_COPIES))

        # COPIES > 0 also guards the modulo against division by zero.
        if (( COPIES > 0 && MD_NUM_RAID_DISKS % COPIES == 0 && COPIES <= 26 )); then
          DISK_SET=$((RAID_DISK_INDEX % COPIES))
        fi
      fi

      echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\""
      if [[ -n ${DISK_SET} ]]; then
        SET_LETTERS=({A..Z})
        echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\""
      fi
      echo "} 1"
    done

    # Output RAID array metrics
    # NOTE: Metadata version is a label rather than a separate metric because the version can be a string
    echo "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} 1"
  )
done
|
|
|
@ -1,87 +0,0 @@
|
||||||
#!/usr/bin/env bash
# Note: This script uses "mdadm --detail" to get some of the metrics, so it must be run as root.
#       It is designed to be run periodically in a cronjob, and output to /var/lib/node_exporter/textfile_collector/md_info_detail.prom
#       $ cat /etc/cron.d/prometheus_md_info_detail
#       * * * * * bash /var/lib/node_exporter/md_info_detail.sh > /var/lib/node_exporter/md_info_detail.prom.$$ && mv /var/lib/node_exporter/md_info_detail.prom.$$ /var/lib/node_exporter/md_info_detail.prom

set -eu

for MD_DEVICE in /dev/md/*; do
  # With no arrays present the glob stays a literal '/dev/md/*'; skip it.
  [[ -e ${MD_DEVICE} ]] || continue

  # Subshell to avoid eval'd variables from leaking between iterations
  (
    # Resolve symlink to discover device, e.g. /dev/md127
    MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}")

    # Remove /dev/ prefix
    MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/}
    MD_DEVICE=${MD_DEVICE#/dev/md/}

    # Query sysfs for info about md device
    SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md"
    MD_LAYOUT=$(cat "${SYSFS_BASE}/layout")
    MD_LEVEL=$(cat "${SYSFS_BASE}/level")
    MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version")
    MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks")

    # Remove 'raid' prefix from RAID level
    MD_LEVEL=${MD_LEVEL#raid}

    # Output disk metrics
    for RAID_DISK in "${SYSFS_BASE}"/rd[0-9]*; do
      # Unmatched glob: the array has no member disks to report.
      [[ -e ${RAID_DISK} ]] || continue

      DISK=$(readlink -f "${RAID_DISK}/block")
      DISK_DEVICE=$(basename "${DISK}")
      RAID_DISK_DEVICE=$(basename "${RAID_DISK}")
      RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd}
      RAID_DISK_STATE=$(cat "${RAID_DISK}/state")

      DISK_SET=""
      # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b
      # The numeric checks must run in (( )): inside [[ ]] a $((...))
      # expansion is only a non-empty-string test and is therefore always true.
      if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 ]] && (( (MD_LAYOUT & ~0x1ffff) == 0 )); then
        NEAR_COPIES=$((MD_LAYOUT & 0xff))
        FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff))
        COPIES=$((NEAR_COPIES * FAR_COPIES))

        # COPIES > 0 also guards the modulo against division by zero.
        if (( COPIES > 0 && MD_NUM_RAID_DISKS % COPIES == 0 && COPIES <= 26 )); then
          DISK_SET=$((RAID_DISK_INDEX % COPIES))
        fi
      fi

      echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\""
      if [[ -n ${DISK_SET} ]]; then
        SET_LETTERS=({A..Z})
        echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\""
      fi
      echo "} 1"
    done

    # Get output from mdadm --detail (Note: root/sudo required)
    MDADM_DETAIL_OUTPUT=$(mdadm --detail /dev/"${MD_DEVICE_NUM}")

    # Output RAID "Devices", "Size" and "Event" metrics, from the output of "mdadm --detail"
    while IFS= read -r line ; do
      # Filter out these keys that have numeric values that increment up
      if echo "$line" | grep -E -q "Devices :|Array Size :| Used Dev Size :|Events :"; then
        MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-')
        MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2 | cut -d " " -f 2 | sed 's:^ ::')
        echo "node_md_info_${MDADM_DETAIL_KEY}{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} ${MDADM_DETAIL_VALUE}"
      fi
    done <<< "$MDADM_DETAIL_OUTPUT"

    # Output RAID detail metrics info from the output of "mdadm --detail"
    # NOTE: Sending this info as labels rather than separate metrics, because some of them can be strings.
    echo -n "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\""
    while IFS= read -r line ; do
      # Filter for lines with a ":", to use for Key/Value pairs in labels
      if echo "$line" | grep -E -q ":" ; then
        # Exclude lines with these keys, as their values are numbers that increment up and are captured in individual metrics above
        if echo "$line" | grep -E -qv "Array Size|Used Dev Size|Events|Update Time" ; then
          echo -n ", "
          MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-')
          MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2- | sed 's:^ ::')
          echo -n "${MDADM_DETAIL_KEY}=\"${MDADM_DETAIL_VALUE}\""
        fi
      fi
    done <<< "$MDADM_DETAIL_OUTPUT"
    echo "} 1"
  )
done
|
|
|
@ -1,59 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
# Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool
|
|
||||||
|
|
||||||
# Copyright 2018 The Prometheus Authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Author: Jan Phillip Greimann <jan.greimann@cloud.ionos.com>
|
|
||||||
|
|
||||||
# check if root
|
|
||||||
if [ "$EUID" -ne 0 ]; then
|
|
||||||
echo "${0##*/}: Please run as root!" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check if programs are installed
|
|
||||||
if ! command -v mget_temp_ext >/dev/null 2>&1; then
|
|
||||||
echo "${0##*/}: mget_temp_ext is not installed. Aborting." >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
cat <<EOF
|
|
||||||
# HELP node_infiniband_hca_temp_celsius Celsius temperature of Mellanox InfiniBand HCA.
|
|
||||||
# TYPE node_infiniband_hca_temp_celsius gauge
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# run for each found Mellanox device
|
|
||||||
for dev in /sys/class/infiniband/*; do
|
|
||||||
if test ! -d "$dev"; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
device="${dev##*/}"
|
|
||||||
|
|
||||||
# get temperature
|
|
||||||
if temperature="$(mget_temp_ext -d "${device}")"; then
|
|
||||||
# output
|
|
||||||
echo "node_infiniband_hca_temp_celsius{hca_device=\"${device}\"} ${temperature//[[:space:]]/}"
|
|
||||||
else
|
|
||||||
echo "${0##*/}: Failed to get temperature from InfiniBand HCA '${device}'!" >&2
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# if device is empty, no device was found
|
|
||||||
if [ -z "${device-}" ]; then
|
|
||||||
echo "${0##*/}: No InfiniBand HCA device found!" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
|
@ -1,9 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
#
|
|
||||||
# Description: Expose device mapper multipathing metrics from multipathd.
|
|
||||||
#
|
|
||||||
# Author: Saket Sinha <saket.sinha@cloud.ionos.com>
|
|
||||||
|
|
||||||
echo '# HELP node_dmpath_info State info for dev-mapper path'
echo '# TYPE node_dmpath_info gauge'
# The first output line is the column header; every later line is one path.
/sbin/multipathd show paths format '%d %t %T' \
  | /usr/bin/awk 'NR > 1 { print "node_dmpath_info{device=\""$1"\"," "dm_path_state=\""$2"\"," "path_state=\""$3"\"}" " 1" }'
|
|
|
@ -1,122 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
#
|
|
||||||
# Description: Extract NTPd metrics from ntpq -np.
|
|
||||||
# Author: Ben Kochie <superq@gmail.com>
|
|
||||||
|
|
||||||
import re
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# NTP peers status, with no DNS lookups.
|
|
||||||
ntpq_cmd = ['ntpq', '-np']
|
|
||||||
ntpq_rv_cmd = ['ntpq', '-c', 'rv 0 offset,sys_jitter,rootdisp,rootdelay']
|
|
||||||
|
|
||||||
# Regex to match all of the fields in the output of ntpq -np.
# Raw strings keep the regex escapes (\w, \d, \s) out of Python's own
# string-escape processing.
metrics_fields = [
    r'^(?P<status>.)(?P<remote>[\w\.]+)',
    r'(?P<refid>[\w\.]+)',
    r'(?P<stratum>\d+)',
    # '-' is a valid type code (see remote_types) but is not matched by \w.
    r'(?P<type>[\w-])',
    # "when" is '-' for a peer never heard from, and may carry a unit
    # suffix (m, h, d) once it grows large.
    r'(?P<when>-|\d+[mhd]?)',
    r'(?P<poll>\d+)',
    r'(?P<reach>\d+)',
    r'(?P<delay>\d+\.\d+)',
    r'(?P<offset>-?\d+\.\d+)',
    r'(?P<jitter>\d+\.\d+)',
]
metrics_re = r'\s+'.join(metrics_fields)
|
|
||||||
|
|
||||||
# Remote types
|
|
||||||
# http://support.ntp.org/bin/view/Support/TroubleshootingNTP
|
|
||||||
remote_types = {
|
|
||||||
'l': 'local',
|
|
||||||
'u': 'unicast',
|
|
||||||
'm': 'multicast',
|
|
||||||
'b': 'broadcast',
|
|
||||||
'-': 'netaddr',
|
|
||||||
}
|
|
||||||
|
|
||||||
# Status codes:
|
|
||||||
# http://www.eecis.udel.edu/~mills/ntp/html/decode.html#peer
|
|
||||||
status_types = {
|
|
||||||
' ': 0,
|
|
||||||
'x': 1,
|
|
||||||
'.': 2,
|
|
||||||
'-': 3,
|
|
||||||
'+': 4,
|
|
||||||
'#': 5,
|
|
||||||
'*': 6,
|
|
||||||
'o': 7,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# Run the ntpq command.
|
|
||||||
def get_output(command):
    """Run `command` and return its standard output decoded as text.

    Standard error is discarded.

    Args:
        command: argument list passed to subprocess.check_output.

    Returns:
        The decoded output, or None if the command exits non-zero.
    """
    try:
        output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        return None
    return output.decode()
|
|
||||||
|
|
||||||
|
|
||||||
# Print metrics in Prometheus format.
|
|
||||||
def print_prometheus(metric, values):
    """Print one ``ntpd_<metric>`` gauge in the Prometheus text format.

    Args:
        metric: metric name without the ``ntpd_`` prefix.
        values: dict mapping a pre-formatted label string to a number; a
            key of None produces a sample without labels.
    """
    print("# HELP ntpd_%s NTPd metric for %s" % (metric, metric))
    print("# TYPE ntpd_%s gauge" % (metric))
    for labels, value in values.items():
        if labels is None:
            print("ntpd_%s %f" % (metric, value))
        else:
            print("ntpd_%s{%s} %f" % (metric, labels, value))
|
|
||||||
|
|
||||||
|
|
||||||
# Parse raw ntpq lines.
|
|
||||||
def parse_line(line):
    """Match one line of ``ntpq -np`` output against `metrics_re`.

    Args:
        line: a single output line, without its trailing newline.

    Returns:
        The match object for a peer line, or None for the column header,
        the ``====`` separator, LOCL/POOL pseudo-peers, empty lines and
        lines that do not match `metrics_re`.
    """
    # Raw strings: \s and \. are regex escapes, not string escapes.
    skipped = (
        r'\s+remote\s+refid',    # column header
        r'=+',                   # separator under the header
        r'.+\.(LOCL|POOL)\.',    # local clock / pool placeholder entries
        r'^$',                   # empty line
    )
    for pattern in skipped:
        if re.match(pattern, line):
            return None
    return re.match(metrics_re, line)
|
|
||||||
|
|
||||||
|
|
||||||
# Main function
|
|
||||||
def main(argv):
    """Collect peer and system metrics from ntpq and print them.

    Args:
        argv: command-line arguments (unused).

    Exits with an error message if either ntpq invocation fails.
    """
    # Run both commands before printing anything so a failure never leaves
    # a half-written metrics file behind.
    ntpq = get_output(ntpq_cmd)
    if ntpq is None:
        sys.exit('Failed to run: %s' % ' '.join(ntpq_cmd))
    ntpq_rv = get_output(ntpq_rv_cmd)
    if ntpq_rv is None:
        sys.exit('Failed to run: %s' % ' '.join(ntpq_rv_cmd))

    peer_status_metrics = {}
    delay_metrics = {}
    offset_metrics = {}
    jitter_metrics = {}
    for line in ntpq.split('\n'):
        metric_match = parse_line(line)
        if metric_match is None:
            continue
        status = status_types.get(metric_match.group('status'))
        if status is None:
            # Unknown tally code: skip this peer rather than lose every
            # metric to a KeyError.
            continue
        remote = metric_match.group('remote')
        refid = metric_match.group('refid')
        stratum = metric_match.group('stratum')
        # Type codes missing from the table are reported as "unknown".
        remote_type = remote_types.get(metric_match.group('type'), 'unknown')
        common_labels = "remote=\"%s\",reference=\"%s\"" % (remote, refid)
        peer_labels = "%s,stratum=\"%s\",type=\"%s\"" % (common_labels, stratum, remote_type)

        peer_status_metrics[peer_labels] = float(status)
        delay_metrics[common_labels] = float(metric_match.group('delay'))
        offset_metrics[common_labels] = float(metric_match.group('offset'))
        jitter_metrics[common_labels] = float(metric_match.group('jitter'))

    print_prometheus('peer_status', peer_status_metrics)
    print_prometheus('delay_milliseconds', delay_metrics)
    print_prometheus('offset_milliseconds', offset_metrics)
    print_prometheus('jitter_milliseconds', jitter_metrics)

    for metric in ntpq_rv.split(','):
        metric_name, separator, metric_value = metric.strip().partition('=')
        if not separator:
            # Not a name=value item (e.g. empty trailing field).
            continue
        try:
            value = float(metric_value)
        except ValueError:
            continue
        print_prometheus(metric_name, {None: value})
|
|
||||||
|
|
||||||
|
|
||||||
# Go go go!
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main(sys.argv[1:])
|
|
|
@ -1,97 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
# Dependencies: nvme-cli, jq (packages)
|
|
||||||
# Based on code from
|
|
||||||
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh
|
|
||||||
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp
|
|
||||||
# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh
|
|
||||||
#
|
|
||||||
# Author: Henk <henk@wearespindle.com>
|
|
||||||
|
|
||||||
# Check if we are root
if [ "$EUID" -ne 0 ]; then
  echo "${0##*/}: Please run as root!" >&2
  exit 1
fi

# Check if the required programs are installed. Both are needed: nvme
# produces the SMART log as JSON and jq extracts the individual values.
for program in nvme jq; do
  if ! command -v "${program}" >/dev/null 2>&1; then
    echo "${0##*/}: ${program} is not installed. Aborting." >&2
    exit 1
  fi
done
|
|
||||||
|
|
||||||
# awk program used by format_output. With "{" as the field separator, $1 is
# the metric name. Whenever the name changes a HELP/TYPE header is printed
# (names ending in _total are declared counters, all others gauges), and
# every input line is re-emitted with the "nvme_" prefix.
output_format_awk='
BEGIN { v = "" }
v != $1 {
  print "# HELP nvme_" $1 " SMART metric " $1;
  if ($1 ~ /_total$/)
    print "# TYPE nvme_" $1 " counter";
  else
    print "# TYPE nvme_" $1 " gauge";
  v = $1
}
{print "nvme_" $0}
'

# Sort the samples read from stdin so equal metric names are adjacent, then
# add the prefix and headers with the awk program above.
format_output() {
  sort | awk -F '{' "${output_format_awk}"
}
|
|
||||||
|
|
||||||
# Get the nvme-cli version
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output

# Print one sample line "<metric>{device="<disk>"} <value>", where the value
# is the result of applying a jq filter to a smart-log JSON document.
#   $1 metric name, $2 disk label, $3 JSON document, $4 jq filter
emit_smart_value() {
  local metric="$1" disk="$2" json="$3" filter="$4" value
  value="$(echo "${json}" | jq "${filter}")"
  echo "${metric}{device=\"${disk}\"} ${value}"
}

# Get devices
device_list="$(nvme list | awk '/^\/dev/{print $1}')"

# Loop through the NVMe devices
for device in ${device_list}; do
  json_check="$(nvme smart-log -o json "${device}")"
  # NOTE(review): characters 6-10 of the path turn /dev/nvme0n1 into "nvme0";
  # longer names such as /dev/nvme10n1 are cut to the same five characters.
  disk="$(echo "${device}" | cut -c6-10)"

  # The temperature value in JSON is in Kelvin, we want Celsius
  emit_smart_value temperature_celcius "${disk}" "${json_check}" '.temperature - 273'

  # Percentages are exposed as ratios.
  emit_smart_value available_spare_ratio "${disk}" "${json_check}" '.avail_spare / 100'
  emit_smart_value available_spare_threshold_ratio "${disk}" "${json_check}" '.spare_thresh / 100'
  emit_smart_value percentage_used_ratio "${disk}" "${json_check}" '.percent_used / 100'

  # The remaining values are passed through unchanged.
  emit_smart_value critical_warning_total "${disk}" "${json_check}" '.critical_warning'
  emit_smart_value media_errors_total "${disk}" "${json_check}" '.media_errors'
  emit_smart_value num_err_log_entries_total "${disk}" "${json_check}" '.num_err_log_entries'
  emit_smart_value power_cycles_total "${disk}" "${json_check}" '.power_cycles'
  emit_smart_value power_on_hours_total "${disk}" "${json_check}" '.power_on_hours'
  emit_smart_value controller_busy_time_seconds "${disk}" "${json_check}" '.controller_busy_time'
  emit_smart_value data_units_written_total "${disk}" "${json_check}" '.data_units_written'
  emit_smart_value data_units_read_total "${disk}" "${json_check}" '.data_units_read'
  emit_smart_value host_read_commands_total "${disk}" "${json_check}" '.host_read_commands'
  emit_smart_value host_write_commands_total "${disk}" "${json_check}" '.host_write_commands'
done | format_output
|
|
|
@ -1,33 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Description: Expose metrics from pacman updates
|
|
||||||
# If installed, the bash script *checkupdates*, included with the
|
|
||||||
# *pacman-contrib* package, is used to calculate the number of pending updates.
|
|
||||||
# Otherwise *pacman* is used for calculation.
|
|
||||||
#
|
|
||||||
# Author: Sven Haardiek <sven@haardiek.de>
|
|
||||||
|
|
||||||
set -o errexit -o nounset -o pipefail

# Count pending updates. "cache" records which tool produced the number:
# 0 when /usr/bin/checkupdates was used, 1 when falling back to pacman -Qu.
if [ ! -x /usr/bin/checkupdates ]; then
    # With pipefail a failing pacman makes the whole pipeline fail; in that
    # case the count falls back to 0 instead of aborting the script.
    updates=$(/usr/bin/pacman -Qu | wc -l) || updates=0
    cache=1
else
    updates=$(/usr/bin/checkupdates | wc -l)
    cache=0
fi
|
|
||||||
|
|
||||||
# Number of pending updates. The HELP/TYPE lines must name the same metric
# as the sample line, otherwise the sample is exposed without help text and
# type information.
echo "# HELP pacman_updates_pending number of pending updates from pacman"
echo "# TYPE pacman_updates_pending gauge"
echo "pacman_updates_pending $updates"

# Whether the number above was derived from the (possibly stale) local
# package database rather than from checkupdates.
echo "# HELP pacman_updates_pending_from_cache pending updates information are from cache"
echo "# TYPE pacman_updates_pending_from_cache gauge"
echo "pacman_updates_pending_from_cache $cache"
|
|
|
@ -1,378 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
import argparse
|
|
||||||
import collections
|
|
||||||
import csv
|
|
||||||
import datetime
|
|
||||||
import decimal
|
|
||||||
import re
|
|
||||||
import shlex
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
# Splits one "Key: value" or "Key is: value" line into the groups k and v.
device_info_re = re.compile(
    r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')

# Captures the error number of a line starting with
# "Error <n> [<m>] occurred".
ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred', flags=re.MULTILINE)

# Matches a line that starts with "SMART" and ends with PASSED or OK.
self_test_re = re.compile(
    r'^SMART.*(PASSED|OK)$', flags=re.MULTILINE)
|
|
||||||
|
|
||||||
# Maps keys of the smartctl device information output to the label names
# used on the device_info metric.
device_info_map = dict([
    ('Vendor', 'vendor'),
    ('Product', 'product'),
    ('Revision', 'revision'),
    ('Logical Unit id', 'lun_id'),
    ('Model Family', 'model_family'),
    ('Device Model', 'device_model'),
    ('Serial Number', 'serial_number'),
    ('Firmware Version', 'firmware_version'),
])
|
|
||||||
|
|
||||||
# Lower-cased names of the SMART attributes that are exported; attributes
# with any other name are ignored by the collector.
smart_attributes_whitelist = set("""
    airflow_temperature_cel
    command_timeout
    current_pending_sector
    end_to_end_error
    erase_fail_count_total
    g_sense_error_rate
    hardware_ecc_recovered
    host_reads_mib
    host_reads_32mib
    host_writes_mib
    host_writes_32mib
    load_cycle_count
    media_wearout_indicator
    wear_leveling_count
    nand_writes_1gib
    offline_uncorrectable
    power_cycle_count
    power_on_hours
    program_fail_count
    raw_read_error_rate
    reallocated_event_count
    reallocated_sector_ct
    reported_uncorrect
    sata_downshift_count
    seek_error_rate
    spin_retry_count
    spin_up_time
    start_stop_count
    temperature_case
    temperature_celsius
    temperature_internal
    total_lbas_read
    total_lbas_written
    udma_crc_error_count
    unsafe_shutdown_count
    workld_host_reads_perc
    workld_media_wear_indic
    workload_minutes
""".split())
|
|
||||||
|
|
||||||
# A single sample: metric name, dict of labels and the sample value.
Metric = collections.namedtuple('Metric', ['name', 'labels', 'value'])

# One row of the SMART attribute table; raw_value is the last column.
SmartAttribute = collections.namedtuple(
    'SmartAttribute',
    'id name flag value worst threshold type updated when_failed raw_value')
|
|
||||||
|
|
||||||
|
|
||||||
class Device(collections.namedtuple('DeviceBase', 'path opts')):
    """A device entry: its path plus the parsed options of its scan line."""

    @property
    def type(self):
        """Device type taken from the parsed options."""
        options = self.opts
        return options.type

    @property
    def base_labels(self):
        """Labels attached to every metric of this device."""
        return dict(disk=self.path)

    def smartctl_select(self):
        """Return the smartctl arguments that address this device."""
        selector = ['--device']
        selector.append(self.type)
        selector.append(self.path)
        return selector
|
|
||||||
|
|
||||||
|
|
||||||
def metric_key(metric, prefix=''):
    """Return the exported metric name: *prefix* followed by ``metric.name``."""
    return '{0}{1}'.format(prefix, metric.name)
|
|
||||||
|
|
||||||
|
|
||||||
def metric_format(metric, prefix=''):
    """Render *metric* as one sample line of the Prometheus text format.

    Args:
        metric: (Metric) Sample to render.
        prefix: (str) Text put in front of the metric name.

    Returns:
        (str) ``<prefix><name>{<label>="<value>",...} <value>``. The sample
        value is converted with ``decimal.Decimal``.
    """
    def escape(text):
        # The exposition format requires backslash, double quote and line
        # feed to be escaped inside label values. Backslashes go first so
        # the escapes added afterwards are not escaped a second time.
        return (str(text)
                .replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n'))

    key = metric_key(metric, prefix)
    labels = ','.join(
        '{k}="{v}"'.format(k=k, v=escape(v))
        for k, v in metric.labels.items())
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=key, labels=labels, value=value)
|
|
||||||
|
|
||||||
|
|
||||||
def metric_print_meta(metric, prefix=''):
    """Print the ``# HELP`` and ``# TYPE`` header lines for *metric*.

    Every metric is declared as a gauge.
    """
    full_name = metric_key(metric, prefix)
    help_line = '# HELP {0} SMART metric {1}'.format(full_name, metric.name)
    type_line = '# TYPE {0} gauge'.format(full_name)
    print(help_line)
    print(type_line)
|
|
||||||
|
|
||||||
|
|
||||||
def metric_print(metric, prefix=''):
    """Print the sample line produced by ``metric_format`` for *metric*."""
    line = metric_format(metric, prefix)
    print(line)
|
|
||||||
|
|
||||||
|
|
||||||
def smart_ctl(*args, check=True):
    """Run the smartctl binary with *args* and return what it printed.

    A non-zero exit status reported through ``CalledProcessError`` is not
    propagated; the output captured up to that point is returned instead.

    Args:
        *args: Command-line arguments passed to smartctl.
        check: Forwarded to ``subprocess.run``.

    Returns:
        (str) Data piped to stdout by the smartctl subprocess.
    """
    command = ['smartctl']
    command.extend(args)
    try:
        raw = subprocess.run(
            command, stdout=subprocess.PIPE, check=check).stdout
    except subprocess.CalledProcessError as err:
        raw = err.output
    return raw.decode('utf-8')
|
|
||||||
|
|
||||||
def smart_ctl_version():
    """Return the second word of the first line printed by ``smartctl -V``."""
    first_line = smart_ctl('-V').split('\n')[0]
    words = first_line.split()
    return words[1]
|
|
||||||
|
|
||||||
|
|
||||||
def find_devices():
    """Find SMART devices.

    Every non-empty line of ``smartctl --scan-open`` is tokenised with shell
    rules (``#`` starts a comment). The first token is the device path, the
    remaining tokens are parsed for the ``-d``/``--device`` option.

    Yields:
        (Device) Single device found by smartctl.
    """
    scan_parser = argparse.ArgumentParser()
    scan_parser.add_argument('-d', '--device', dest='type')

    for raw in smart_ctl('--scan-open').split('\n'):
        entry = raw.strip()
        tokens = shlex.split(entry, comments=True) if entry else []
        if tokens:
            yield Device(tokens[0], scan_parser.parse_args(tokens[1:]))
|
|
||||||
|
|
||||||
|
|
||||||
def device_is_active(device):
    """Return whether the given device is currently active.

    Runs ``smartctl --nocheck standby`` for the device. smartctl is invoked
    directly here rather than through ``smart_ctl`` because that wrapper
    swallows ``CalledProcessError``, which would make a failing check
    indistinguishable from a successful one.

    Args:
        device: (Device) Device in question.

    Returns:
        (bool) True if smartctl exits successfully, False if it exits with
        a non-zero status.
    """
    command = ['smartctl', '--nocheck', 'standby', *device.smartctl_select()]
    try:
        subprocess.run(command, stdout=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError:
        return False

    return True
|
|
||||||
|
|
||||||
|
|
||||||
def device_info(device):
    """Query device for basic model information.

    Args:
        device: (Device) Device in question.

    Returns:
        (generator): Generator yielding ``(key, value)`` string tuples, one
        for every line of the ``smartctl --info`` output that matches
        ``device_info_re``.
    """
    output = smart_ctl('--info', *device.smartctl_select())
    # The first three lines are skipped (presumably the smartctl banner).
    body = output.strip().split('\n')[3:]

    return (
        found.groups()
        for found in map(device_info_re.match, body)
        if found is not None)
|
|
||||||
|
|
||||||
|
|
||||||
def device_smart_capabilities(device):
    """Return the SMART capabilities of the given device.

    Looks at every ``SMART support`` entry of the device information and
    collects the first word of its value.

    Args:
        device: (Device) Device in question.

    Returns:
        (tuple): tuple containing:

            (bool): True if one of the entries starts with ``Available``.
            (bool): True if one of the entries starts with ``Enabled``.
    """
    first_words = set()
    for key, value in device_info(device):
        if key == 'SMART support':
            first_words.add(value.split(' ', 1)[0])

    return 'Available' in first_words, 'Enabled' in first_words
|
|
||||||
|
|
||||||
|
|
||||||
def collect_device_info(device):
    """Collect basic device information.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) A single ``device_info`` metric with value True whose labels
        are the device's base labels plus every entry of ``device_info_map``
        that is present in the device information.
    """
    details = dict(device_info(device))

    labels = dict(device.base_labels)
    for source_key, label_name in device_info_map.items():
        if source_key in details:
            labels[label_name] = details[source_key]

    yield Metric('device_info', labels, True)
|
|
||||||
|
|
||||||
|
|
||||||
def collect_device_health_self_assessment(device):
    """Collect metric about the device health self assessment.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``device_smart_healthy``; True when the ``smartctl --health``
        output contains a line matching ``self_test_re``, False otherwise.
    """
    report = smart_ctl('--health', *device.smartctl_select())
    passed = self_test_re.search(report) is not None

    yield Metric('device_smart_healthy', device.base_labels, passed)
|
|
||||||
|
|
||||||
|
|
||||||
def collect_ata_metrics(device):
    """Collect the whitelisted SMART attributes of an ATA device.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``attr_value``, ``attr_worst``, ``attr_threshold`` and
        ``attr_raw_value`` for every attribute whose lower-cased name is in
        ``smart_attributes_whitelist`` and whose raw value starts with a
        number. Rows that are too short to carry a raw value are skipped.
    """
    # Fetch SMART attributes for the given device.
    attributes = smart_ctl('--attributes', *device.smartctl_select())

    # Replace multiple occurrences of whitespace with a single whitespace
    # so that the CSV parser recognizes individual columns properly.
    attributes = re.sub(r'[\t\x20]+', ' ', attributes)

    # Turn smartctl output into a list of lines and skip to the table of
    # SMART attributes.
    attribute_lines = attributes.strip().split('\n')[7:]

    # All columns after the named ones are gathered into a list stored under
    # the last field name (raw_value).
    reader = csv.DictReader(
        (line.strip() for line in attribute_lines),
        fieldnames=SmartAttribute._fields[:-1],
        restkey=SmartAttribute._fields[-1], delimiter=' ')
    for entry in reader:
        # Short rows leave missing columns as None.
        name = (entry['name'] or '').lower()

        # We're only interested in the SMART attributes that are
        # whitelisted here.
        if name not in smart_attributes_whitelist:
            continue

        # Ensure that only the numeric parts are fetched from the raw_value.
        # A raw value such as "36 (Min/Max 24/40)" can't be expressed
        # properly as a prometheus metric, so only the leading number is
        # kept.
        m = re.match(r'^(\d+)', ' '.join(entry.get('raw_value') or ()))
        if not m:
            continue
        entry['raw_value'] = m.group(1)

        labels = {
            'name': name,
            **device.base_labels,
        }

        # The raw value is emitted as well; it was parsed above but
        # previously never exported.
        for col in 'value', 'worst', 'threshold', 'raw_value':
            yield Metric('attr_{col}'.format(col=col), labels, entry[col])
|
|
||||||
|
|
||||||
|
|
||||||
def collect_ata_error_count(device):
    """Inspect the device error log and report the amount of entries.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``device_errors``; the number captured by
        ``ata_error_count_re`` in the ``smartctl -l xerror,1`` output, or 0
        when the pattern does not occur.
    """
    log_text = smart_ctl(
        '-l', 'xerror,1', *device.smartctl_select(), check=False)

    found = ata_error_count_re.search(log_text)
    if found is None:
        count = 0
    else:
        count = found.group(1)

    yield Metric('device_errors', device.base_labels, count)
|
|
||||||
|
|
||||||
|
|
||||||
def collect_disks_smart_metrics():
    """Collect all metrics for every device found by ``find_devices``.

    Yields:
        (Metric) For each device: ``smartctl_run`` (Unix timestamp of this
        collection run) and ``device_active``. For active devices also the
        device information and SMART availability metrics, and, when SMART
        is available, the health self assessment, the ATA attributes (device
        types starting with ``sat`` only) and the ATA error count.
    """
    # An aware datetime is required here: calling timestamp() on the naive
    # value returned by utcnow() interprets it as local time, which skews
    # the result by the UTC offset on hosts not running in UTC.
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())

    for device in find_devices():
        yield Metric('smartctl_run', device.base_labels, now)

        is_active = device_is_active(device)

        yield Metric('device_active', device.base_labels, is_active)

        # Skip further metrics collection to prevent the disk from
        # spinning up.
        if not is_active:
            continue

        yield from collect_device_info(device)

        smart_available, smart_enabled = device_smart_capabilities(device)

        yield Metric(
            'device_smart_available', device.base_labels, smart_available)
        yield Metric(
            'device_smart_enabled', device.base_labels, smart_enabled)

        # Skip further metrics collection here if SMART is not available
        # on the device. Further smartctl invocations would fail
        # anyways.
        if not smart_available:
            continue

        yield from collect_device_health_self_assessment(device)

        if device.type.startswith('sat'):
            yield from collect_ata_metrics(device)

        yield from collect_ata_error_count(device)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print the smartctl version metric followed by all disk metrics.

    Disk metrics are sorted by name so that samples of the same metric are
    adjacent; the HELP/TYPE header is printed once per metric name.
    """
    prefix = 'smartmon_'

    version_metric = Metric(
        'smartctl_version', {'version': smart_ctl_version()}, True)
    metric_print_meta(version_metric, prefix)
    metric_print(version_metric, prefix)

    last_name = None
    ordered = sorted(collect_disks_smart_metrics(), key=lambda item: item.name)
    for metric in ordered:
        if metric.name != last_name:
            metric_print_meta(metric, prefix)
            last_name = metric.name

        metric_print(metric, prefix)
|
|
||||||
|
|
||||||
# Only collect and print metrics when executed as a script.
if __name__ == '__main__':
    raise SystemExit(main())
|
|
|
@ -1,194 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# Script informed by the collectd monitoring script for smartmontools (using smartctl)
|
|
||||||
# by Samuel B. <samuel_._behan_(at)_dob_._sk> (c) 2012
|
|
||||||
# source at: http://devel.dob.sk/collectd-scripts/
|
|
||||||
|
|
||||||
# TODO: This probably needs to be a little more complex. The raw numbers can have more
|
|
||||||
# data in them than you'd think.
|
|
||||||
# http://arstechnica.com/civis/viewtopic.php?p=22062211
|
|
||||||
|
|
||||||
# Formatting done via shfmt -i 2
|
|
||||||
# https://github.com/mvdan/sh
|
|
||||||
|
|
||||||
# awk program for rows of the SMART attribute table: a numeric id in $1 and
# an attribute name in $2. Dashes are replaced by underscores and four
# samples (value, worst, threshold, raw_value) are printed per row. The
# label string is passed in through the awk variable "labels".
parse_smartctl_attributes_awk='
$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
  gsub(/-/, "_");
  printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
  printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5
  printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6
  printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10
}
'
|
|
||||||
|
|
||||||
# Names of the SMART attributes to export. They are joined with "|" into
# smartmon_attrs, which is used as an extended regular expression.
smartmon_attr_names=(
  airflow_temperature_cel
  command_timeout
  current_pending_sector
  end_to_end_error
  erase_fail_count
  g_sense_error_rate
  hardware_ecc_recovered
  host_reads_mib
  host_reads_32mib
  host_writes_mib
  host_writes_32mib
  load_cycle_count
  media_wearout_indicator
  wear_leveling_count
  nand_writes_1gib
  offline_uncorrectable
  power_cycle_count
  power_on_hours
  program_fail_count
  raw_read_error_rate
  reallocated_event_count
  reallocated_sector_ct
  reported_uncorrect
  sata_downshift_count
  seek_error_rate
  spin_retry_count
  spin_up_time
  start_stop_count
  temperature_case
  temperature_celsius
  temperature_internal
  total_lbas_read
  total_lbas_written
  udma_crc_error_count
  unsafe_shutdown_count
  workld_host_reads_perc
  workld_media_wear_indic
  workload_minutes
)
smartmon_attrs="$(
  IFS='|'
  echo "${smartmon_attr_names[*]}"
)"
|
|
||||||
|
|
||||||
# Convert the attribute table read from stdin into metric samples.
#   $1 disk path, $2 disk type (both become labels)
# Leading blanks are stripped, the awk program prints the samples, the
# output is lower-cased and only lines matching one of the names in
# smartmon_attrs are kept.
parse_smartctl_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  sed 's/^ \+//g' |
    awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
    tr A-Z a-z |
    grep -E "(${smartmon_attrs})"
}
|
|
||||||
|
|
||||||
# Convert the SCSI attribute output read from stdin into metric samples.
#   $1 disk path, $2 disk type (both become labels)
# Each line is split at the first ":" (or "=") into a name and a value; the
# values of the known names are remembered and printed after the last line.
parse_smartctl_scsi_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  # Start from empty values so nothing inherited from the environment can
  # be printed by mistake.
  local power_on='' temp_cel='' lbas_read='' lbas_written='' power_cycle='' grown_defects=''
  local line attr_type attr_value
  while read line; do
    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
    case "${attr_type}" in
    number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
    Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
    Blocks_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    Blocks_received_from_initiator_) lbas_written="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    esac
  done
  [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
  [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
  [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
  # Total_LBAs_Written is SMART attribute 241; 242 is Total_LBAs_Read.
  [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}"
  [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  # NOTE(review): smart_id 12 is also used for power_cycle_count above; the
  # grown defect list has no obvious ATA attribute id - confirm the intent.
  [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}"
}
|
|
||||||
|
|
||||||
# Convert the "smartctl -i -H" output read from stdin into the device_info,
# device_smart_available, device_smart_enabled and device_smart_healthy
# samples.
#   $1 disk path, $2 disk type (both become labels)
parse_smartctl_info() {
  local -i smart_available=0 smart_enabled=0 smart_healthy=0
  local disk="$1" disk_type="$2"
  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
  while read line; do
    # Text before the first ":" with blanks turned into underscores.
    info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
    # Text after the first ":". Every double quote is escaped (not just the
    # first one) because the value is placed inside a quoted label value.
    info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/g')"
    case "${info_type}" in
    Model_Family) model_family="${info_value}" ;;
    Device_Model) device_model="${info_value}" ;;
    Serial_Number) serial_number="${info_value}" ;;
    Firmware_Version) fw_version="${info_value}" ;;
    Vendor) vendor="${info_value}" ;;
    Product) product="${info_value}" ;;
    Revision) revision="${info_value}" ;;
    Logical_Unit_id) lun_id="${info_value}" ;;
    esac
    if [[ "${info_type}" == 'SMART_support_is' ]]; then
      case "${info_value:0:7}" in
      Enabled) smart_enabled=1 ;;
      Availab) smart_available=1 ;;
      Unavail) smart_available=0 ;;
      esac
    fi
    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
      case "${info_value:0:6}" in
      PASSED) smart_healthy=1 ;;
      esac
    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
      case "${info_value:0:2}" in
      OK) smart_healthy=1 ;;
      esac
    fi
  done
  echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1"
  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
  echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
}
|
|
||||||
|
|
||||||
# awk program used by format_output. With "{" as the field separator, $1 is
# the metric name. Whenever the name changes a HELP/TYPE (gauge) header is
# printed, and every input line is re-emitted with the "smartmon_" prefix.
output_format_awk='
BEGIN { v = "" }
v != $1 {
  print "# HELP smartmon_" $1 " SMART metric " $1;
  print "# TYPE smartmon_" $1 " gauge";
  v = $1
}
{print "smartmon_" $0}
'

# Sort the samples read from stdin so equal metric names are adjacent, then
# add the prefix and headers with the awk program above.
format_output() {
  sort | awk -F '{' "${output_format_awk}"
}
|
|
||||||
|
|
||||||
# Second word of the first line of "smartctl -V", if that line starts with
# "smartctl".
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

# Digits in front of the first dot of the version; stop here when that
# number is below 6.
smartctl_major="$(expr "${smartctl_version}" : '\([0-9]*\)\..*')"
if [[ "${smartctl_major}" -lt 6 ]]; then
  exit
fi
|
|
||||||
|
|
||||||
# One "<path>|<type>" entry per device reported by smartctl --scan-open.
device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

for device in ${device_list}; do
  disk="$(echo "${device}" | cut -f1 -d'|')"
  type="$(echo "${device}" | cut -f2 -d'|')"
  active=1
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Check if the device is in a low-power mode
  /usr/sbin/smartctl -n standby -d "${type}" "${disk}" >/dev/null || active=0
  echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}"
  # Skip further metrics to prevent the disk from spinning up
  test ${active} -eq 0 && continue
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case ${type} in
  sat | sat+megaraid*)
    /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}"
    ;;
  scsi | megaraid*)
    /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}"
    ;;
  *)
    # Diagnostics go to stderr: stdout of this loop is piped into
    # format_output and would otherwise end up in the metrics.
    echo "disk type is not sat, scsi or megaraid but ${type}" >&2
    exit
    ;;
  esac
done | format_output
|
|
|
@ -1,242 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Script to parse StorCLI's JSON output and expose
|
|
||||||
MegaRAID health as Prometheus metrics.
|
|
||||||
|
|
||||||
Tested against StorCLI 'Ver 1.14.12 Nov 25, 2014'.
|
|
||||||
|
|
||||||
StorCLI reference manual:
|
|
||||||
http://docs.avagotech.com/docs/12352476
|
|
||||||
|
|
||||||
Advanced Software Options (ASO) not exposed as metrics currently.
|
|
||||||
|
|
||||||
JSON key abbreviations used by StorCLI are documented in the standard command
|
|
||||||
output, i.e. when you omit the trailing 'J' from the command.
|
|
||||||
|
|
||||||
Formatting done with YAPF:
|
|
||||||
$ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import print_function
|
|
||||||
from datetime import datetime
|
|
||||||
import argparse
|
|
||||||
import collections
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shlex
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
# Program description text.
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
Prometheus metrics."""
VERSION = '0.0.3'

# Path of the StorCLI binary; assigned by main() from the parsed arguments.
storcli_path = ''
metric_prefix = 'megaraid_'
# Collected metrics; a missing key starts out as an empty list.
metric_list = collections.defaultdict(list)
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Collect metrics for all controllers and print them.

    Stores ``args.storcli_path`` in the module-level ``storcli_path``, queries
    StorCLI for all controllers and dispatches each controller's response to
    the handler matching its driver name. A ``KeyError`` raised while walking
    the data stops the processing of controllers; the metrics gathered so far
    are still printed.
    """
    global storcli_path
    storcli_path = args.storcli_path
    data = get_storcli_json('/cALL show all J')

    driver_handlers = {
        'megaraid_sas': handle_megaraid_controller,
        'mpt3sas': handle_sas_controller,
    }

    try:
        # All the information is collected underneath the Controllers key
        data = data['Controllers']

        for controller in data:
            response = controller['Response Data']

            handle_common_controller(response)
            handler = driver_handlers.get(response['Version']['Driver Name'])
            if handler is not None:
                handler(response)
    except KeyError:
        pass

    print_all_metrics(metric_list)
|
|
||||||
|
|
||||||
def handle_common_controller(response):
    """Add the metrics shared by all controller types.

    Currently this is the ``temperature`` metric read from the ``HwCfg``
    section. If the section uses the misspelled temperature key, the entry is
    renamed in place to the correctly spelled key first.
    """
    _, baselabel = get_basic_controller_info(response)
    hw_cfg = response['HwCfg']

    # Split up string to not trigger CodeSpell issues
    misspelled_key = 'ROC temperature(Degree Celc' + 'ius)'
    temperature_key = 'ROC temperature(Degree Celsius)'
    if misspelled_key in hw_cfg:
        hw_cfg[temperature_key] = hw_cfg.pop(misspelled_key)

    add_metric('temperature', baselabel, int(hw_cfg[temperature_key]))
|
|
||||||
|
|
||||||
def handle_sas_controller(response):
    """Add the metrics of a controller handled as a SAS controller.

    Adds ``healthy``, ``ports`` and ``physical_drives`` and then creates the
    per-drive metrics for every basic entry of the physical device
    information.
    """
    controller_index, baselabel = get_basic_controller_info(response)

    controller_status = response['Status']['Controller Status']
    add_metric('healthy', baselabel, int(controller_status == 'OK'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])

    try:
        # The number of physical disks is half of the number of items in this dict
        # Every disk is listed twice - once for basic info, again for detailed info
        add_metric('physical_drives', baselabel,
                   len(response['Physical Device Information'].keys()) / 2)
    except AttributeError:
        pass

    physical_devices = response['Physical Device Information']
    for key, basic_disk_info in physical_devices.items():
        # Entries carrying the detailed information are not drives of their own.
        if 'Detailed Information' not in key:
            create_metrics_of_physical_drive(
                basic_disk_info[0], physical_devices, controller_index)
|
|
||||||
|
|
||||||
|
|
||||||
def _controller_time_difference(basics):
    """Return the absolute host/controller clock skew in seconds, or -1 if unknown."""
    time_format = "%m/%d/%Y, %H:%M:%S"
    system_raw = basics.get('Current System Date/time')
    controller_raw = basics.get('Current Controller Date/Time')
    if not system_raw or not controller_raw:
        return -1
    try:
        system_time = datetime.strptime(system_raw, time_format)
        controller_time = datetime.strptime(controller_raw, time_format)
    except (TypeError, ValueError):
        # Unparsable timestamp: report "unknown" rather than abort the run
        return -1
    # total_seconds() includes whole days; .seconds alone would wrap every 24h
    return abs(system_time - controller_time).total_seconds()


def handle_megaraid_controller(response):
    """Emit status, virtual-drive and physical-drive metrics for a megaraid_sas controller.

    Args:
        response: The controller's 'Response Data' mapping.

    Raises:
        KeyError: If a mandatory section ('Status', 'HwCfg',
            'Scheduled Tasks', 'Basics', 'Physical Drives', ...) is missing.
    """
    (controller_index, baselabel) = get_basic_controller_info(response)
    status = response['Status']

    # BBU Status Optimal value is 0 for cachevault and 32 for BBU
    add_metric('battery_backup_healthy', baselabel,
               int(status['BBU Status'] in [0, 32]))
    add_metric('degraded', baselabel, int(status['Controller Status'] == 'Degraded'))
    add_metric('failed', baselabel, int(status['Controller Status'] == 'Failed'))
    add_metric('healthy', baselabel, int(status['Controller Status'] == 'Optimal'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))

    # A missing cachevault section must not raise: the resulting KeyError
    # would skip every metric emitted below for this controller.
    for cvidx, cvinfo in enumerate(response.get('Cachevault_Info') or []):
        add_metric('cv_temperature', baselabel + ',cvidx="' + str(cvidx) + '"',
                   int(cvinfo['Temp'].replace('C', '')))

    add_metric('time_difference', baselabel, _controller_time_difference(response['Basics']))

    # Make sure it doesn't crash if it's a JBOD setup
    if 'Drive Groups' in response.keys():
        add_metric('drive_groups', baselabel, response['Drive Groups'])
        add_metric('virtual_drives', baselabel, response['Virtual Drives'])

        for virtual_drive in response['VD LIST']:
            vd_position = virtual_drive.get('DG/VD')
            # -1 marks an unknown drive/volume group
            drive_group, volume_group = -1, -1
            if vd_position:
                drive_group = vd_position.split('/')[0]
                volume_group = vd_position.split('/')[1]
            vd_baselabel = 'controller="{0}",DG="{1}",VG="{2}"'.format(
                controller_index, drive_group, volume_group)
            vd_info_label = vd_baselabel + ',name="{0}",cache="{1}",type="{2}",state="{3}"'.format(
                str(virtual_drive.get('Name')).strip(),
                str(virtual_drive.get('Cache')).strip(),
                str(virtual_drive.get('TYPE')).strip(),
                str(virtual_drive.get('State')).strip())
            add_metric('vd_info', vd_info_label, 1)

    physical_drive_count = response['Physical Drives']
    add_metric('physical_drives', baselabel, physical_drive_count)
    if physical_drive_count > 0:
        # Per-drive details come from a separate storcli invocation; the
        # drive loop lives inside this branch so drive_info is always bound.
        data = get_storcli_json('/cALL/eALL/sALL show all J')
        drive_info = data['Controllers'][controller_index]['Response Data']
        for physical_drive in response['PD LIST']:
            create_metrics_of_physical_drive(physical_drive, drive_info, controller_index)
|
|
||||||
|
|
||||||
|
|
||||||
def get_basic_controller_info(response):
    """Register the 'controller_info' metric and return the controller identity.

    Args:
        response: The controller's 'Response Data' mapping; must provide
            'Basics' (Controller, Model, Serial Number) and
            'Version' (Firmware Version).

    Returns:
        A tuple ``(controller_index, baselabel)`` where ``baselabel`` is the
        rendered label string 'controller="<index>"'.

    Raises:
        KeyError: If any of the required entries is missing.
    """
    basics = response['Basics']
    controller_index = basics['Controller']
    baselabel = 'controller="{0}"'.format(controller_index)

    model = str(basics['Model']).strip()
    serial = str(basics['Serial Number']).strip()
    firmware = str(response['Version']['Firmware Version']).strip()
    controller_info_label = baselabel + ',model="{0}",serial="{1}",fwversion="{2}"'.format(
        model, serial, firmware)
    # Info-style metric: constant value 1, the data lives in the labels
    add_metric('controller_info', controller_info_label, 1)

    return (controller_index, baselabel)
|
|
||||||
|
|
||||||
|
|
||||||
def create_metrics_of_physical_drive(physical_drive, detailed_info_array, controller_index):
    """Emit the per-drive metrics and the 'pd_info' metric for one physical drive.

    Args:
        physical_drive: Mapping with the drive's basic info ('EID:Slt',
            'DID', 'Intf', 'Med', 'Model', 'DG', 'State').
        detailed_info_array: Mapping holding the detailed sections, keyed by
            '<drive identifier> - Detailed Information'.
        controller_index: Index of the controller the drive is attached to.

    Detailed metrics are best effort: as soon as an expected key is missing
    the remaining detailed metrics are skipped, but 'pd_info' is always
    emitted.
    """
    location = physical_drive.get('EID:Slt').split(':')
    enclosure = location[0]
    slot = location[1]

    pd_baselabel = 'controller="{0}",enclosure="{1}",slot="{2}"'.format(
        controller_index, enclosure, slot)
    info_values = [
        str(physical_drive.get(field)).strip()
        for field in ('DID', 'Intf', 'Med', 'Model', 'DG', 'State')
    ]
    pd_info_label = pd_baselabel + \
        ',disk_id="{0}",interface="{1}",media="{2}",model="{3}",DG="{4}",state="{5}"'.format(
            *info_values)

    if enclosure == ' ':
        # Blank enclosure id: the identifier carries no '/e' component
        drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot)
    else:
        drive_identifier = ('Drive /c' + str(controller_index)
                            + '/e' + str(enclosure) + '/s' + str(slot))

    try:
        info = detailed_info_array[drive_identifier + ' - Detailed Information']
        state = info[drive_identifier + ' State']
        attributes = info[drive_identifier + ' Device attributes']
        settings = info[drive_identifier + ' Policies/Settings']

        # Emitted one by one so metrics before a missing key are still kept
        for metric_name, state_key in (
                ('pd_shield_counter', 'Shield Counter'),
                ('pd_media_errors', 'Media Error Count'),
                ('pd_other_errors', 'Other Error Count'),
                ('pd_predictive_errors', 'Predictive Failure Count'),
        ):
            add_metric(metric_name, pd_baselabel, state[state_key])

        smart_alerted = state['S.M.A.R.T alert flagged by drive'] == 'Yes'
        add_metric('pd_smart_alerted', pd_baselabel, int(smart_alerted))

        # Speeds: only the part before the first '.' is reported
        link_speed = attributes['Link Speed'].split('.')[0]
        add_metric('pd_link_speed_gbps', pd_baselabel, link_speed)
        device_speed = attributes['Device Speed'].split('.')[0]
        add_metric('pd_device_speed_gbps', pd_baselabel, device_speed)

        commissioned_spare = settings['Commissioned Spare'] == 'Yes'
        add_metric('pd_commissioned_spare', pd_baselabel, int(commissioned_spare))
        emergency_spare = settings['Emergency Spare'] == 'Yes'
        add_metric('pd_emergency_spare', pd_baselabel, int(emergency_spare))

        pd_info_label += ',firmware="{0}"'.format(attributes['Firmware Revision'].strip())
    except KeyError:
        pass

    add_metric('pd_info', pd_info_label, 1)
|
|
||||||
|
|
||||||
|
|
||||||
def add_metric(name, labels, value):
    """Record one sample for metric ``name`` in the module-level ``metric_list``.

    Args:
        name: Metric name (without prefix); used as the key into
            ``metric_list``.
        labels: Pre-rendered label string, e.g. 'controller="0"'.
        value: Anything ``float()`` can convert. Values that are not
            numeric (unparsable strings, ``None``, ...) are silently dropped.
    """
    try:
        metric_list[name].append({
            'labels': labels,
            'value': float(value),
        })
    except (TypeError, ValueError):
        # float() raises ValueError for unparsable strings and TypeError for
        # None or other non-numeric types; neither is a usable sample.
        pass
|
|
||||||
|
|
||||||
|
|
||||||
def print_all_metrics(metrics):
    """Print every collected metric to stdout with HELP and TYPE headers.

    Args:
        metrics: Mapping of metric name to a list of measurements, each a
            mapping with a 'labels' string and a 'value'.
    """
    for metric, measurements in metrics.items():
        help_text = metric.replace('_', ' ')
        print('# HELP {0}{1} MegaRAID {2}'.format(metric_prefix, metric, help_text))
        print('# TYPE {0}{1} gauge'.format(metric_prefix, metric))
        for measurement in measurements:
            value = measurement['value']
            if value == 'Unknown':
                # Placeholder values are not exported
                continue
            label_block = '{' + measurement['labels'] + '}'
            print('{0}{1}{2} {3}'.format(metric_prefix, metric, label_block, value))
|
|
||||||
|
|
||||||
|
|
||||||
def get_storcli_json(storcli_args):
    """Get storcli output in JSON format.

    Args:
        storcli_args: Argument string passed to the storcli binary,
            e.g. '/cALL show all J'.

    Returns:
        The decoded JSON document produced by storcli.

    Raises:
        SystemExit: With exit status 1 if the binary at the module-level
            ``storcli_path`` is missing or not executable, or if storcli
            reports a status other than "Success" for the first controller.
    """
    # Check if storcli is installed and executable
    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
        # The exception must be raised; merely constructing it does nothing
        raise SystemExit(1)

    # Keep the binary path as a single argv element so that a path
    # containing whitespace is not split apart.
    storcli_cmd = [storcli_path] + shlex.split(storcli_args)
    proc = subprocess.Popen(
        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    data = json.loads(output_json.decode("utf-8"))

    if data["Controllers"][0]["Command Status"]["Status"] != "Success":
        raise SystemExit(1)
    return data
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    PARSER = argparse.ArgumentParser(
        description=DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    PARSER.add_argument(
        '--storcli_path',
        default='/opt/MegaRAID/storcli/storcli64',
        help='path to StorCLi binary',
    )
    PARSER.add_argument(
        '--version',
        action='version',
        version='%(prog)s {0}'.format(VERSION),
    )
    ARGS = PARSER.parse_args()

    main(ARGS)
|
|
|
@ -1,18 +0,0 @@
|
||||||
#!/bin/bash
#
# Description: Expose metrics from yum updates.
#
# Author: Slawomir Gonet <slawek@otwiera.cz>
#
# Based on apt.sh by Ben Kochie <superq@gmail.com>

# Count the pending updates per origin:
#   1. the first awk passes lines through until "Obsoleting Packages" appears
#   2. grep keeps only lines that start with "<word>.<word>"
#   3. the third column is extracted and counted with sort | uniq -c
#   4. the last awk renders one sample per origin
# grep -E replaces the obsolescent egrep alias.
upgrades=$(/usr/bin/yum -q check-updates \
  | awk 'BEGIN { mute=1 } /Obsoleting Packages/ { mute=0 } mute { print }' \
  | grep -E '^\w+\.\w+' \
  | awk '{print $3}' \
  | sort \
  | uniq -c \
  | awk '{print "yum_upgrades_pending{origin=\""$2"\"} "$1}')

echo '# HELP yum_upgrades_pending Yum package pending updates by origin.'
echo '# TYPE yum_upgrades_pending gauge'
if [[ -n "${upgrades}" ]] ; then
  echo "${upgrades}"
else
  # Always emit a sample so the metric exists even with nothing pending
  echo 'yum_upgrades_pending{origin=""} 0'
fi
|
|
||||||
|
|
Loading…
Reference in New Issue