Add a sample btrfs stats collector script (#1200)

Signed-off-by: Anton Tolchanov <commits@knyar.net>
2018-12-21 13:10:03 +00:00 · 2018-12-21 13:10:03 +00:00 · cf8b29d1fb
parent 97dab59e18
commit cf8b29d1fb
1 changed files with 112 additions and 0 deletions
--- a/text_collector_examples/btrfs_stats.py
+++ b/text_collector_examples/btrfs_stats.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+# Collect per-device btrfs filesystem errors.
+# Designed to work on Debian and CentOS 6 (with python2.6).
+
+import collections
+import glob
+import os
+import re
+import subprocess
+
+def get_btrfs_mount_points(mounts_path="/proc/mounts"):
+    """List all btrfs mount points.
+
+    The mount table is expected to use the /proc/mounts format, in which
+    whitespace and backslashes inside a path are written as three-digit
+    octal escapes (a space becomes "\\040"). The escapes are decoded so
+    that the yielded paths can be handed directly to other programs.
+
+    Args:
+        mounts_path: (string) path to the mount table to read.
+
+    Yields:
+        (string) filesystem mount points.
+    """
+    with open(mounts_path) as f:
+        for line in f:
+            parts = line.split()
+            # Skip blank or truncated lines instead of failing on parts[2].
+            if len(parts) < 3 or parts[2] != "btrfs":
+                continue
+            # Only escapes below 0o400 are decoded, so chr() stays within
+            # the single-byte range on Python 2 as well.
+            yield re.sub(r"\\([0-3][0-7]{2})",
+                         lambda m: chr(int(m.group(1), 8)), parts[1])
+
+def get_btrfs_errors(mountpoint):
+    """Get per-device errors for a btrfs mount point.
+
+    Runs `btrfs device stats <mountpoint>` and parses its output.
+
+    Args:
+        mountpoint: (string) path to a mount point.
+
+    Yields:
+        (device, error_type, error_count) tuples, where:
+            device: (string) path to block device.
+            error_type: (string) type of btrfs error.
+            error_count: (int) number of btrfs errors of a given type.
+
+    Raises:
+        RuntimeError: if btrfs exits with a non-zero code, or prints a
+            non-blank line that does not match the expected format.
+    """
+    p = subprocess.Popen(["btrfs", "device", "stats", mountpoint],
+                         stdout=subprocess.PIPE)
+    # stderr is not piped, so the second element is always None.
+    stdout, _ = p.communicate()
+    if p.returncode != 0:
+        raise RuntimeError("btrfs returned exit code %d" % p.returncode)
+    # communicate() returns bytes on Python 3. Decode once up front so that
+    # the blank-line check and the error message below operate on text;
+    # comparing a bytes line against '' would never be true there.
+    for line in stdout.decode("utf-8").splitlines():
+        if not line.strip():
+            continue
+        # Sample line:
+        # [/dev/vdb1].flush_io_errs   0
+        m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line)
+        if not m:
+            raise RuntimeError("unexpected output from btrfs: '%s'" % line)
+        yield m.group(1), m.group(2), int(m.group(3))
+
+def btrfs_error_metrics():
+    """Collect btrfs error metrics.
+
+    Returns:
+        a list of strings to be exposed as Prometheus metrics, or None if
+        no samples were collected (i.e. no btrfs filesystems were found).
+    """
+    metric = "node_btrfs_errors_total"
+    contents = [
+        "# TYPE %s counter" % metric,
+        "# HELP %s number of btrfs errors" % metric,
+    ]
+    header_len = len(contents)
+
+    def escape(value):
+        # Label values in the Prometheus text format must have backslash,
+        # double quote and line feed escaped.
+        return (value.replace("\\", "\\\\")
+                     .replace('"', '\\"')
+                     .replace("\n", "\\n"))
+
+    for mountpoint in get_btrfs_mount_points():
+        for device, error_type, error_count in get_btrfs_errors(mountpoint):
+            contents.append(
+                '%s{mountpoint="%s",device="%s",type="%s"} %d' %
+                (metric, escape(mountpoint), escape(device),
+                 escape(error_type), error_count))
+
+    if len(contents) > header_len:
+        # return metrics if there are actual btrfs filesystems found
+        # (i.e. `contents` contains more than just TYPE and HELP).
+        return contents
+    return None
+
+def btrfs_allocation_metrics(sysfs_root="/sys/fs/btrfs"):
+    """Collect btrfs allocation metrics.
+
+    Reads one integer per (filesystem, block group type, counter) from
+    `<sysfs_root>/<fs>/allocation/<type>/<counter file>`.
+
+    Args:
+        sysfs_root: (string) directory holding one subdirectory per btrfs
+            filesystem, each containing an `allocation` directory.
+
+    Returns:
+        a list of strings to be exposed as Prometheus metrics, or None if
+        no allocation directories were found.
+    """
+    prefix = 'node_btrfs_allocation'
+    metric_to_filename = {
+        'size_bytes': 'total_bytes',
+        'used_bytes': 'bytes_used',
+        'reserved_bytes': 'bytes_reserved',
+        'pinned_bytes': 'bytes_pinned',
+        'disk_size_bytes': 'disk_total',
+        'disk_used_bytes': 'disk_used',
+    }
+    # Samples are collected per metric so they can be emitted as one group.
+    samples = dict((m, []) for m in metric_to_filename)
+    for alloc in glob.glob(os.path.join(sysfs_root, "*", "allocation")):
+        # The directory directly above `allocation` is used as the fs label.
+        fs = os.path.basename(os.path.dirname(alloc))
+        for type_ in ('data', 'metadata', 'system'):
+            for m, stat_name in metric_to_filename.items():
+                with open(os.path.join(alloc, type_, stat_name)) as stat_file:
+                    value = int(stat_file.read().strip())
+                samples[m].append('%s_%s{fs="%s",type="%s"} %d' % (
+                    prefix, m, fs, type_, value))
+
+    if not any(samples.values()):
+        return None
+
+    # The Prometheus text format expects all lines of a metric to form a
+    # single group, so each metric's TYPE and HELP lines are followed
+    # directly by its samples. Sorting keeps the output order stable.
+    contents = []
+    for m, stat_name in sorted(metric_to_filename.items()):
+        contents += [
+            "# TYPE %s_%s gauge" % (prefix, m),
+            "# HELP %s_%s btrfs allocation data (%s)" % (prefix, m, stat_name),
+        ]
+        contents += samples[m]
+    return contents
+
+if __name__ == "__main__":
+    # Run each collector in turn; a collector that found nothing returns
+    # None and contributes no lines to the output.
+    lines = []
+    for collect in (btrfs_error_metrics, btrfs_allocation_metrics):
+        lines.extend(collect() or [])
+
+    print("\n".join(lines))