Add the inotify-instances text collector (#1186)

This is an alternative take on the embedded inotify collector:

https://github.com/prometheus/node_exporter/pull/988

The proposed embedded collector was not accepted for inclusion because
it was not possible for a single unprivileged node_exporter process to
detect inotify resource utilisation in other user domains.

This text collector works around the problem by giving the operator a
choice between the following:

  - Run only the text collector as root to gain visibility over all
    processes on the system.

  - Run one or more instances of the text collector as an unprivileged
    user to gain visibility over subsets of the system.

In either case, the data generated by this collector can be useful when
hunting down inotify instance leaks -- and when confirming the
resolution of such leaks.

Signed-off-by: Saj Goonatilleke <sg@redu.cx>
This commit is contained in:
Saj Goonatilleke 2019-02-27 11:03:25 +11:00 committed by Ben Kochie
parent 83c9b11747
commit d546916c6b
1 changed files with 141 additions and 0 deletions

View File

@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Expose Linux inotify(7) instance resource consumption.
Operational properties:
- This script may be invoked as an unprivileged user; in this case, metrics
will only be exposed for processes owned by that unprivileged user.
- No metrics will be exposed for processes that do not hold any inotify fds.
Requires Python 3.5 or later.
"""
import collections
import os
import sys
class Error(Exception):
pass
class _PIDGoneError(Error):
pass
_Process = collections.namedtuple(
"Process", ["pid", "uid", "command", "inotify_instances"])
def _read_bytes(name):
with open(name, mode='rb') as f:
return f.read()
def _pids():
for n in os.listdir("/proc"):
if not n.isdigit():
continue
yield int(n)
def _pid_uid(pid):
try:
s = os.stat("/proc/{}".format(pid))
except FileNotFoundError:
raise _PIDGoneError()
return s.st_uid
def _pid_command(pid):
# Avoid GNU ps(1) for it truncates comm.
# https://bugs.launchpad.net/ubuntu/+source/procps/+bug/295876/comments/3
try:
cmdline = _read_bytes("/proc/{}/cmdline".format(pid))
except FileNotFoundError:
raise _PIDGoneError()
if not len(cmdline):
return "<zombie>"
try:
prog = cmdline[0:cmdline.index(0x00)]
except ValueError:
prog = cmdline
return os.path.basename(prog).decode(encoding="ascii",
errors="surrogateescape")
def _pid_inotify_instances(pid):
instances = 0
try:
for fd in os.listdir("/proc/{}/fd".format(pid)):
try:
target = os.readlink("/proc/{}/fd/{}".format(pid, fd))
except FileNotFoundError:
continue
if target == "anon_inode:inotify":
instances += 1
except FileNotFoundError:
raise _PIDGoneError()
return instances
def _get_processes():
for p in _pids():
try:
yield _Process(p, _pid_uid(p), _pid_command(p),
_pid_inotify_instances(p))
except (PermissionError, _PIDGoneError):
continue
def _get_processes_nontrivial():
return (p for p in _get_processes() if p.inotify_instances > 0)
def _format_gauge_metric(metric_name, metric_help, samples,
value_func, tags_func=None, stream=sys.stdout):
def _println(*args, **kwargs):
if "file" not in kwargs:
kwargs["file"] = stream
print(*args, **kwargs)
def _print(*args, **kwargs):
if "end" not in kwargs:
kwargs["end"] = ""
_println(*args, **kwargs)
_println("# HELP {} {}".format(metric_name, metric_help))
_println("# TYPE {} gauge".format(metric_name))
for s in samples:
value = value_func(s)
tags = None
if tags_func:
tags = tags_func(s)
_print(metric_name)
if tags:
_print("{")
_print(",".join(["{}=\"{}\"".format(k, v) for k, v in tags]))
_print("}")
_print(" ")
_println(value)
def main(args_unused=None):
_format_gauge_metric(
"inotify_instances",
"Total number of inotify instances held open by a process.",
_get_processes_nontrivial(),
lambda s: s.inotify_instances,
lambda s: [("pid", s.pid), ("uid", s.uid), ("command", s.command)])
if __name__ == "__main__":
sys.exit(main(sys.argv))