Add the inotify-instances text collector (#1186)
This is an alternative take on the embedded inotify collector: https://github.com/prometheus/node_exporter/pull/988 The proposed embedded collector was not accepted for inclusion because it was not possible for a single unprivileged node_exporter process to detect inotify resource utilisation in other user domains. This text collector works around the problem by giving the operator a choice between the following: - Run only the text collector as root to gain visibility over all processes on the system. - Run one or more instances of the text collector as an unprivileged user to gain visibility over subsets of the system. In either case, the data generated by this collector can be useful when hunting down inotify instance leaks -- and when confirming the resolution of such leaks. Signed-off-by: Saj Goonatilleke <sg@redu.cx>
This commit is contained in:
parent
83c9b11747
commit
d546916c6b
|
@ -0,0 +1,141 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Expose Linux inotify(7) instance resource consumption.
|
||||
|
||||
Operational properties:
|
||||
|
||||
- This script may be invoked as an unprivileged user; in this case, metrics
|
||||
will only be exposed for processes owned by that unprivileged user.
|
||||
|
||||
- No metrics will be exposed for processes that do not hold any inotify fds.
|
||||
|
||||
Requires Python 3.5 or later.
|
||||
"""
|
||||
|
||||
import collections
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
class Error(Exception):
    """Root of the exception hierarchy defined by this script."""
|
||||
|
||||
|
||||
class _PIDGoneError(Error):
    """Signal that a process vanished from /proc while being inspected."""
|
||||
|
||||
|
||||
# Immutable per-process record: the PID, the owning UID, the command name
# and the number of inotify instances the process holds open.
_Process = collections.namedtuple(
    "Process",
    ("pid", "uid", "command", "inotify_instances"),
)
|
||||
|
||||
|
||||
def _read_bytes(name):
    """Return the complete contents of the file *name* as ``bytes``."""
    with open(name, "rb") as handle:
        contents = handle.read()
    return contents
|
||||
|
||||
|
||||
def _pids():
    """Yield, as ``int``, every entry of /proc whose name is all digits."""
    yield from (
        int(entry) for entry in os.listdir("/proc") if entry.isdigit()
    )
|
||||
|
||||
|
||||
def _pid_uid(pid):
    """Return the UID that owns the ``/proc/<pid>`` directory.

    Raises:
        _PIDGoneError: if ``/proc/<pid>`` no longer exists.
    """
    proc_dir = "/proc/{}".format(pid)
    try:
        return os.stat(proc_dir).st_uid
    except FileNotFoundError:
        raise _PIDGoneError()
|
||||
|
||||
|
||||
def _pid_command(pid):
    """Return the basename of the program named in a process's command line.

    The name is taken from the first NUL-terminated field of
    ``/proc/<pid>/cmdline``.  An empty command line is reported as the
    placeholder ``"<zombie>"``.

    Args:
        pid: Numeric process ID to look up under /proc.

    Returns:
        The program basename as ``str``.  Bytes outside ASCII are kept as
        surrogate escapes rather than raising a decode error.

    Raises:
        _PIDGoneError: if the process disappeared before or while its
            command line was being read.
    """
    # Avoid GNU ps(1) for it truncates comm.
    # https://bugs.launchpad.net/ubuntu/+source/procps/+bug/295876/comments/3
    try:
        cmdline = _read_bytes("/proc/{}/cmdline".format(pid))
    except (FileNotFoundError, ProcessLookupError) as err:
        # ENOENT: the PID was already gone when the file was opened.
        # ESRCH: the process exited between open() and read().
        # Both mean the same thing to the caller: skip this PID.
        raise _PIDGoneError() from err

    if not cmdline:
        return "<zombie>"

    # Arguments are NUL-separated; keep everything before the first NUL, or
    # the whole buffer when no NUL is present.
    prog = cmdline.split(b"\x00", 1)[0]
    return os.path.basename(prog).decode(encoding="ascii",
                                         errors="surrogateescape")
|
||||
|
||||
|
||||
def _pid_inotify_instances(pid):
    """Count the open file descriptors of *pid* that are inotify instances.

    A descriptor counts when its ``/proc/<pid>/fd/<n>`` symlink resolves to
    ``anon_inode:inotify``.  Descriptors that disappear between the
    directory listing and the ``readlink`` call are ignored.

    Raises:
        _PIDGoneError: if ``/proc/<pid>/fd`` no longer exists.
    """
    try:
        fd_names = os.listdir("/proc/{}/fd".format(pid))
    except FileNotFoundError:
        raise _PIDGoneError()

    count = 0
    for fd_name in fd_names:
        try:
            link = os.readlink("/proc/{}/fd/{}".format(pid, fd_name))
        except FileNotFoundError:
            # The descriptor was closed after the directory was listed.
            continue
        if link == "anon_inode:inotify":
            count += 1
    return count
|
||||
|
||||
|
||||
def _get_processes():
    """Yield a ``_Process`` record for each PID that can be inspected.

    PIDs that raise ``PermissionError`` or ``_PIDGoneError`` while being
    examined are skipped silently.
    """
    for pid in _pids():
        try:
            owner = _pid_uid(pid)
            command = _pid_command(pid)
            instances = _pid_inotify_instances(pid)
            yield _Process(pid, owner, command, instances)
        except (PermissionError, _PIDGoneError):
            continue
|
||||
|
||||
|
||||
def _get_processes_nontrivial():
    """Return a generator over processes holding at least one inotify fd."""
    candidates = _get_processes()
    return (proc for proc in candidates if proc.inotify_instances > 0)
|
||||
|
||||
|
||||
def _format_gauge_metric(metric_name, metric_help, samples,
                         value_func, tags_func=None, stream=sys.stdout):
    """Write one gauge metric family in the Prometheus text format.

    Emits a ``# HELP`` line, a ``# TYPE ... gauge`` line and then one sample
    line per element of *samples*.

    Args:
        metric_name: Metric name, written verbatim.
        metric_help: Help text; backslashes and newlines are escaped.
        samples: Iterable of arbitrary sample objects.
        value_func: Callable mapping a sample to its metric value.
        tags_func: Optional callable mapping a sample to an iterable of
            ``(label_name, label_value)`` pairs.  When it is omitted, or
            returns something falsy, the sample is written without labels.
        stream: Text stream that receives the output.
    """

    def _escape_help(text):
        # HELP text: escape backslash first so later escapes are not doubled.
        return str(text).replace("\\", "\\\\").replace("\n", "\\n")

    def _escape_label_value(text):
        # Label values additionally need the double quote escaped; without
        # this a command name containing '"' yields an unparseable line.
        return _escape_help(text).replace('"', '\\"')

    print("# HELP {} {}".format(metric_name, _escape_help(metric_help)),
          file=stream)
    print("# TYPE {} gauge".format(metric_name), file=stream)

    for sample in samples:
        value = value_func(sample)
        tags = tags_func(sample) if tags_func else None

        labels = ""
        if tags:
            pairs = ",".join(
                '{}="{}"'.format(key, _escape_label_value(val))
                for key, val in tags
            )
            labels = "{" + pairs + "}"

        print("{}{} {}".format(metric_name, labels, str(value)), file=stream)
|
||||
|
||||
|
||||
def main(args_unused=None):
    """Print the ``inotify_instances`` gauge for all visible processes.

    One sample is written per process that holds at least one inotify
    instance, labelled with its pid, uid and command.  *args_unused* is
    accepted but ignored.
    """

    def _sample_value(proc):
        return proc.inotify_instances

    def _sample_labels(proc):
        return [
            ("pid", proc.pid),
            ("uid", proc.uid),
            ("command", proc.command),
        ]

    _format_gauge_metric(
        "inotify_instances",
        "Total number of inotify instances held open by a process.",
        _get_processes_nontrivial(),
        _sample_value,
        _sample_labels)
|
||||
|
||||
|
||||
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)
|
Loading…
Reference in New Issue