ceph/src/ceph-crash.in
Sage Weil 4b93fb8d31 ceph-crash: exit code 0 on SIGINT, SIGTERM
This makes the systemd unit not go into a failure state when you kill the
cephadm unit's podman container.

Signed-off-by: Sage Weil <sage@redhat.com>
2020-01-16 13:25:52 -06:00

107 lines
3.1 KiB
Python
Executable File

#!@Python3_EXECUTABLE@
# -*- mode:python -*-
# vim: ts=4 sw=4 smarttab expandtab
import argparse
import logging
import os
import signal
import socket
import subprocess
import sys
import time
# Emit INFO and above through the root logger; everything in this script
# logs via the named 'ceph-crash' logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Entity names tried, in order, when posting a crash report: the
# host-specific crash client first, then the generic crash client, and
# finally the admin client.
auth_names = [
    'client.crash.%s' % socket.gethostname(),
    'client.crash',
    'client.admin',
]
def parse_args():
    """Parse the command line and return the resulting namespace.

    Options:
      -p/--path   directory to scan (default ``/var/lib/ceph/crash``)
      -d/--delay  float, minutes between scans (default 10.0)
      --name/-n   entity name to authenticate as (default ``None``)
    """
    # (option strings, keyword arguments) for each supported option, in the
    # order they are registered with the parser.
    option_specs = (
        (('-p', '--path'),
         dict(default='/var/lib/ceph/crash',
              help='base path to monitor for crash dumps')),
        (('-d', '--delay'),
         dict(default=10.0, type=float,
              help='minutes to delay between scans (0 to exit after one)')),
        (('--name', '-n'),
         dict(help='ceph name to authenticate as (default: try client.crash, client.admin)')),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def post_crash(path):
    """Post the crash report stored in directory *path* to the cluster.

    The contents of ``<path>/meta`` are piped on stdin to
    ``timeout 30 ceph -n <name> crash post -i -``.  Each entity name in the
    module-level ``auth_names`` list is tried in order until one succeeds.

    Returns the exit status of the last attempt: 0 as soon as one attempt
    succeeds, otherwise the non-zero status of the final failed attempt.

    Raises OSError if the ``meta`` file cannot be read or the command
    cannot be started.
    """
    # Read the report once, up front.  The payload is the same for every
    # attempt, the file handle is always closed, and an unreadable file is
    # detected before any child process has been spawned.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # communicate() waits for the child to exit, so returncode is set.
        _, stderr = pr.communicate(input=meta)
        rc = pr.returncode
        if rc == 0:
            break
        log.warning('post %s as %s failed: %s' % (path, n, stderr))
    return rc
def scrape_path(path):
    """Post every complete crash dump found directly under *path*.

    An entry of *path* is treated as a crash dump when it contains a
    ``meta`` file, and as complete once it also contains a ``done`` file.
    A dump that is still incomplete after a one-second grace period is
    skipped for this scan.  Dumps for which ``post_crash`` returns 0 are
    moved into ``<path>/posted/``; failed ones are left in place.
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(metapath):
            continue
        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Skip only this dump.  Returning here would abandon the
                # rest of the listing, so a single dump that never
                # completes would keep all later ones from being posted.
                continue
        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )
def handler(signum, frame):
    """Signal handler: report the signal number and exit with status 0.

    *frame* is accepted because the signal-handler calling convention
    passes it, but it is not used.
    """
    notice = '*** Interrupted with signal %d ***' % signum
    print(notice)
    sys.exit(0)
def main():
    """Entry point: scan the crash directory and post new dumps.

    Installs handlers so SIGINT/SIGTERM exit with status 0, parses the
    command line, waits until ``<path>/posted`` exists, then calls
    ``scrape_path`` repeatedly, sleeping ``--delay`` minutes between
    scans.  With a delay of 0 it exits with status 0 after one scan.
    """
    # post_crash() reads the module-level auth_names.  Without this
    # declaration the assignment below would only bind a local name and
    # --name would be silently ignored.
    global auth_names

    # exit code 0 on SIGINT, SIGTERM
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    args = parse_args()
    postdir = os.path.join(args.path, 'posted')
    if args.name:
        auth_names = [args.name]

    # Posted dumps are moved into postdir, so do not start scanning until
    # it exists.
    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create" % postdir)
        time.sleep(30)

    log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
    while True:
        scrape_path(args.path)
        if args.delay == 0:
            sys.exit(0)
        # args.delay is in minutes; time.sleep() takes seconds.
        time.sleep(args.delay * 60)
if __name__ == "__main__":
    # Only start scanning when executed as a script, not when imported.
    main()