2019-12-18 09:08:21 +00:00
|
|
|
#!@Python3_EXECUTABLE@
|
2018-07-24 05:05:01 +00:00
|
|
|
# -*- mode:python -*-
|
|
|
|
# vim: ts=4 sw=4 smarttab expandtab
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import logging
|
|
|
|
import os
|
2020-01-16 19:23:46 +00:00
|
|
|
import signal
|
2019-10-05 18:30:21 +00:00
|
|
|
import socket
|
2018-07-24 05:05:01 +00:00
|
|
|
import subprocess
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
|
|
|
|
# Emit INFO and above on the root handler; this script logs through the
# dedicated 'ceph-crash' logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Entity names tried, in this order, when posting a crash report:
# a per-host crash client, the generic crash client, then the admin client.
auth_names = [
    'client.crash.%s' % socket.gethostname(),
    'client.crash',
    'client.admin',
]
|
2018-07-24 05:05:01 +00:00
|
|
|
|
|
|
|
def parse_args():
    """Parse the command line of the crash-posting script.

    Returns the argparse namespace carrying ``path`` (str), ``delay``
    (float, minutes) and ``name`` (str, or None when not given).
    """
    cli = argparse.ArgumentParser()

    # Directory whose sub-directories are inspected for crash dumps.
    cli.add_argument('-p', '--path', default='/var/lib/ceph/crash',
                     help='base path to monitor for crash dumps')

    # Pause between scans, in minutes; 0 requests a single scan.
    cli.add_argument('-d', '--delay', default=10.0, type=float,
                     help='minutes to delay between scans (0 to exit after one)')

    # Optional single entity name to authenticate as.
    cli.add_argument('--name', '-n',
                     help='ceph name to authenticate as (default: try client.crash, client.admin)')

    options = cli.parse_args()
    return options
|
|
|
|
|
|
|
|
|
|
|
|
def post_crash(path):
    """Post one crash report to the cluster with ``ceph crash post``.

    The contents of ``<path>/meta`` are piped on stdin to
    ``timeout 30 ceph -n <name> crash post -i -`` for each entry of the
    module-level ``auth_names``, in order, stopping at the first attempt
    that exits with status 0. Every failed attempt is logged as a warning
    together with the command's stderr.

    :param path: directory of a single crash dump (must contain ``meta``)
    :returns: exit status of the last attempt; 0 means the post succeeded
    :raises OSError: if the meta file cannot be read or the command cannot
                     be started
    """
    # Read the report once, up front. The context manager closes the file
    # even if reading fails, and an unreadable meta file is detected before
    # any child process has been spawned.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # communicate() feeds stdin, drains both pipes and waits for the
        # child, so returncode is set once it returns.
        _, stderr = pr.communicate(input=meta)
        rc = pr.returncode
        if rc == 0:
            break
        # The pipes are binary; decode so the log shows text rather than
        # a bytes repr.
        log.warning('post %s as %s failed: %s',
                    path, n, stderr.decode('utf-8', 'replace'))
    return rc
|
|
|
|
|
|
|
|
|
|
|
|
def scrape_path(path):
    """Post every completed crash dump found directly under *path*.

    A sub-directory counts as a crash dump when it contains a ``meta``
    file, and as complete when it also contains a ``done`` file. Each
    complete dump is handed to ``post_crash()``; when that returns 0 the
    dump directory is moved to ``<path>/posted/<name>``. Dumps that fail
    to post stay in place and are retried on the next scan.

    :param path: base directory holding one sub-directory per crash
    :returns: None
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')

        if not os.path.isfile(metapath):
            # No meta file: not a crash dump, nothing to post.
            continue

        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Still incomplete: skip only this dump. Returning here
                # would abandon the rest of the listing, so a single
                # never-finished dump would block all later ones.
                continue

        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )
|
|
|
|
|
2020-01-16 19:23:46 +00:00
|
|
|
def handler(signum, frame):
    """Signal handler: report the signal number on stdout and exit with 0.

    ``frame`` is accepted because the signal-handler calling convention
    passes it, but it is not used.
    """
    notice = '*** Interrupted with signal %d ***' % signum
    print(notice)
    sys.exit(0)
|
2018-07-24 05:05:01 +00:00
|
|
|
|
|
|
|
def main():
    """Entry point: scan the crash directory and post new crash dumps.

    Installs handlers so SIGINT/SIGTERM end the process with exit code 0,
    parses the command line, waits until ``<path>/posted`` exists, then
    calls ``scrape_path()`` in a loop, sleeping ``--delay`` minutes between
    scans. With a delay of 0 the process exits after the first scan.
    """
    # --name has to replace the module-level list that post_crash() reads.
    # Without this declaration the assignment below would only bind a
    # local variable and the option would be silently ignored.
    global auth_names

    # exit code 0 on SIGINT, SIGTERM
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    args = parse_args()
    postdir = os.path.join(args.path, 'posted')
    if args.name:
        auth_names = [args.name]

    # Successfully posted dumps are moved into postdir, so it must exist
    # before scanning starts; keep reminding the operator until it does.
    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create" % postdir)
        time.sleep(30)

    # The delay is given in minutes; it is reported and slept in seconds.
    log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
    while True:
        scrape_path(args.path)
        if args.delay == 0:
            sys.exit(0)
        time.sleep(args.delay * 60)
|
|
|
|
|
|
|
|
|
|
|
|
# Start the monitor only when this file is executed as a script.
if __name__ == "__main__":
    main()
|