ceph/qa/tasks/systemd.py

143 lines
6.4 KiB
Python

"""
Systemd test
"""
import contextlib
import logging
import re
import time
from cStringIO import StringIO
from teuthology.orchestra import run
from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
log = logging.getLogger(__name__)
@contextlib.contextmanager
def task(ctx, config):
"""
- tasks:
ceph-deploy:
systemd:
Test ceph systemd services can start, stop and restart and
check for any failed services and report back errors
"""
for remote, roles in ctx.cluster.remotes.iteritems():
remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
'grep', 'ceph'])
r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
'grep', 'ceph'], stdout=StringIO(),
check_status=False)
log.info(r.stdout.getvalue())
if r.stdout.getvalue().find('failed'):
log.info("Ceph services in failed state")
# test overall service stop and start using ceph.target
# ceph.target tests are meant for ceph systemd tests
# and not actual process testing using 'ps'
log.info("Stopping all Ceph services")
remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
stdout=StringIO(), check_status=False)
log.info(r.stdout.getvalue())
log.info("Checking process status")
r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
'grep', 'ceph'], stdout=StringIO())
if r.stdout.getvalue().find('Active: inactive'):
log.info("Successfully stopped all ceph services")
else:
log.info("Failed to stop ceph services")
log.info("Starting all Ceph services")
remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
stdout=StringIO())
log.info(r.stdout.getvalue())
if r.stdout.getvalue().find('Active: active'):
log.info("Successfully started all Ceph services")
else:
log.info("info", "Failed to start Ceph services")
r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
'grep', 'ceph'], stdout=StringIO())
log.info(r.stdout.getvalue())
time.sleep(4)
# test individual services start stop
name = remote.shortname
mon_name = 'ceph-mon@' + name + '.service'
mds_name = 'ceph-mds@' + name + '.service'
mgr_name = 'ceph-mgr@' + name + '.service'
mon_role_name = 'mon.' + name
mds_role_name = 'mds.' + name
mgr_role_name = 'mgr.' + name
m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue())
if m_osd:
osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
remote.run(args=['sudo', 'systemctl', 'status',
osd_service])
remote.run(args=['sudo', 'systemctl', 'stop',
osd_service])
time.sleep(4) # immediate check will result in deactivating state
r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
stdout=StringIO(), check_status=False)
log.info(r.stdout.getvalue())
if r.stdout.getvalue().find('Active: inactive'):
log.info("Successfully stopped single osd ceph service")
else:
log.info("Failed to stop ceph osd services")
remote.run(args=['sudo', 'systemctl', 'start',
osd_service])
time.sleep(4)
if mon_role_name in roles:
remote.run(args=['sudo', 'systemctl', 'status', mon_name])
remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
time.sleep(4) # immediate check will result in deactivating state
r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
stdout=StringIO(), check_status=False)
if r.stdout.getvalue().find('Active: inactive'):
log.info("Successfully stopped single mon ceph service")
else:
log.info("Failed to stop ceph mon service")
remote.run(args=['sudo', 'systemctl', 'start', mon_name])
time.sleep(4)
if mgr_role_name in roles:
remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
time.sleep(4) # immediate check will result in deactivating state
r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
stdout=StringIO(), check_status=False)
if r.stdout.getvalue().find('Active: inactive'):
log.info("Successfully stopped single ceph mgr service")
else:
log.info("Failed to stop ceph mgr service")
remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
time.sleep(4)
if mds_role_name in roles:
remote.run(args=['sudo', 'systemctl', 'status', mds_name])
remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
time.sleep(4) # immediate check will result in deactivating state
r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
stdout=StringIO(), check_status=False)
if r.stdout.getvalue().find('Active: inactive'):
log.info("Successfully stopped single ceph mds service")
else:
log.info("Failed to stop ceph mds service")
remote.run(args=['sudo', 'systemctl', 'start', mds_name])
time.sleep(4)
# reboot all nodes and verify the systemd units restart
# workunit that runs would fail if any of the systemd unit doesnt start
ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
# avoid immediate reconnect
time.sleep(120)
reconnect(ctx, 480) # reconnect all nodes
# for debug info
ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
'grep', 'ceph'])
# wait for HEALTH_OK
mon = get_first_mon(ctx, config)
(mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
wait_until_healthy(ctx, mon_remote, use_sudo=True)
yield