ceph/qa/tasks/systemd.py

143 lines
6.4 KiB
Python
Raw Normal View History

"""
Systemd test
"""
import contextlib
import logging
import re
import time
from cStringIO import StringIO
from teuthology.orchestra import run
from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
log = logging.getLogger(__name__)
def _unit_is_inactive(status_output):
    """
    Tell whether ``systemctl status`` output reports a stopped unit.

    :param status_output: text captured from ``systemctl status <unit>``
    :returns: True when the text contains ``Active: inactive``
    """
    # Membership test on purpose: str.find() returns -1 (truthy) on a miss
    # and 0 (falsy) on a match at offset 0, so it cannot be used as a bool.
    return 'Active: inactive' in status_output


def _stop_check_start(remote, service, stopped_msg, failed_msg):
    """
    Stop a single systemd unit, report whether it stopped, and start it again.

    :param remote: remote on which the systemctl commands are run
    :param service: full unit name, e.g. ``ceph-mon@<host>.service``
    :param stopped_msg: message logged when the unit is reported inactive
    :param failed_msg: message logged when the unit is not reported inactive
    """
    # The first status call uses the default status checking of remote.run,
    # exactly as the per-daemon sequences did before being factored out.
    remote.run(args=['sudo', 'systemctl', 'status', service])
    remote.run(args=['sudo', 'systemctl', 'stop', service])
    time.sleep(4)  # immediate check will result in deactivating state
    # check_status=False: the exit status of 'systemctl status' on a stopped
    # unit must not abort the task, we only want to inspect the text.
    r = remote.run(args=['sudo', 'systemctl', 'status', service],
                   stdout=StringIO(), check_status=False)
    status = r.stdout.getvalue()
    log.info(status)
    if _unit_is_inactive(status):
        log.info(stopped_msg)
    else:
        log.info(failed_msg)
    remote.run(args=['sudo', 'systemctl', 'start', service])
    time.sleep(4)


@contextlib.contextmanager
def task(ctx, config):
    """
    Exercise the Ceph systemd units on every remote, then reboot the cluster.

      - tasks:
          ceph-deploy:
          systemd:

    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors

    For each remote this stops and starts ``ceph.target``, then stops and
    starts one OSD unit (the first ``--id N`` found in the ``ps`` output) and
    the mon/mgr/mds units when the remote holds the matching
    ``<type>.<shortname>`` role.  Afterwards all nodes are rebooted and the
    task waits for the cluster to become healthy before yielding.

    :param ctx: teuthology context; ``ctx.cluster`` is used to reach remotes
    :param config: task configuration, passed through to ``get_first_mon``
    """
    for remote, roles in ctx.cluster.remotes.items():
        # debug listing of the ceph processes before anything is touched
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO(),
                       check_status=False)
        units = r.stdout.getvalue()
        log.info(units)
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO(), check_status=False)
        target_status = r.stdout.getvalue()
        log.info(target_status)
        log.info("Checking process status")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        log.info(r.stdout.getvalue())
        # Judge the stop by the systemctl status text, not by the ps listing
        # (which never contains an 'Active:' line).
        if _unit_is_inactive(target_status):
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO())
        target_status = r.stdout.getvalue()
        log.info(target_status)
        if 'Active: active' in target_status:
            log.info("Successfully started all Ceph services")
        else:
            log.info("Failed to start Ceph services")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        processes = r.stdout.getvalue()
        log.info(processes)
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        # Pick the id of the first OSD process shown by ps, if any.
        m_osd = re.search(r'--id (\d+) --setuser ceph', processes)
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            _stop_check_start(remote, osd_service,
                              "Successfully stopped single osd ceph service",
                              "Failed to stop ceph osd services")
        if mon_role_name in roles:
            _stop_check_start(remote, mon_name,
                              "Successfully stopped single mon ceph service",
                              "Failed to stop ceph mon service")
        if mgr_role_name in roles:
            _stop_check_start(remote, mgr_name,
                              "Successfully stopped single ceph mgr service",
                              "Failed to stop ceph mgr service")
        if mds_role_name in roles:
            _stop_check_start(remote, mds_name,
                              "Successfully stopped single ceph mds service",
                              "Failed to stop ceph mds service")

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd unit doesnt start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield