ceph/qa/tasks/recovery_bench.py

"""
Recovery system benchmarking
"""
from cStringIO import StringIO
import contextlib
import gevent
import json
import logging
import random
import time

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

@contextlib.contextmanager
def task(ctx, config):
"""
Benchmark the recovery system.
Generates objects with smalliobench, runs it normally to get a
baseline performance measurement, then marks an OSD out and reruns
to measure performance during recovery.
The config should be as follows:
recovery_bench:
duration: <seconds for each measurement run>
num_objects: <number of objects>
io_size: <io size in bytes>
example:
tasks:
- ceph:
- recovery_bench:
duration: 60
num_objects: 500
io_size: 4096
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'recovery_bench task only accepts a dict for configuration'
log.info('Beginning recovery bench...')
first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'),
)
num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
while len(manager.get_osd_status()['up']) < num_osds:
time.sleep(10)
bench_proc = RecoveryBencher(
manager,
config,
)
try:
yield
finally:
log.info('joining recovery bencher')
bench_proc.do_join()

class RecoveryBencher:
    """
    Benchmarks smalliobench performance against a healthy cluster and
    again while an OSD is marked out, so the two runs can be compared.
    """
    def __init__(self, manager, config):
        self.ceph_manager = manager
        self.ceph_manager.wait_for_clean()

        osd_status = self.ceph_manager.get_osd_status()
        self.osds = osd_status['up']

        self.config = config
        if self.config is None:
            self.config = dict()
        else:
            def tmp(x):
                """
                Local wrapper to print value.
                """
                print x
            self.log = tmp

        log.info("spawning thread")
        self.thread = gevent.spawn(self.do_bench)

    def do_join(self):
        """
        Join the recovery bencher. This is called after the main
        task exits.
        """
        self.thread.get()

    def do_bench(self):
        """
        Do the benchmarking.
        """
        duration = self.config.get("duration", 60)
        num_objects = self.config.get("num_objects", 500)
        io_size = self.config.get("io_size", 4096)

        osd = str(random.choice(self.osds))
        (osd_remote,) = self.ceph_manager.ctx.cluster.only(
            'osd.%s' % osd).remotes.iterkeys()

        testdir = teuthology.get_testdir(self.ceph_manager.ctx)

        # create the objects
        osd_remote.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                '{tdir}/archive/coverage'.format(tdir=testdir),
                'smalliobench',
                '--use-prefix', 'recovery_bench',
                '--init-only', '1',
                '--num-objects', str(num_objects),
                '--io-size', str(io_size),
            ],
            wait=True,
        )
        # baseline bench
        log.info('non-recovery (baseline)')
        p = osd_remote.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                '{tdir}/archive/coverage'.format(tdir=testdir),
                'smalliobench',
                '--use-prefix', 'recovery_bench',
                '--do-not-init', '1',
                '--duration', str(duration),
                '--io-size', str(io_size),
            ],
            stdout=StringIO(),
            stderr=StringIO(),
            wait=True,
        )
        self.process_samples(p.stderr.getvalue())
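        # Marking the OSD out triggers recovery (data is migrated off the
        # out OSD); the rerun below measures client performance while that
        # recovery is in progress.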
        self.ceph_manager.raw_cluster_cmd('osd', 'out', osd)
        time.sleep(5)

        # recovery bench
        log.info('recovery active')
        p = osd_remote.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                '{tdir}/archive/coverage'.format(tdir=testdir),
                'smalliobench',
                '--use-prefix', 'recovery_bench',
                '--do-not-init', '1',
                '--duration', str(duration),
                '--io-size', str(io_size),
            ],
            stdout=StringIO(),
            stderr=StringIO(),
            wait=True,
        )
        self.process_samples(p.stderr.getvalue())

        self.ceph_manager.raw_cluster_cmd('osd', 'in', osd)

    def process_samples(self, input):
        """
        Extract samples from the input and process the results.

        :param input: input lines in JSON format
        """
        lat = {}
        for line in input.split('\n'):
            try:
                sample = json.loads(line)
                samples = lat.setdefault(sample['type'], [])
                samples.append(float(sample['latency']))
            except Exception:
                pass
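        # Worked example of the statistics computed below (a sketch, not
        # output from this file): for sorted samples [1.0, 2.0, 4.0, 8.0],
        # num = 4 is even, so median = (samples[2] + samples[1]) / 2 = 3.0,
        # and the reported 99% value is samples[int(4 * 0.99)], i.e.
        # samples[3] = 8.0.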
        for type in lat:
            samples = lat[type]
            samples.sort()
            num = len(samples)
            # median
            if num & 1 == 1:  # odd number of samples
                median = samples[num / 2]
            else:
                median = (samples[num / 2] + samples[num / 2 - 1]) / 2
            # 99%
            ninety_nine = samples[int(num * 0.99)]
            log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine))