2017-03-08 11:05:00 +00:00
|
|
|
|
|
|
|
import logging
|
|
|
|
import json
|
|
|
|
|
|
|
|
from teuthology.task import Task
|
|
|
|
from teuthology import misc
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class CheckCounter(Task):
    """
    Use this task to validate that some daemon perf counters were
    incremented by the nested tasks.

    Config:
     'cluster_name': optional, specify which cluster
     'counters': dictionary of daemon type to list of performance counters.
                 Each entry is either a "subsys.counter" string (minimum
                 value 1) or a dictionary with a 'name' key and an optional
                 'min' key (default 1).
     'dry_run': just log the value of the counters, don't fail if they
                aren't high enough.

    Success condition is that for all of the named counters, at least
    one of the daemons of that type has the counter at or above its
    minimum value.

    Example to check cephfs dirfrag splits are happening:
    - install:
    - ceph:
    - ceph-fuse:
    - check-counter:
        counters:
            mds:
                - "mds.dir_split"
                -
                    name: "mds.dir_update"
                    min: 3
    - workunit: ...
    """

    @staticmethod
    def _counter_spec(counter):
        """Return the (name, minimum value) pair for one configured counter."""
        if isinstance(counter, dict):
            # 'min' is optional; fall back to the same threshold that the
            # plain-string form uses.
            return counter['name'], counter.get('min', 1)
        return counter, 1

    def start(self):
        """Log that the task has started; all checking happens in end()."""
        log.info("START")

    def end(self):
        """
        Read "perf dump" from every running daemon of each configured type
        and verify that each configured counter reached its minimum on at
        least one of them.

        Raises:
            RuntimeError: if 'dry_run' is not set and, for some daemon type,
                one or more counters did not reach their minimum on any
                daemon (including when no daemon could be queried at all).
        """
        overrides = self.ctx.config.get('overrides', {})
        misc.deep_merge(self.config, overrides.get('check-counter', {}))

        cluster_name = self.config.get('cluster_name', None)
        dry_run = self.config.get('dry_run', False)
        targets = self.config.get('counters', {})

        if cluster_name is None:
            # No cluster configured: use the first one known to the context.
            cluster_name = next(iter(self.ctx.managers.keys()))

        for daemon_type, counters in targets.items():
            specs = [self._counter_spec(counter) for counter in counters or []]

            # Collect the expected names up front.  If they were gathered
            # while iterating over daemons, a daemon type with no running
            # or responding daemon would leave this set empty and the
            # check would pass without having looked at a single counter.
            expected = {name for name, _ in specs}
            seen = set()

            # List of 'a', 'b', 'c'...
            daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
            daemons = {
                daemon_id: self.ctx.daemons.get_daemon(daemon_type, daemon_id)
                for daemon_id in daemon_ids
            }

            for daemon_id, daemon in daemons.items():
                if not daemon.running():
                    log.info("Ignoring daemon {0}, it isn't running".format(daemon_id))
                    continue
                log.debug("Getting stats from {0}".format(daemon_id))

                manager = self.ctx.managers[cluster_name]
                proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
                response_data = proc.stdout.getvalue().strip()
                if not response_data:
                    log.warning("No admin socket response from {0}, skipping".format(daemon_id))
                    continue
                perf_dump = json.loads(response_data)

                for name, minval in specs:
                    subsys, counter_id = name.split(".")
                    if subsys not in perf_dump or counter_id not in perf_dump[subsys]:
                        log.warning("Counter '{0}' not found on daemon {1}.{2}".format(
                            name, daemon_type, daemon_id))
                        continue
                    value = perf_dump[subsys][counter_id]

                    log.info("Daemon {0}.{1} {2}={3}".format(
                        daemon_type, daemon_id, name, value
                    ))

                    if value >= minval:
                        seen.add(name)

            if not dry_run:
                unseen = expected - seen
                if unseen:
                    raise RuntimeError("The following counters failed to be set "
                                       "on {0} daemons: {1}".format(
                                           daemon_type, unseen
                                       ))
|
|
|
|
|
|
|
|
# Module-level alias for the CheckCounter class.
# NOTE(review): presumably this is the name the task loader looks up when it
# runs "check-counter" -- confirm against the teuthology task-loading code.
task = CheckCounter
|