ceph/qa/tasks/mgr/test_insights.py

224 lines
8.0 KiB
Python
Raw Normal View History

import logging
import json
import datetime
import time
from mgr_test_case import MgrTestCase
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Fixed UUID used as the suffix of every crash id generated by the tests.
UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
# strftime() format used to render the timestamps of generated crashes.
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
class TestInsights(MgrTestCase):
    """Integration tests for the ceph-mgr ``insights`` module.

    Every test talks to a running cluster through ``self.mgr_cluster`` and
    inspects the JSON report printed by the ``insights`` command.
    """

    def setUp(self):
        """Set up the managers and load the modules used by the tests."""
        self.setup_mgrs()
        self._load_module("insights")
        # the tests below issue "mgr self-test ..." commands
        self._load_module("selftest")
        # ids of the crashes posted by _add_crash, removed by _clear_crashes
        self.crash_ids = []

    def tearDown(self):
        """Remove the crash reports posted by the test that just ran."""
        self._clear_crashes()

    def _insights(self):
        """Run the ``insights`` command and return its output parsed as JSON."""
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
        return json.loads(retstr)

    def _add_crash(self, hours, make_invalid=False):
        """Post a crash report dated ``hours`` hours before the current time.

        :param hours: age of the crash in hours, relative to the current
            UTC time
        :param make_invalid: when true, the ``timestamp`` field of the
            posted report is replaced with a string that is not a
            timestamp; the crash id is still built from the real timestamp
        """
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(hours=hours)
        timestamp = timestamp.strftime(DATEFMT) + 'Z'
        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
        crash = {
            'crash_id': crash_id,
            'timestamp': "not a timestamp" if make_invalid else timestamp,
        }
        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=json.dumps(crash)
        )
        # remember the id before asserting, so that the crash is still
        # cleaned up when the assertion below fails
        self.crash_ids.append(crash_id)
        self.assertEqual(0, ret)

    def _clear_crashes(self):
        """Remove every crash posted through ``_add_crash``.

        The result of each ``crash rm`` command is not checked.
        """
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )
        # forget the removed ids so that a later call (e.g. from tearDown)
        # does not issue the same removals again
        self.crash_ids = []

    def _wait_for_report(self, predicate, timeout_secs=15,
                         interval_secs=0.25):
        """Poll the insights report until ``predicate(report)`` is true.

        :param predicate: callable taking the parsed report and returning a
            truthy value once the awaited condition holds
        :param timeout_secs: seconds to keep polling before failing the test
        :param interval_secs: seconds to sleep between two polls
        """
        deadline = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=timeout_secs)
        while True:
            if predicate(self._insights()):
                return
            # fails the test as soon as the deadline has passed
            self.assertGreater(deadline, datetime.datetime.utcnow())
            time.sleep(interval_secs)

    def _wait_for_health_history_checks(self, *args):
        """Wait for a set of health checks to appear in the health history"""
        def all_present(report):
            history = report["health"]["history"]["checks"]
            return all(check in history for check in args)

        self._wait_for_report(all_present)

    def _wait_for_curr_health_cleared(self, check):
        """Wait for ``check`` to disappear from the current health checks"""
        def cleared(report):
            return check not in report["health"]["current"]["checks"]

        self._wait_for_report(cleared)

    def _set_health_check(self, name):
        """Set a warning-level health check called ``name`` via self-test."""
        health_check = {
            name: {
                "severity": "warning",
                "summary": "summary",
                "detail": ["detail"]
            }
        }
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "mgr", "self-test", "health", "set",
            json.dumps(health_check))

    def test_health_history(self):
        """Health checks are recorded in, and can be pruned from, history"""
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # generate health check history entries by shifting the insights
        # module's perception of "now" by a whole number of hours.
        # NOTE(review): nothing here guards against the test running at
        # exactly the top of the hour; only the hour offset is sent to the
        # module -- confirm whether extra handling is needed.
        check_names = set()
        # the same name is set on every iteration to test deduplication
        dupe_check_name = "insights_health_check"
        for hours in [-18, -11, -5, -1, 0]:
            # change the insight module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past
            unique_check_name = "insights_health_check_{}".format(hours)
            self._set_health_check(unique_check_name)
            check_names.add(unique_check_name)

            # and also set the same health check to test deduplication
            self._set_health_check(dupe_check_name)
            check_names.add(dupe_check_name)

            # wait for the health check to show up in the history report
            self._wait_for_health_history_checks(
                unique_check_name, dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the
        # restart. we don't require them all to be present because
        # "earlier" checks may not have sat in memory long enough to be
        # flushed.
        report = self._insights()
        history = report["health"]["history"]["checks"]
        self.assertTrue(any(check in history for check in check_names))

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_insights_health(self):
        """The insights module reports health checks"""
        self._add_crash(1, True)  # add invalid crash data

        # poll once per second, for at most ten attempts
        for _ in range(10):
            time.sleep(1)
            # should observe a health check because it can't read the
            # invalid crash data created at the beginning of this test
            report = self._insights()
            if "MGR_INSIGHTS_WARNING" in \
                    report["health"]["current"]["checks"]:
                self._clear_crashes()
                return

        self._clear_crashes()
        self.fail("Insights module did not set health check")

    def test_schema(self):
        """The report contains all of the expected top-level sections"""
        # TODO: assert conformance to a full schema specification?
        report = self._insights()
        expected_keys = (
            "osd_metadata",
            "pg_summary",
            "mon_status",
            "manager_map",
            "service_map",
            "mon_map",
            "crush_map",
            "fs_map",
            "osd_tree",
            "df",
            "osd_dump",
            "config",
            "health",
            "crashes",
            "version",
            "errors",
        )
        for key in expected_keys:
            self.assertIn(key, report)

    def test_crash_history(self):
        """Posted crashes, and unreadable crash data, show up in the report"""
        self._clear_crashes()
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertFalse(report["errors"])

        # crashes show up in the report
        self._add_crash(1)
        report = self._insights()
        self.assertTrue(report["crashes"]["summary"])
        self.assertFalse(report["errors"])
        log.warning("%s", json.dumps(report["crashes"], indent=2))

        # handling of comm. error with crash module
        self._add_crash(1, True)
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertTrue(report["errors"])
        self._clear_crashes()