mirror of
https://github.com/ceph/ceph
synced 2024-12-22 11:31:55 +00:00
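"""Integration tests for the mgr insights module.

These tests drive the module through the `ceph insights` and
`ceph insights prune-health` commands, and use the selftest module to
inject health checks and to shift the insights module's notion of "now".
"""
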
import logging
import json
import datetime
import time
from mgr_test_case import MgrTestCase

log = logging.getLogger(__name__)

UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'


class TestInsights(MgrTestCase):
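    """Tests for the insights module: report schema, health history, and
    crash reporting."""
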
    def setUp(self):
        self.setup_mgrs()
        self._load_module("insights")
        self._load_module("selftest")
        self.crash_ids = []

    def tearDown(self):
        self._clear_crashes()

    def _insights(self):
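        """Run the `ceph insights` command and decode its JSON report.

        The decoded report is a dict whose top-level keys include "health",
        "crashes", and "errors", among others (see test_schema below).
        """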
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
        return json.loads(retstr)

    def _add_crash(self, hours, make_invalid=False):
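        """Post a synthetic crash report timestamped `hours` in the past.

        With make_invalid=True the timestamp is corrupted so that the
        insights module cannot parse the crash metadata.
        """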
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(hours=hours)
        timestamp = timestamp.strftime(DATEFMT) + 'Z'
        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
        crash = {
            'crash_id': crash_id,
            'timestamp': timestamp,
        }
        if make_invalid:
            crash["timestamp"] = "not a timestamp"

        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=json.dumps(crash)
        )
        self.crash_ids.append(crash_id)
        self.assertEqual(0, ret)

    def _clear_crashes(self):
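        """Remove every crash report posted via _add_crash."""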
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )

    def _wait_for_health_history_checks(self, *args):
        """Wait for a set of health checks to appear in the health history"""
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=15)
        while True:
            report = self._insights()
            missing = False
            for check in args:
                if check not in report["health"]["history"]["checks"]:
                    missing = True
                    break
            if not missing:
                return
            self.assertGreater(timeout, datetime.datetime.utcnow())
            time.sleep(0.25)

    def _wait_for_curr_health_cleared(self, check):
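        """Wait for a health check to clear from the current health checks"""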
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=15)
        while True:
            report = self._insights()
            if check not in report["health"]["current"]["checks"]:
                return
            self.assertGreater(timeout, datetime.datetime.utcnow())
            time.sleep(0.25)

    def test_health_history(self):
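        """Health checks are recorded in the history, deduplicated, persisted
        across a manager restart, and removed by pruning."""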
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # generate health check history entries. we want to avoid the edge case
        # of running these tests at _exactly_ the top of the hour so we can
        # explicitly control when hourly work occurs. for this we use the
        # current time offset to a half hour.
        now = datetime.datetime.utcnow()
        now = datetime.datetime(
            year=now.year,
            month=now.month,
            day=now.day,
            hour=now.hour,
            minute=30)

        check_names = set()
        for hours in [-18, -11, -5, -1, 0]:
            # change the insights module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past
            unique_check_name = "insights_health_check_{}".format(hours)
            health_check = {
                unique_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(unique_check_name)

            # also set a health check with the same name on every iteration
            # to test deduplication
            dupe_check_name = "insights_health_check"
            health_check = {
                dupe_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(dupe_check_name)

            # wait for the health checks to show up in the history report
            self._wait_for_health_history_checks(unique_check_name,
                                                 dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the restart.
        # we don't wait for them all to be present because "earlier" checks
        # may not have sat in memory long enough to be flushed.
        all_missing = True
        report = self._insights()
        for check in check_names:
            if check in report["health"]["history"]["checks"]:
                all_missing = False
                break
        self.assertFalse(all_missing)

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_insights_health(self):
        """The insights module reports health checks"""
        self._add_crash(1, True)  # add invalid crash data
        timeout = 10
        while timeout > 0:
            time.sleep(1)
            timeout -= 1
            # we should observe a health check because the module can't read
            # the invalid crash data created at the beginning of this test
            report = self._insights()
            if "MGR_INSIGHTS_WARNING" in report["health"]["current"]["checks"]:
                self._clear_crashes()
                return
        self._clear_crashes()
        self.fail("Insights module did not set health check")

    def test_schema(self):
        """TODO: assert conformance to a full schema specification?"""
        report = self._insights()
        for key in ["osd_metadata",
                    "pg_summary",
                    "mon_status",
                    "manager_map",
                    "service_map",
                    "mon_map",
                    "crush_map",
                    "fs_map",
                    "osd_tree",
                    "df",
                    "osd_dump",
                    "config",
                    "health",
                    "crashes",
                    "version",
                    "errors"]:
            self.assertIn(key, report)

    def test_crash_history(self):
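        """Posted crashes appear in the report's crash summary; invalid crash
        data is surfaced through the report's errors instead."""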
        self._clear_crashes()
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertFalse(report["errors"])

        # crashes show up in the report
        self._add_crash(1)
        report = self._insights()
        self.assertTrue(report["crashes"]["summary"])
        self.assertFalse(report["errors"])
        log.warning("{}".format(json.dumps(report["crashes"], indent=2)))

        # handling of a communication error with the crash module
        self._add_crash(1, True)
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertTrue(report["errors"])

        self._clear_crashes()