# ceph/qa/tasks/mgr/test_insights.py

import logging
import json
import datetime
import time
from mgr_test_case import MgrTestCase

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Fixed UUID appended to the timestamp when building synthetic crash IDs.
UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
# strftime format used for synthetic crash timestamps (a 'Z' suffix is
# appended separately by the caller).
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

class TestInsights(MgrTestCase):
    def setUp(self):
        """Prepare the mgr daemons, load the required mgr modules, and start
        with an empty list of posted crash IDs."""
        self.setup_mgrs()
        # "insights" is the module under test; "selftest" presumably provides
        # the `mgr self-test ...` commands the tests below rely on.
        for module_name in ("insights", "selftest"):
            self._load_module(module_name)
        # IDs of crashes posted via _add_crash, consumed by _clear_crashes
        self.crash_ids = []

    def tearDown(self):
        """Remove any crash reports this test posted via _add_crash."""
        self._clear_crashes()

    def _insights(self):
        """Run the `insights` cluster command and return its output parsed
        from JSON."""
        return json.loads(
            self.mgr_cluster.mon_manager.raw_cluster_cmd("insights"))

    def _add_crash(self, hours, make_invalid=False):
        """Post a synthetic crash report dated `hours` hours before now (UTC).

        The crash ID is the formatted timestamp joined to the fixed UUID with
        spaces replaced by underscores. When `make_invalid` is true the
        report's timestamp field is replaced by a string that is not a
        timestamp, while the crash ID still uses the real one.

        The ID is recorded in self.crash_ids before the command's exit status
        is asserted to be 0, so _clear_crashes attempts removal either way.
        """
        when = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        stamp = when.strftime(DATEFMT) + 'Z'
        crash_id = '{}_{}'.format(stamp, UUID).replace(' ', '_')

        payload = json.dumps({
            'crash_id': crash_id,
            'timestamp': "not a timestamp" if make_invalid else stamp,
        })

        # the crash metadata is fed to `crash post` on stdin
        status = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=payload
        )
        self.crash_ids.append(crash_id)
        self.assertEqual(0, status)

    def _clear_crashes(self):
        """Remove every crash report recorded in self.crash_ids.

        Removal is best-effort: the exit status of each `crash rm` is ignored.
        Once all IDs have been processed the list is reset, so calling this
        again (e.g. at the end of a test and then from tearDown) does not
        re-issue `crash rm` for IDs that were already removed.
        """
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )
        # forget the IDs only after the loop, so an exception part-way through
        # leaves them available for a later cleanup attempt
        self.crash_ids = []

    def _wait_for_health_history_checks(self, *args):
        """Block until every check name in `args` is in the health history.

        The insights report is polled every 0.25 seconds. If any name is
        still missing once 15 seconds have elapsed, the assertion below fails
        the test.
        """
        deadline = datetime.datetime.utcnow() + datetime.timedelta(seconds=15)
        while True:
            report = self._insights()
            if all(name in report["health"]["history"]["checks"]
                   for name in args):
                return
            # fail the test once the deadline is no longer in the future
            self.assertGreater(deadline, datetime.datetime.utcnow())
            time.sleep(0.25)

    def _wait_for_curr_health_cleared(self, check):
        """Block until `check` is absent from the current health checks.

        The insights report is polled every 0.25 seconds. If the check is
        still present once 15 seconds have elapsed, the assertion below fails
        the test.
        """
        deadline = datetime.datetime.utcnow() + datetime.timedelta(seconds=15)
        while check in self._insights()["health"]["current"]["checks"]:
            # fail the test once the deadline is no longer in the future
            self.assertGreater(deadline, datetime.datetime.utcnow())
            time.sleep(0.25)

    def test_health_history(self):
        """Health checks are recorded in the history, at least partly survive
        a mgr restart, and are removed by pruning."""
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # generate health check history entries by shifting the insights
        # module's perception of "now" by a whole number of hours before
        # raising each check.
        # NOTE(review): this test used to build a half-past-the-hour timestamp
        # here to sidestep top-of-the-hour edge cases, but the value was never
        # used; only the hour offsets below are sent to the module.
        check_names = set()
        for hours in [-18, -11, -5, -1, 0]:
            # change the insight module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past. the first
            # name is unique per offset; the second is deliberately the same
            # on every iteration to test deduplication.
            unique_check_name = "insights_health_check_{}".format(hours)
            dupe_check_name = "insights_health_check"

            for check_name in (unique_check_name, dupe_check_name):
                health_check = {
                    check_name: {
                        "severity": "warning",
                        "summary": "summary",
                        "detail": ["detail"]
                    }
                }
                self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                    "mgr", "self-test", "health", "set",
                    json.dumps(health_check))
                check_names.add(check_name)

            # wait for the health checks to show up in the history report
            self._wait_for_health_history_checks(unique_check_name,
                                                 dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the restart.
        # we don't require them all to be present because "earlier" checks may
        # not have sat in memory long enough to be flushed.
        report = self._insights()
        history = report["health"]["history"]["checks"]
        self.assertTrue(
            any(check in history for check in check_names),
            "no health checks survived the mgr restart")

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_insights_health(self):
        """The insights module reports health checks

        Posts a crash whose timestamp is invalid and polls once per second,
        up to 10 times, for MGR_INSIGHTS_WARNING to appear in the current
        health checks. The posted crash is removed on every exit path.
        """
        self._add_crash(1, True) # add invalid crash data
        try:
            for _ in range(10):
                time.sleep(1)
                # should observe a health check because it can't read the
                # invalid crash data created at the beginning of this test
                report = self._insights()
                if "MGR_INSIGHTS_WARNING" in report["health"]["current"]["checks"]:
                    return
            self.fail("Insights module did not set health check")
        finally:
            # runs on success, on failure, and if polling itself raises
            self._clear_crashes()

    def test_schema(self):
        """The insights report contains every expected top-level key.

        TODO: assert conformance to a full schema specification?
        """
        expected_keys = (
            "osd_metadata",
            "pg_summary",
            "mon_status",
            "manager_map",
            "service_map",
            "mon_map",
            "crush_map",
            "fs_map",
            "osd_tree",
            "df",
            "osd_dump",
            "config",
            "health",
            "crashes",
            "version",
            "errors",
        )
        report = self._insights()
        for expected in expected_keys:
            self.assertIn(expected, report)

    def test_crash_history(self):
        """A posted crash shows up in the report's crash summary; a crash with
        an invalid timestamp empties the summary and populates the errors."""
        # remove any crashes recorded so far; expect an empty, error-free report
        self._clear_crashes()
        baseline = self._insights()
        self.assertFalse(baseline["crashes"]["summary"])
        self.assertFalse(baseline["errors"])

        # crashes show up in the report
        self._add_crash(1)
        with_crash = self._insights()
        self.assertTrue(with_crash["crashes"]["summary"])
        self.assertFalse(with_crash["errors"])
        crashes_json = json.dumps(with_crash["crashes"], indent=2)
        log.warning("{}".format(crashes_json))

        # handling of comm. error with crash module
        self._add_crash(1, True)
        with_bad_crash = self._insights()
        self.assertFalse(with_bad_crash["crashes"]["summary"])
        self.assertTrue(with_bad_crash["errors"])

        self._clear_crashes()