ceph/qa/tasks/mgr/test_insights.py

import logging
import json
import datetime
import time
from mgr_test_case import MgrTestCase

log = logging.getLogger(__name__)
UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

class TestInsights(MgrTestCase):
    """Integration tests for the ceph-mgr ``insights`` module.

    Every test talks to a running cluster through ``self.mgr_cluster`` and
    inspects the JSON report returned by the ``insights`` command.  Crash
    reports posted by a test are remembered in ``self.crash_ids`` so that
    they can be removed again (``tearDown`` always attempts this).
    """

    # limits used by the polling helpers while waiting for the insights
    # report to reflect a change
    _POLL_TIMEOUT_SECS = 15
    _POLL_INTERVAL_SECS = 0.25

    def setUp(self):
        """Set up the managers, load the modules under test and reset the
        list of crash ids posted by the current test."""
        self.setup_mgrs()
        self._load_module("insights")
        self._load_module("selftest")
        self.crash_ids = []

    def tearDown(self):
        """Remove every crash report that the test posted."""
        self._clear_crashes()

    def _insights(self):
        """Run the ``insights`` command and return its output parsed as JSON."""
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
        return json.loads(retstr)

    def _add_crash(self, hours, make_invalid = False):
        """Post a synthetic crash report through ``crash post``.

        :param hours: age of the crash, in hours before the current UTC time
        :param make_invalid: when true, the report's ``timestamp`` field is
            replaced by a string that is not a timestamp (the crash id still
            carries the real one)
        """
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(hours = hours)
        timestamp = timestamp.strftime(DATEFMT) + 'Z'
        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
        crash = {
            'crash_id': crash_id,
            'timestamp': timestamp,
        }
        if make_invalid:
            crash["timestamp"] = "not a timestamp"

        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=json.dumps(crash)
        )
        # remember the id before asserting so that cleanup is still attempted
        # when the post command reports a failure
        self.crash_ids.append(crash_id)
        self.assertEqual(0, ret)

    def _clear_crashes(self):
        """Issue ``crash rm`` for every crash id recorded by ``_add_crash``.

        The command results are ignored, so calling this repeatedly is safe.
        """
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )

    def _poll_insights(self, is_done):
        """Re-fetch the insights report until ``is_done(report)`` is true.

        The report is fetched at least once.  The test fails (through
        ``assertGreater``) if the predicate is still false once the polling
        deadline has passed.
        """
        deadline = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds = self._POLL_TIMEOUT_SECS)
        while True:
            if is_done(self._insights()):
                return
            self.assertGreater(deadline,
                    datetime.datetime.utcnow())
            time.sleep(self._POLL_INTERVAL_SECS)

    def _wait_for_health_history_checks(self, *args):
        """Wait for a set of health checks to appear in the health history"""
        def all_present(report):
            history = report["health"]["history"]["checks"]
            return all(check in history for check in args)

        self._poll_insights(all_present)

    def _wait_for_curr_health_cleared(self, check):
        """Wait for a health check to disappear from the current health"""
        self._poll_insights(
            lambda report: check not in report["health"]["current"]["checks"])

    def _set_health_check(self, name):
        """Raise a warning-level health check called ``name`` through the
        selftest module's ``health set`` command."""
        health_check = {
            name: {
                "severity": "warning",
                "summary": "summary",
                "detail": ["detail"]
            }
        }
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "mgr", "self-test", "health", "set",
            json.dumps(health_check))

    def test_health_history(self):
        """Health checks are recorded in the history and can be pruned"""
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # the same check name is raised in every iteration to test
        # deduplication
        dupe_check_name = "insights_health_check"

        check_names = set()
        for hours in [-18, -11, -5, -1, 0]:
            # change the insight module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past
            unique_check_name = "insights_health_check_{}".format(hours)
            self._set_health_check(unique_check_name)
            check_names.add(unique_check_name)

            # and also set the same health check to test deduplication
            self._set_health_check(dupe_check_name)
            check_names.add(dupe_check_name)

            # wait for the health check to show up in the history report
            self._wait_for_health_history_checks(unique_check_name, dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the restart.
        # we don't require them all to be present because "earlier" checks may
        # not have sat in memory long enough to be flushed.
        report = self._insights()
        history = report["health"]["history"]["checks"]
        self.assertTrue(any(check in history for check in check_names))

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_insights_health(self):
        """The insights module reports health checks"""
        self._add_crash(1, True) # add invalid crash data
        try:
            # poll once per second, for at most ten attempts
            for _ in range(10):
                time.sleep(1)
                # should observe a health check because it can't read the
                # invalid crash data created at the beginning of this test
                report = self._insights()
                if "MGR_INSIGHTS_WARNING" in report["health"]["current"]["checks"]:
                    return
            self.fail("Insights module did not set health check")
        finally:
            # the invalid crash is removed on success and on failure alike
            self._clear_crashes()

    def test_schema(self):
        """The report contains all of the expected top-level keys

        TODO: assert conformance to a full schema specification?
        """
        report = self._insights()
        for key in ["osd_metadata",
                    "pg_summary",
                    "mon_status",
                    "manager_map",
                    "service_map",
                    "mon_map",
                    "crush_map",
                    "fs_map",
                    "osd_tree",
                    "df",
                    "osd_dump",
                    "config",
                    "health",
                    "crashes",
                    "version",
                    "errors"]:
            self.assertIn(key, report)

    def test_crash_history(self):
        """Crashes appear in the report; invalid crash data yields errors"""
        self._clear_crashes()
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertFalse(report["errors"])

        # crashes show up in the report
        self._add_crash(1)
        report = self._insights()
        self.assertTrue(report["crashes"]["summary"])
        self.assertFalse(report["errors"])
        log.warning("%s", json.dumps(report["crashes"], indent=2))

        # handling of comm. error with crash module
        self._add_crash(1, True)
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertTrue(report["errors"])

        self._clear_crashes()
qa/tasks/mgr: add tests for insights module Signed-off-by: Noah Watkins <nwatkins@redhat.com> 2018-08-16 18:24:01 +00:00			`import logging`
			`import json`
			`import datetime`
			`import time`
			`from mgr_test_case import MgrTestCase`

			`log = logging.getLogger(__name__)`
			`UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'`
			`DATEFMT = '%Y-%m-%d %H:%M:%S.%f'`

			`class TestInsights(MgrTestCase):`
			`def setUp(self):`
			`self.setup_mgrs()`
			`self._load_module("insights")`
			`self._load_module("selftest")`
			`self.crash_ids = []`

			`def tearDown(self):`
			`self._clear_crashes()`

			`def _insights(self):`
			`retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")`
			`return json.loads(retstr)`

			`def _add_crash(self, hours, make_invalid = False):`
			`now = datetime.datetime.utcnow()`
			`timestamp = now - datetime.timedelta(hours = hours)`
			`timestamp = timestamp.strftime(DATEFMT) + 'Z'`
			`crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')`
			`crash = {`
			`'crash_id': crash_id,`
			`'timestamp': timestamp,`
			`}`
			`if make_invalid:`
			`crash["timestamp"] = "not a timestamp"`

			`ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`'crash', 'post', '-i', '-',`
			`stdin=json.dumps(crash)`
			`)`
			`self.crash_ids.append(crash_id)`
			`self.assertEqual(0, ret)`

			`def _clear_crashes(self):`
			`for crash_id in self.crash_ids:`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`'crash', 'rm', crash_id`
			`)`

			`def _wait_for_health_history_checks(self, *args):`
			`"""Wait for a set of health checks to appear in the health history"""`
			`timeout = datetime.datetime.utcnow() + \`
			`datetime.timedelta(seconds = 15)`
			`while True:`
			`report = self._insights()`
			`missing = False`
			`for check in args:`
			`if check not in report["health"]["history"]["checks"]:`
			`missing = True`
			`break`
			`if not missing:`
			`return`
			`self.assertGreater(timeout,`
			`datetime.datetime.utcnow())`
			`time.sleep(0.25)`

			`def _wait_for_curr_health_cleared(self, check):`
			`timeout = datetime.datetime.utcnow() + \`
			`datetime.timedelta(seconds = 15)`
			`while True:`
			`report = self._insights()`
			`if check not in report["health"]["current"]["checks"]:`
			`return`
			`self.assertGreater(timeout,`
			`datetime.datetime.utcnow())`
			`time.sleep(0.25)`

			`def test_health_history(self):`
			`# use empty health history as starting point`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"insights", "prune-health", "0")`
			`report = self._insights()`
			`self.assertFalse(report["health"]["history"]["checks"])`

			`# generate health check history entries. we want to avoid the edge case`
			`# of running these tests at _exactly_ the top of the hour so we can`
			`# explicitly control when hourly work occurs. for this we use the`
			`# current time offset to a half hour.`
			`now = datetime.datetime.utcnow()`
			`now = datetime.datetime(`
			`year = now.year,`
			`month = now.month,`
			`day = now.day,`
			`hour = now.hour,`
			`minute = 30)`

			`check_names = set()`
			`for hours in [-18, -11, -5, -1, 0]:`
			`# change the insight module's perception of "now" ...`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"mgr", "self-test", "insights_set_now_offset", str(hours))`

			`# ... to simulate health check arrivals in the past`
			`unique_check_name = "insights_health_check_{}".format(hours)`
			`health_check = {`
			`unique_check_name: {`
			`"severity": "warning",`
			`"summary": "summary",`
			`"detail": ["detail"]`
			`}`
			`}`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"mgr", "self-test", "health", "set",`
			`json.dumps(health_check))`

			`check_names.add(unique_check_name)`

			`# and also set the same health check to test deduplication`
			`dupe_check_name = "insights_health_check".format(hours)`
			`health_check = {`
			`dupe_check_name: {`
			`"severity": "warning",`
			`"summary": "summary",`
			`"detail": ["detail"]`
			`}`
			`}`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"mgr", "self-test", "health", "set",`
			`json.dumps(health_check))`

			`check_names.add(dupe_check_name)`

			`# wait for the health check to show up in the history report`
			`self._wait_for_health_history_checks(unique_check_name, dupe_check_name)`

			`# clear out the current health checks before moving on`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"mgr", "self-test", "health", "clear")`
			`self._wait_for_curr_health_cleared(unique_check_name)`

			`report = self._insights()`
			`for check in check_names:`
			`self.assertIn(check, report["health"]["history"]["checks"])`

			`# restart the manager`
			`active_id = self.mgr_cluster.get_active_id()`
			`self.mgr_cluster.mgr_restart(active_id)`

			`# ensure that at least one of the checks is present after the restart.`
			`# we don't require them all to be present because "earlier" checks may not`
			`# have sat in memory long enough to be flushed.`
			`all_missing = True`
			`report = self._insights()`
			`for check in check_names:`
			`if check in report["health"]["history"]["checks"]:`
			`all_missing = False`
			`break`
			`self.assertFalse(all_missing)`

			`# pruning really removes history`
			`self.mgr_cluster.mon_manager.raw_cluster_cmd_result(`
			`"insights", "prune-health", "0")`
			`report = self._insights()`
			`self.assertFalse(report["health"]["history"]["checks"])`

			`def test_insights_health(self):`
			`"""The insights module reports health checks"""`
			`self._add_crash(1, True) # add invalid crash data`
			`timeout = 10`
			`while timeout > 0:`
			`time.sleep(1)`
			`timeout -= 1`
			`# should observe a health check because it can't read the invalid`
			`# crash data created at the beginning of this test`
			`report = self._insights()`
			`if "MGR_INSIGHTS_WARNING" in report["health"]["current"]["checks"]:`
			`self._clear_crashes()`
			`return`
			`self._clear_crashes()`
			`self.fail("Insights module did not set health check")`
			`pass`

			`def test_schema(self):`
			`"""TODO: assert conformance to a full schema specification?"""`
			`report = self._insights()`
			`for key in ["osd_metadata",`
			`"pg_summary",`
			`"mon_status",`
			`"manager_map",`
			`"service_map",`
			`"mon_map",`
			`"crush_map",`
			`"fs_map",`
			`"osd_tree",`
			`"df",`
			`"osd_dump",`
			`"config",`
			`"health",`
			`"crashes",`
			`"version",`
			`"errors"]:`
			`self.assertIn(key, report)`

			`def test_crash_history(self):`
			`self._clear_crashes()`
			`report = self._insights()`
			`self.assertFalse(report["crashes"]["summary"])`
			`self.assertFalse(report["errors"])`

			`# crashes show up in the report`
			`self._add_crash(1)`
			`report = self._insights()`
			`self.assertTrue(report["crashes"]["summary"])`
			`self.assertFalse(report["errors"])`
			`log.warning("{}".format(json.dumps(report["crashes"], indent=2)))`

			`# handling of comm. error with crash module`
			`self._add_crash(1, True)`
			`report = self._insights()`
			`self.assertFalse(report["crashes"]["summary"])`
			`self.assertTrue(report["errors"])`

			`self._clear_crashes()`