From e51be85c24d36cb3f50f98f7c401f352fd1cd7e4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Jul 2017 13:29:47 -0400 Subject: [PATCH 1/3] mgr: keep per-module checks, and report them back to the mon Signed-off-by: Sage Weil --- src/mgr/DaemonServer.cc | 3 +++ src/mgr/MgrPyModule.cc | 4 ++++ src/mgr/MgrPyModule.h | 9 +++++++++ src/mgr/PyModules.cc | 18 ++++++++++++++++++ src/mgr/PyModules.h | 5 +++++ 5 files changed, 39 insertions(+) diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index dc9837cac82..a58675ed24c 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -1169,6 +1169,8 @@ void DaemonServer::send_report() } auto m = new MMonMgrReport(); + py_modules.get_health_checks(&m->health_checks); + cluster_state.with_pgmap([&](const PGMap& pg_map) { cluster_state.update_delta_stats(); @@ -1191,6 +1193,7 @@ void DaemonServer::send_report() pg_map.get_health_checks(g_ceph_context, osdmap, &m->health_checks); + dout(10) << m->health_checks.checks.size() << " health checks" << dendl; dout(20) << "health checks:\n"; diff --git a/src/mgr/MgrPyModule.cc b/src/mgr/MgrPyModule.cc index fda9bf6528d..a2bf73ca379 100644 --- a/src/mgr/MgrPyModule.cc +++ b/src/mgr/MgrPyModule.cc @@ -365,3 +365,7 @@ int MgrPyModule::handle_command( return r; } +void MgrPyModule::get_health_checks(health_check_map_t *checks) +{ + checks->merge(health_checks); +} diff --git a/src/mgr/MgrPyModule.h b/src/mgr/MgrPyModule.h index 14be1566f06..6eea29eee24 100644 --- a/src/mgr/MgrPyModule.h +++ b/src/mgr/MgrPyModule.h @@ -21,6 +21,8 @@ #include "common/cmdparse.h" #include "common/LogEntry.h" +#include "common/Mutex.h" +#include "mon/health_check.h" #include #include @@ -47,6 +49,8 @@ private: PyThreadState *pMainThreadState; PyThreadState *pMyThreadState = nullptr; + health_check_map_t health_checks; + std::vector commands; int load_commands(); @@ -75,6 +79,11 @@ public: const cmdmap_t &cmdmap, std::stringstream *ds, std::stringstream *ss); + + void 
set_health_checks(health_check_map_t&& c) { + health_checks = std::move(c); + } + void get_health_checks(health_check_map_t *checks); }; std::string handle_pyerror(); diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index 74628dbbf1c..e95bec8d75d 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -800,3 +800,21 @@ void PyModules::list_modules(std::set *modules) { _list_modules(g_conf->mgr_module_path, modules); } + +void PyModules::set_health_checks(const std::string& handle, + health_check_map_t&& checks) +{ + Mutex::Locker l(lock); + auto p = modules.find(handle); + if (p != modules.end()) { + p->second->set_health_checks(std::move(checks)); + } +} + +void PyModules::get_health_checks(health_check_map_t *checks) +{ + Mutex::Locker l(lock); + for (auto& p : modules) { + p.second->get_health_checks(checks); + } +} diff --git a/src/mgr/PyModules.h b/src/mgr/PyModules.h index 1c6751f4e51..431abec76f0 100644 --- a/src/mgr/PyModules.h +++ b/src/mgr/PyModules.h @@ -30,6 +30,7 @@ #include "ClusterState.h" class ServeThread; +class health_check_map_t; class PyModules { @@ -115,6 +116,10 @@ public: void set_config(const std::string &handle, const std::string &key, const std::string &val); + void set_health_checks(const std::string& handle, + health_check_map_t&& checks); + void get_health_checks(health_check_map_t *checks); + void log(const std::string &handle, int level, const std::string &record); From ed57f8c7a4591f264a099c5205b23b09f9a0215a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 20 Jul 2017 23:53:18 -0400 Subject: [PATCH 2/3] mgr/PyState: add set_health_checks method Signed-off-by: Sage Weil --- src/mgr/PyState.cc | 103 +++++++++++++++++++++++++++++++++++ src/pybind/mgr/mgr_module.py | 24 ++++++++ 2 files changed, 127 insertions(+) diff --git a/src/mgr/PyState.cc b/src/mgr/PyState.cc index ae237b2e599..efde9587708 100644 --- a/src/mgr/PyState.cc +++ b/src/mgr/PyState.cc @@ -182,6 +182,107 @@ ceph_send_command(PyObject *self, PyObject 
*args) Py_RETURN_NONE; } +static PyObject* +ceph_set_health_checks(PyObject *self, PyObject *args) +{ + char *handle = nullptr; + PyObject *checks = NULL; + if (!PyArg_ParseTuple(args, "sO:ceph_set_health_checks", &handle, &checks)) { + return NULL; + } + if (!PyDict_Check(checks)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + PyObject *checksls = PyDict_Items(checks); + health_check_map_t out_checks; + for (int i = 0; i < PyList_Size(checksls); ++i) { + PyObject *kv = PyList_GET_ITEM(checksls, i); + char *check_name = nullptr; + PyObject *check_info = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) { + derr << __func__ << " dict item " << i + << " not a size 2 tuple" << dendl; + continue; + } + if (!PyDict_Check(check_info)) { + derr << __func__ << " item " << i << " " << check_name + << " value not a dict" << dendl; + continue; + } + health_status_t severity = HEALTH_OK; + string summary; + list detail; + PyObject *infols = PyDict_Items(check_info); + for (int j = 0; j < PyList_Size(infols); ++j) { + PyObject *pair = PyList_GET_ITEM(infols, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << i << " pair " << j + << " not a tuple" << dendl; + continue; + } + char *k = nullptr; + PyObject *v = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) { + derr << __func__ << " item " << i << " pair " << j + << " not a size 2 tuple" << dendl; + continue; + } + string ks(k); + if (ks == "severity") { + if (!PyString_Check(v)) { + derr << __func__ << " check " << check_name + << " severity value not string" << dendl; + continue; + } + string vs(PyString_AsString(v)); + if (vs == "warning") { + severity = HEALTH_WARN; + } else if (vs == "error") { + severity = HEALTH_ERR; + } + } else if (ks == "summary") { + if (!PyString_Check(v)) { + derr << __func__ << " check " << check_name + << " summary value not string" << dendl; + continue; + } + summary = PyString_AsString(v); + } else if (ks == 
"detail") { + if (!PyList_Check(v)) { + derr << __func__ << " check " << check_name + << " detail value not list" << dendl; + continue; + } + for (int k = 0; k < PyList_Size(v); ++k) { + PyObject *di = PyList_GET_ITEM(v, k); + if (!PyString_Check(di)) { + derr << __func__ << " check " << check_name + << " detail item " << k << " not a string" << dendl; + continue; + } + detail.push_back(PyString_AsString(di)); + } + } else { + derr << __func__ << " check " << check_name + << " unexpected key " << k << dendl; + } + } + auto& d = out_checks.add(check_name, severity, summary); + d.detail.swap(detail); + } + + JSONFormatter jf(true); + dout(10) << "module " << handle << " health checks:\n"; + out_checks.dump(&jf); + jf.flush(*_dout); + *_dout << dendl; + + global_handle->set_health_checks(handle, std::move(out_checks)); + + Py_RETURN_NONE; +} + static PyObject* ceph_state_get(PyObject *self, PyObject *args) @@ -359,6 +460,8 @@ PyMethodDef CephStateMethods[] = { "Get a service's status"}, {"send_command", ceph_send_command, METH_VARARGS, "Send a mon command"}, + {"set_health_checks", ceph_set_health_checks, METH_VARARGS, + "Set health checks for this module"}, {"get_mgr_id", ceph_get_mgr_id, METH_NOARGS, "Get the mgr id"}, {"get_config", ceph_config_get, METH_VARARGS, diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 49c7efe15ca..2463bafe75d 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -169,6 +169,30 @@ class MgrModule(object): """ ceph_state.send_command(self._handle, *args, **kwargs) + def set_health_checks(self, checks): + """ + Set module's health checks + + Set the module's current map of health checks. 
Argument is a + dict of check names to info, in this form: + + { + 'CHECK_FOO': { + 'severity': 'warning', # or 'error' + 'summary': 'summary string', + 'detail': [ 'list', 'of', 'detail', 'strings' ], + }, + 'CHECK_BAR': { + 'severity': 'error', + 'summary': 'bars are bad', + 'detail': [ 'too hard' ], + }, + } + + :param checks: dict of health check dicts + """ + ceph_state.set_health_checks(self._handle, checks) + def handle_command(self, cmd): """ Called by ceph-mgr to request the plugin to handle one From 76a35c17973ab339f17add69c2cc3b84a3b3f19b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Jul 2017 12:26:04 -0400 Subject: [PATCH 3/3] mon/PGMap: do not clear checks We may be one of many contributors to the checks. Signed-off-by: Sage Weil --- src/mon/PGMap.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 1b5ce757c1b..d29848a5a9f 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2562,8 +2562,6 @@ void PGMap::get_health_checks( const unsigned max = cct->_conf->mon_health_max_detail; const auto& pools = osdmap.get_pools(); - checks->clear(); - typedef enum pg_consequence_t { UNAVAILABLE = 1, // Client IO to the pool may block DEGRADED = 2, // Fewer than the requested number of replicas are present