mgr/devicehealth: extract+store wear level from metrics scraping

When we scrape and store health metrics for a device, extract the wear
level from the JSON.  If present, also store it in the config-key
per-device metadata.

Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
Sage Weil 2021-02-08 13:00:56 -06:00
parent 8f93e3b553
commit 4840507cfc
7 changed files with 100 additions and 0 deletions

View File

@ -1084,6 +1084,36 @@ void ActivePyModules::set_uri(const std::string& module_name,
modules.at(module_name)->set_uri(uri);
}
void ActivePyModules::set_device_wear_level(const std::string& devid,
float wear_level)
{
// update mgr state
map<string,string> meta;
daemon_state.with_device(
devid,
[wear_level, &meta] (DeviceState& dev) {
dev.set_wear_level(wear_level);
meta = dev.metadata;
});
// tell mon
json_spirit::Object json_object;
for (auto& i : meta) {
json_spirit::Config::add(json_object, i.first, i.second);
}
bufferlist json;
json.append(json_spirit::write(json_object));
const string cmd =
"{"
"\"prefix\": \"config-key set\", "
"\"key\": \"device/" + devid + "\""
"}";
Command set_cmd;
set_cmd.run(&monc, cmd, json);
set_cmd.wait();
}
MetricQueryID ActivePyModules::add_osd_perf_query(
const OSDPerfMetricQuery &query,
const std::optional<OSDPerfMetricLimit> &limit)

View File

@ -151,6 +151,7 @@ public:
void config_notify();
void set_uri(const std::string& module_name, const std::string &uri);
void set_device_wear_level(const std::string& devid, float wear_level);
int handle_command(
const ModuleCommand& module_command,

View File

@ -677,6 +677,23 @@ ceph_set_uri(BaseMgrModule *self, PyObject *args)
Py_RETURN_NONE;
}
// Python binding for _ceph_set_device_wear_level(devid, wear_level).
// Parses a (str, float) argument tuple and forwards to ActivePyModules,
// releasing the GIL around the call (it performs a mon round-trip).
static PyObject*
ceph_set_wear_level(BaseMgrModule *self, PyObject *args)
{
  char *device_id = nullptr;
  float level = 0.0f;
  if (!PyArg_ParseTuple(args, "sf:ceph_set_wear_level",
                        &device_id, &level)) {
    // PyArg_ParseTuple has already set the Python exception.
    return nullptr;
  }

  // Drop the GIL while we talk to the mon; re-acquire before returning.
  PyThreadState *saved = PyEval_SaveThread();
  self->py_modules->set_device_wear_level(device_id, level);
  PyEval_RestoreThread(saved);
  Py_RETURN_NONE;
}
static PyObject*
ceph_have_mon_connection(BaseMgrModule *self, PyObject *args)
{
@ -1437,6 +1454,9 @@ PyMethodDef BaseMgrModule_methods[] = {
{"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
"Advertize a service URI served by this module"},
{"_ceph_set_device_wear_level", (PyCFunction)ceph_set_wear_level, METH_VARARGS,
"Set device wear_level value"},
{"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection,
METH_NOARGS, "Find out whether this mgr daemon currently has "
"a connection to a monitor"},

View File

@ -35,6 +35,12 @@ public:
&outbl, &outs, &cond);
}
// Launch @command on the mon asynchronously, attaching @inbl as the
// command's input payload (e.g. a JSON blob for "config-key set").
// Completion is signaled through the internal condition: call wait()
// to block; the reply lands in outbl/outs and the return code in r.
void run(MonClient *monc, const std::string &command, const ceph::buffer::list &inbl)
{
monc->start_mon_command({command}, inbl,
&outbl, &outs, &cond);
}
virtual void wait()
{
r = cond.wait();

View File

@ -61,6 +61,7 @@ class BaseMgrModule(object):
def _ceph_get_store(self, key):...
def _ceph_get_osdmap(self):...
def _ceph_set_uri(self, uri):...
def _ceph_set_device_wear_level(self, devid, val):...
def _ceph_have_mon_connection(self):...
def _ceph_update_progress_event(self, evid, desc, progress, add_to_ceph_s):...
def _ceph_complete_progress_event(self, evid):...

View File

@ -25,6 +25,29 @@ HEALTH_MESSAGES = {
MAX_SAMPLES = 500
def get_ata_wear_level(data: Dict[Any,Any]) -> Optional[float]:
    """
    Extract wear level (as float) from smartctl -x --json output for SATA SSD

    Scans the ATA device statistics for page 7 (solid state device
    statistics) and returns the value at offset 8 (percentage used
    endurance indicator) scaled to a 0.0-1.0 fraction, or None if the
    drive does not report it.
    """
    for page in data.get("ata_device_statistics", {}).get("pages", []):
        if page.get("number") != 7:
            continue
        for item in page.get("table", []):
            # Use .get()/membership tests so a malformed table entry
            # (missing "offset" or "value") is skipped instead of
            # raising KeyError and aborting the whole scrape.
            if item.get("offset") == 8 and "value" in item:
                return item["value"] / 100.0
    return None
def get_nvme_wear_level(data: Dict[Any,Any]) -> Optional[float]:
    """
    Extract wear level (as float) from smartctl -x --json output for NVME SSD

    Reads the "percentage_used" field of the NVMe SMART health log and
    scales it to a 0.0-1.0 fraction; returns None when the field is
    absent from the output.
    """
    health_log = data.get("nvme_smart_health_information_log", {})
    pct = health_log.get("percentage_used")
    return None if pct is None else pct / 100.0
class Module(MgrModule):
MODULE_OPTIONS = [
Option(
@ -450,6 +473,22 @@ class Module(MgrModule):
ioctx.remove_omap_keys(op, tuple(erase))
ioctx.operate_write_op(op, devid)
# Extract the wear level from the freshly-scraped smartctl JSON (try the
# SATA statistics page first, then the NVMe health log) and sync it into
# the per-device metadata if it changed.
wear_level = get_ata_wear_level(data)
if wear_level is None:
    wear_level = get_nvme_wear_level(data)
dev_data = self.get(f"device {devid}") or {}
if wear_level is not None:
    # BUGFIX: look up the "wear_level" key by name.  The original code
    # did dev_data.get(wear_level) -- passing the float value itself as
    # the key -- so the comparison never matched the cached value and
    # the mon was re-notified on every scrape.
    if dev_data.get("wear_level") != str(wear_level):
        dev_data["wear_level"] = str(wear_level)
        self.log.debug(f"updating {devid} wear level to {wear_level}")
        self.set_device_wear_level(devid, wear_level)
else:
    if "wear_level" in dev_data:
        del dev_data["wear_level"]
        self.log.debug(f"removing {devid} wear level")
        # a negative value tells the mgr/mon side to clear the field
        self.set_device_wear_level(devid, -1.0)
def _get_device_metrics(self, devid: str,
sample: Optional[str] = None,
min_sample: Optional[str] = None) -> Dict[str, Dict[str, Any]]:

View File

@ -1611,6 +1611,9 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
"""
return self._ceph_set_uri(uri)
def set_device_wear_level(self, devid: str, wear_level: float) -> None:
    """
    Set the wear level for a device.

    :param devid: device id
    :param wear_level: wear level as a 0.0-1.0 fraction; a negative
        value clears the stored wear level.

    Forwards to the C++ side, which updates the cached device state
    and persists the device metadata via the mon config-key store.
    """
    return self._ceph_set_device_wear_level(devid, wear_level)
def have_mon_connection(self) -> bool:
"""
Check whether this ceph-mgr daemon has an open connection