mirror of
https://github.com/ceph/ceph
synced 2024-12-29 15:03:33 +00:00
mgr/devicehealth: extract+store wear level from metrics scraping
When we scrape and store health metrics for a device, extract the wear level from the JSON. If present, also store it in the config-key per-device metadata. Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
parent
8f93e3b553
commit
4840507cfc
@ -1084,6 +1084,36 @@ void ActivePyModules::set_uri(const std::string& module_name,
|
||||
modules.at(module_name)->set_uri(uri);
|
||||
}
|
||||
|
||||
void ActivePyModules::set_device_wear_level(const std::string& devid,
|
||||
float wear_level)
|
||||
{
|
||||
// update mgr state
|
||||
map<string,string> meta;
|
||||
daemon_state.with_device(
|
||||
devid,
|
||||
[wear_level, &meta] (DeviceState& dev) {
|
||||
dev.set_wear_level(wear_level);
|
||||
meta = dev.metadata;
|
||||
});
|
||||
|
||||
// tell mon
|
||||
json_spirit::Object json_object;
|
||||
for (auto& i : meta) {
|
||||
json_spirit::Config::add(json_object, i.first, i.second);
|
||||
}
|
||||
bufferlist json;
|
||||
json.append(json_spirit::write(json_object));
|
||||
const string cmd =
|
||||
"{"
|
||||
"\"prefix\": \"config-key set\", "
|
||||
"\"key\": \"device/" + devid + "\""
|
||||
"}";
|
||||
|
||||
Command set_cmd;
|
||||
set_cmd.run(&monc, cmd, json);
|
||||
set_cmd.wait();
|
||||
}
|
||||
|
||||
MetricQueryID ActivePyModules::add_osd_perf_query(
|
||||
const OSDPerfMetricQuery &query,
|
||||
const std::optional<OSDPerfMetricLimit> &limit)
|
||||
|
@ -151,6 +151,7 @@ public:
|
||||
void config_notify();
|
||||
|
||||
void set_uri(const std::string& module_name, const std::string &uri);
|
||||
void set_device_wear_level(const std::string& devid, float wear_level);
|
||||
|
||||
int handle_command(
|
||||
const ModuleCommand& module_command,
|
||||
|
@ -677,6 +677,23 @@ ceph_set_uri(BaseMgrModule *self, PyObject *args)
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
// Python-callable entry point registered as `_ceph_set_device_wear_level`.
// Expects (devid: str, wear_level: float) and passes both on to
// self->py_modules->set_device_wear_level(). Returns None; returns nullptr
// when the arguments cannot be parsed.
static PyObject*
ceph_set_wear_level(BaseMgrModule *self, PyObject *args)
{
  char *devid = nullptr;
  float wear_level;
  const int parsed = PyArg_ParseTuple(args, "sf:ceph_set_wear_level",
                                      &devid, &wear_level);
  if (parsed == 0) {
    return nullptr;
  }

  // The interpreter thread state is saved for the duration of the call and
  // restored before touching any Python object again.
  PyThreadState *saved_state = PyEval_SaveThread();
  self->py_modules->set_device_wear_level(devid, wear_level);
  PyEval_RestoreThread(saved_state);

  Py_RETURN_NONE;
}
|
||||
|
||||
static PyObject*
|
||||
ceph_have_mon_connection(BaseMgrModule *self, PyObject *args)
|
||||
{
|
||||
@ -1437,6 +1454,9 @@ PyMethodDef BaseMgrModule_methods[] = {
|
||||
{"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
|
||||
"Advertize a service URI served by this module"},
|
||||
|
||||
{"_ceph_set_device_wear_level", (PyCFunction)ceph_set_wear_level, METH_VARARGS,
|
||||
"Set device wear_level value"},
|
||||
|
||||
{"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection,
|
||||
METH_NOARGS, "Find out whether this mgr daemon currently has "
|
||||
"a connection to a monitor"},
|
||||
|
@ -35,6 +35,12 @@ public:
|
||||
&outbl, &outs, &cond);
|
||||
}
|
||||
|
||||
// Start the mon command `command` on `monc`, passing `inbl` as the command's
// input buffer. This object's outbl, outs and cond are handed to
// start_mon_command() alongside it. Nothing is waited on here; wait() is
// what blocks on cond.
void run(MonClient *monc, const std::string &command, const ceph::buffer::list &inbl)
|
||||
{
|
||||
monc->start_mon_command({command}, inbl,
|
||||
&outbl, &outs, &cond);
|
||||
}
|
||||
|
||||
virtual void wait()
|
||||
{
|
||||
r = cond.wait();
|
||||
|
@ -61,6 +61,7 @@ class BaseMgrModule(object):
|
||||
def _ceph_get_store(self, key):...
|
||||
def _ceph_get_osdmap(self):...
|
||||
def _ceph_set_uri(self, uri):...
|
||||
def _ceph_set_device_wear_level(self, devid, val):...
|
||||
def _ceph_have_mon_connection(self):...
|
||||
def _ceph_update_progress_event(self, evid, desc, progress, add_to_ceph_s):...
|
||||
def _ceph_complete_progress_event(self, evid):...
|
||||
|
@ -25,6 +25,29 @@ HEALTH_MESSAGES = {
|
||||
MAX_SAMPLES = 500
|
||||
|
||||
|
||||
def get_ata_wear_level(data: Dict[Any,Any]) -> Optional[float]:
    """
    Extract wear level (as float) from smartctl -x --json output for SATA SSD

    Looks through the "pages" of "ata_device_statistics" for page number 7
    and returns the "value" of the table entry whose "offset" is 8, divided
    by 100.  (Presumably the SSD "percentage used" endurance indicator --
    confirm against real smartctl output.)

    :param data: parsed smartctl JSON for one device
    :return: value / 100.0, or None when the entry is missing or incomplete
    """
    # `or` rather than a .get() default: a key that is present but null
    # would otherwise hand back None and blow up on the next access.
    stats = data.get("ata_device_statistics") or {}
    for page in stats.get("pages") or []:
        # Skip null/non-dict page entries instead of raising mid-scrape.
        if not isinstance(page, dict) or page.get("number") != 7:
            continue
        for item in page.get("table") or []:
            # Table rows without an "offset" (or that are not dicts at all)
            # cannot be the one we want; do not let them raise KeyError.
            if not isinstance(item, dict) or item.get("offset") != 8:
                continue
            value = item.get("value")
            if value is not None:
                return value / 100.0
    return None
|
||||
|
||||
|
||||
def get_nvme_wear_level(data: Dict[Any,Any]) -> Optional[float]:
    """
    Extract wear level (as float) from smartctl -x --json output for NVME SSD

    Reads "percentage_used" from "nvme_smart_health_information_log" and
    returns it divided by 100, or None when that field is absent.
    """
    health_log = data.get("nvme_smart_health_information_log", {})
    percentage_used = health_log.get("percentage_used")
    return None if percentage_used is None else percentage_used / 100.0
|
||||
|
||||
|
||||
class Module(MgrModule):
|
||||
MODULE_OPTIONS = [
|
||||
Option(
|
||||
@ -450,6 +473,22 @@ class Module(MgrModule):
|
||||
ioctx.remove_omap_keys(op, tuple(erase))
|
||||
ioctx.operate_write_op(op, devid)
|
||||
|
||||
# extract wear level?
# Try the ATA statistic first and fall back to the NVMe health log.
wear_level = get_ata_wear_level(data)
if wear_level is None:
    wear_level = get_nvme_wear_level(data)
dev_data = self.get(f"device {devid}") or {}
if wear_level is not None:
    # Look the stored value up by the "wear_level" key.  Using the float
    # itself as the key never matches anything, which makes the comparison
    # always true and pushes an update on every scrape.
    if dev_data.get("wear_level") != str(wear_level):
        dev_data["wear_level"] = str(wear_level)
        self.log.debug(f"updating {devid} wear level to {wear_level}")
        self.set_device_wear_level(devid, wear_level)
else:
    # No wear level in this scrape: drop any previously known value.
    if "wear_level" in dev_data:
        del dev_data["wear_level"]
        self.log.debug(f"removing {devid} wear level")
        # NOTE(review): -1.0 appears to be the "no wear level" marker for
        # the mgr side -- confirm against DeviceState::set_wear_level.
        self.set_device_wear_level(devid, -1.0)
|
||||
|
||||
def _get_device_metrics(self, devid: str,
|
||||
sample: Optional[str] = None,
|
||||
min_sample: Optional[str] = None) -> Dict[str, Dict[str, Any]]:
|
||||
|
@ -1611,6 +1611,9 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
|
||||
"""
|
||||
return self._ceph_set_uri(uri)
|
||||
|
||||
def set_device_wear_level(self, devid: str, wear_level: float) -> None:
    """
    Pass a device's wear level on to ``_ceph_set_device_wear_level``.

    :param devid: id of the device the value belongs to
    :param wear_level: wear level to record for that device
    """
    outcome = self._ceph_set_device_wear_level(devid, wear_level)
    return outcome
|
||||
|
||||
def have_mon_connection(self) -> bool:
|
||||
"""
|
||||
Check whether this ceph-mgr daemon has an open connection
|
||||
|
Loading…
Reference in New Issue
Block a user