mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
mgr/telemetry: collect what we can from heap stats, mempools, and osd histograms
If we run into a problem collecting heap stats, mempools, or osd histograms from a particular osd (e.g., the osd is down), we should continue to collect what we can from the other osds rather than exiting and returning an empty JSON object. Some log messages are also refined.

Fixes: https://tracker.ceph.com/issues/53985
Signed-off-by: Laura Flores <lflores@redhat.com>
This commit is contained in:
parent
a9b3e2f801
commit
c617b78f7b
@ -414,23 +414,28 @@ class Module(MgrModule):
|
||||
else:
|
||||
if 'tcmalloc heap stats' in outs:
|
||||
values = [int(i) for i in outs.split() if i.isdigit()]
|
||||
# `categories` must be ordered this way for the correct output to be parsed
|
||||
categories = ['use_by_application',
|
||||
'page_heap_freelist',
|
||||
'central_cache_freelist',
|
||||
'transfer_cache_freelist',
|
||||
'thread_cache_freelists',
|
||||
'malloc_metadata',
|
||||
'actual_memory_used',
|
||||
'released_to_os',
|
||||
'virtual_address_space_used',
|
||||
'spans_in_use',
|
||||
'thread_heaps_in_use',
|
||||
'tcmalloc_page_size']
|
||||
if len(values) != 12:
|
||||
self.log.debug('Received unexpected output: | outs: {} ' \
|
||||
'| values: {} |'.format(outs, values))
|
||||
return {}
|
||||
|
||||
categories = ['use_by_application', 'page_heap_freelist',
|
||||
'central_cache_freelist', 'transfer_cache_freelist',
|
||||
'thread_cache_freelists', 'malloc_metadata',
|
||||
'actual_memory_used', 'released_to_os',
|
||||
'virtual_address_space_used', 'spans_in_use',
|
||||
'thread_heaps_in_use', 'tcmalloc_page_size']
|
||||
|
||||
self.log.debug('Received unexpected output from osd.{}; number of values should match the number of expected categories:\n' \
|
||||
'values: len={} {} ~ categories: len={} {} ~ outs: {}'.format(osd_id, len(values), values, len(categories), categories, outs))
|
||||
continue
|
||||
osd = 'osd.' + str(osd_id)
|
||||
result[osd] = dict(zip(categories, values))
|
||||
else:
|
||||
self.log.debug('No heap stats available: {}'.format(outs))
|
||||
return {}
|
||||
self.log.debug('No heap stats available on osd.{}: {}'.format(osd_id, outs))
|
||||
continue
|
||||
|
||||
return result
|
||||
|
||||
@ -465,8 +470,8 @@ class Module(MgrModule):
|
||||
else:
|
||||
self.log.debug("Incorrect mode specified in get_mempool")
|
||||
except (json.decoder.JSONDecodeError, KeyError) as e:
|
||||
self.log.debug("Error caught: {}".format(e))
|
||||
return {}
|
||||
self.log.debug("Error caught on osd.{}: {}".format(osd_id, e))
|
||||
continue
|
||||
|
||||
return result
|
||||
|
||||
@ -576,10 +581,10 @@ class Module(MgrModule):
|
||||
# I am also putting in a catch for a KeyError since it could
|
||||
# happen where the code is assuming that a key exists in the
|
||||
# schema when it doesn't. In either case, we'll handle that
|
||||
# by returning an empty dict.
|
||||
# by continuing and collecting what we can from other osds.
|
||||
except (json.decoder.JSONDecodeError, KeyError) as e:
|
||||
self.log.debug("Error caught: {}".format(e))
|
||||
return list()
|
||||
self.log.debug("Error caught on osd.{}: {}".format(osd_id, e))
|
||||
continue
|
||||
|
||||
return list(result.values())
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user