mirror of
https://github.com/ceph/ceph
synced 2025-02-24 03:27:10 +00:00
Merge pull request #38277 from b-ranto/wip-prom-fixes
mgr/prometheus: Make module more stable Reviewed-by: Mykola Golub <mgolub@suse.com> Reviewed-by: Sebastian Wagner <swagner@suse.com> Reviewed-by: Kefu Chai <kchai@redhat.com>
This commit is contained in:
commit
fd3c0f905b
@ -187,15 +187,25 @@ class MetricCollectionThread(threading.Thread):
|
||||
def __init__(self, module):
|
||||
# type: (Module) -> None
|
||||
self.mod = module
|
||||
self.active = True
|
||||
self.event = threading.Event()
|
||||
super(MetricCollectionThread, self).__init__(target=self.collect)
|
||||
|
||||
def collect(self):
|
||||
self.mod.log.info('starting metric collection thread')
|
||||
while True:
|
||||
while self.active:
|
||||
self.mod.log.debug('collecting cache in thread')
|
||||
if self.mod.have_mon_connection():
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
data = self.mod.collect()
|
||||
except Exception as e:
|
||||
# Log any issues encountered during the data collection and continue
|
||||
self.mod.log.exception("failed to collect metrics:")
|
||||
self.event.wait(self.mod.scrape_interval)
|
||||
continue
|
||||
|
||||
duration = time.time() - start_time
|
||||
self.mod.log.debug('collecting cache in thread done')
|
||||
|
||||
@ -217,11 +227,14 @@ class MetricCollectionThread(threading.Thread):
|
||||
self.mod.collect_cache = data
|
||||
self.mod.collect_time = duration
|
||||
|
||||
time.sleep(sleep_time)
|
||||
self.event.wait(sleep_time)
|
||||
else:
|
||||
self.mod.log.error('No MON connection')
|
||||
time.sleep(self.mod.scrape_interval)
|
||||
self.event.wait(self.mod.scrape_interval)
|
||||
|
||||
def stop(self):
|
||||
self.active = False
|
||||
self.event.set()
|
||||
|
||||
class Module(MgrModule):
|
||||
COMMANDS = [
|
||||
@ -273,7 +286,7 @@ class Module(MgrModule):
|
||||
} # type: Dict[str, Any]
|
||||
global _global_instance
|
||||
_global_instance = self
|
||||
MetricCollectionThread(_global_instance).start()
|
||||
self.metrics_thread = MetricCollectionThread(_global_instance)
|
||||
|
||||
def _setup_static_metrics(self):
|
||||
metrics = {}
|
||||
@ -584,12 +597,10 @@ class Module(MgrModule):
|
||||
|
||||
all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']}
|
||||
|
||||
ceph_release = None
|
||||
for mgr in all_mgrs:
|
||||
host_version = servers.get((mgr, 'mgr'), ('', ''))
|
||||
if mgr == active:
|
||||
_state = 1
|
||||
ceph_release = host_version[1].split()[-2] # e.g. nautilus
|
||||
else:
|
||||
_state = 0
|
||||
|
||||
@ -600,7 +611,7 @@ class Module(MgrModule):
|
||||
self.metrics['mgr_status'].set(_state, (
|
||||
'mgr.{}'.format(mgr),
|
||||
))
|
||||
always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
|
||||
always_on_modules = mgr_map['always_on_modules'].get(self.release_name, [])
|
||||
active_modules = list(always_on_modules)
|
||||
active_modules.extend(mgr_map['modules'])
|
||||
|
||||
@ -1254,6 +1265,8 @@ class Module(MgrModule):
|
||||
(server_addr, server_port)
|
||||
)
|
||||
|
||||
self.metrics_thread.start()
|
||||
|
||||
# Publish the URI that others may use to access the service we're
|
||||
# about to start serving
|
||||
self.set_uri('http://{0}:{1}/'.format(
|
||||
@ -1273,9 +1286,13 @@ class Module(MgrModule):
|
||||
# wait for the shutdown event
|
||||
self.shutdown_event.wait()
|
||||
self.shutdown_event.clear()
|
||||
# tell metrics collection thread to stop collecting new metrics
|
||||
self.metrics_thread.stop()
|
||||
cherrypy.engine.stop()
|
||||
self.log.info('Engine stopped.')
|
||||
self.shutdown_rbd_stats()
|
||||
# wait for the metrics collection thread to stop
|
||||
self.metrics_thread.join()
|
||||
|
||||
def shutdown(self):
|
||||
self.log.info('Stopping engine...')
|
||||
|
Loading…
Reference in New Issue
Block a user