Merge pull request #38277 from b-ranto/wip-prom-fixes

mgr/prometheus: Make module more stable

Reviewed-by: Mykola Golub <mgolub@suse.com>
Reviewed-by: Sebastian Wagner <swagner@suse.com>
Reviewed-by: Kefu Chai <kchai@redhat.com>
This commit is contained in:
Kefu Chai 2020-11-28 16:15:30 +08:00 committed by GitHub
commit fd3c0f905b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -187,15 +187,25 @@ class MetricCollectionThread(threading.Thread):
def __init__(self, module):
# type: (Module) -> None
self.mod = module
self.active = True
self.event = threading.Event()
super(MetricCollectionThread, self).__init__(target=self.collect)
def collect(self):
self.mod.log.info('starting metric collection thread')
while True:
while self.active:
self.mod.log.debug('collecting cache in thread')
if self.mod.have_mon_connection():
start_time = time.time()
try:
data = self.mod.collect()
except Exception as e:
# Log any issues encountered during the data collection and continue
self.mod.log.exception("failed to collect metrics:")
self.event.wait(self.mod.scrape_interval)
continue
duration = time.time() - start_time
self.mod.log.debug('collecting cache in thread done')
@ -217,11 +227,14 @@ class MetricCollectionThread(threading.Thread):
self.mod.collect_cache = data
self.mod.collect_time = duration
time.sleep(sleep_time)
self.event.wait(sleep_time)
else:
self.mod.log.error('No MON connection')
time.sleep(self.mod.scrape_interval)
self.event.wait(self.mod.scrape_interval)
def stop(self):
self.active = False
self.event.set()
class Module(MgrModule):
COMMANDS = [
@ -273,7 +286,7 @@ class Module(MgrModule):
} # type: Dict[str, Any]
global _global_instance
_global_instance = self
MetricCollectionThread(_global_instance).start()
self.metrics_thread = MetricCollectionThread(_global_instance)
def _setup_static_metrics(self):
metrics = {}
@ -584,12 +597,10 @@ class Module(MgrModule):
all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']}
ceph_release = None
for mgr in all_mgrs:
host_version = servers.get((mgr, 'mgr'), ('', ''))
if mgr == active:
_state = 1
ceph_release = host_version[1].split()[-2] # e.g. nautilus
else:
_state = 0
@ -600,7 +611,7 @@ class Module(MgrModule):
self.metrics['mgr_status'].set(_state, (
'mgr.{}'.format(mgr),
))
always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
always_on_modules = mgr_map['always_on_modules'].get(self.release_name, [])
active_modules = list(always_on_modules)
active_modules.extend(mgr_map['modules'])
@ -1254,6 +1265,8 @@ class Module(MgrModule):
(server_addr, server_port)
)
self.metrics_thread.start()
# Publish the URI that others may use to access the service we're
# about to start serving
self.set_uri('http://{0}:{1}/'.format(
@ -1273,9 +1286,13 @@ class Module(MgrModule):
# wait for the shutdown event
self.shutdown_event.wait()
self.shutdown_event.clear()
# tell metrics collection thread to stop collecting new metrics
self.metrics_thread.stop()
cherrypy.engine.stop()
self.log.info('Engine stopped.')
self.shutdown_rbd_stats()
# wait for the metrics collection thread to stop
self.metrics_thread.join()
def shutdown(self):
self.log.info('Stopping engine...')