1
0
mirror of https://github.com/ceph/ceph synced 2025-03-25 11:48:05 +00:00

mgr/alerts: simple module to send health alerts

Initially, only SMTP support is implemented; nothing else.

This is just smart enough for me to get emails from my home cluster when
something goes wrong.  No bells and whistles at this point.

Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2019-10-06 10:45:57 -05:00
parent bbc7bb5a22
commit d7223938f8
6 changed files with 302 additions and 0 deletions

View File

@ -1609,6 +1609,7 @@ fi
%files mgr
%{_bindir}/ceph-mgr
%dir %{_datadir}/ceph/mgr
%{_datadir}/ceph/mgr/alerts
%{_datadir}/ceph/mgr/ansible
%{_datadir}/ceph/mgr/balancer
%{_datadir}/ceph/mgr/crash

View File

@ -1,5 +1,6 @@
lib/systemd/system/ceph-mgr*
usr/bin/ceph-mgr
usr/share/ceph/mgr/alerts
usr/share/ceph/mgr/ansible
usr/share/ceph/mgr/balancer
usr/share/ceph/mgr/crash

58
doc/mgr/alerts.rst Normal file
View File

@ -0,0 +1,58 @@
Alerts module
=============
The alerts module can send simple alert messages about cluster health
via e-mail. In the future, it will support other notification methods
as well.
:note: This module is *not* intended to be a robust monitoring
solution. The fact that it is run as part of the Ceph cluster
itself is fundamentally limiting in that a failure of the
ceph-mgr daemon prevents alerts from being sent. This module
can, however, be useful for standalone clusters that exist in
environments where no other monitoring infrastructure is
available.
Enabling
--------
The *alerts* module is enabled with::
ceph mgr module enable alerts
Configuration
-------------
To configure SMTP, all of the following config options must be set::
ceph config set mgr mgr/alerts/smtp_host *<smtp-server>*
ceph config set mgr mgr/alerts/smtp_destination *<email-address-to-send-to>*
ceph config set mgr mgr/alerts/smtp_sender *<from-email-address>*
By default, the module will use SSL and port 465. To change that,::
ceph config set mgr mgr/alerts/smtp_ssl false # if not SSL
ceph config set mgr mgr/alerts/smtp_port *<port-number>* # if not 465
To authenticate to the SMTP server, you must set the user and password::
ceph config set mgr mgr/alerts/smtp_user *<username>*
ceph config set mgr mgr/alerts/smtp_password *<password>*
By default, the name in the ``From:`` line is simply ``Ceph``. To
change that (e.g., to identify which cluster this is),::
ceph config set mgr mgr/alerts/smtp_from_name 'Ceph Cluster Foo'
By default, the module will check the cluster health once per minute
and, if there is a change, send a message. To change that
frequency,::
ceph config set mgr mgr/alerts/interval *<interval>* # e.g., "5m" for 5 minutes
Commands
--------
To force an alert to be sent immediately,::
ceph alerts send

View File

@ -29,6 +29,7 @@ sensible.
Writing modules <modules>
Writing orchestrator plugins <orchestrator_modules>
Dashboard module <dashboard>
Alerts module <alerts>
DiskPrediction module <diskprediction>
Local pool module <localpool>
RESTful module <restful>

View File

@ -0,0 +1 @@
from .module import Alerts

View File

@ -0,0 +1,240 @@
"""
A simple cluster health alerting module.
"""
from mgr_module import MgrModule, HandleCommandResult
from threading import Event
import errno
import json
import smtplib
class Alerts(MgrModule):
    """
    Manager module that e-mails cluster health changes.

    ``serve()`` periodically fetches the ``health`` report, compares it
    with the previously seen report and, when checks were added, grew or
    were cleared, sends a message through SMTP.  The ``alerts send``
    command sends the current health status on demand.
    """

    COMMANDS = [
        {
            "cmd": "alerts send",
            "desc": "(re)send alerts immediately",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {
            'name': 'interval',
            'type': 'seconds',
            'default': 60,
            'desc': 'How frequently to reexamine health status',
            'runtime': True,
        },
        # smtp
        {
            'name': 'smtp_host',
            'desc': 'SMTP server',
            'runtime': True,
        },
        {
            'name': 'smtp_destination',
            'default': '',
            'desc': 'Email address to send alerts to',
            'runtime': True,
        },
        {
            'name': 'smtp_port',
            'type': 'int',
            'default': 465,
            'desc': 'SMTP port',
            'runtime': True,
        },
        {
            'name': 'smtp_ssl',
            'type': 'bool',
            'default': True,
            'desc': 'Use SSL to connect to SMTP server',
            'runtime': True,
        },
        {
            'name': 'smtp_user',
            'default': '',
            'desc': 'User to authenticate as',
            'runtime': True,
        },
        {
            'name': 'smtp_password',
            'default': '',
            'desc': 'Password to authenticate with',
            'runtime': True,
        },
        {
            'name': 'smtp_sender',
            'default': '',
            'desc': 'SMTP envelope sender',
            'runtime': True,
        },
        {
            'name': 'smtp_from_name',
            'default': 'Ceph',
            'desc': 'Email From: name',
            'runtime': True,
        },
    ]

    # These are "native" Ceph options that this module cares about.
    NATIVE_OPTIONS = [
    ]

    def __init__(self, *args, **kwargs):
        """Set up loop control state and load the module options."""
        super(Alerts, self).__init__(*args, **kwargs)

        # set up some members to enable the serve() method and shutdown()
        self.run = True
        self.event = Event()

        # ensure config options members are initialized; see config_notify()
        self.config_notify()

        self.log.info("Init")

    def config_notify(self):
        """
        This method is called whenever one of our config options is changed.

        Every entry of MODULE_OPTIONS is mirrored into an attribute of the
        same name, so that, for instance, the 'smtp_host' option is always
        available as 'self.smtp_host'.  NATIVE_OPTIONS are mirrored the
        same way.
        """
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            value = self.get_module_option(name)
            # Fall back to the declared default for unset/empty values,
            # but keep an explicit boolean False (e.g. smtp_ssl = false):
            # a plain `value or default` would silently turn it back into
            # True.  Options without a 'default' entry (smtp_host) fall
            # back to None instead of raising KeyError.
            if not value and not isinstance(value, bool):
                value = opt.get('default')
            setattr(self, name, value)
            self.log.debug(' mgr option %s = %s', name, value)

        # Do the same for the native options.
        for opt in self.NATIVE_OPTIONS:
            value = self.get_ceph_option(opt)
            setattr(self, opt, value)
            self.log.debug(' native option %s = %s', opt, value)

    def handle_command(self, inbuf, cmd):
        """
        Handle a command declared in COMMANDS.

        :param inbuf: command input buffer (unused)
        :param cmd: command description; ``cmd['prefix']`` selects the action
        :return: HandleCommandResult with retval 0 and empty output
        """
        if cmd['prefix'] == 'alerts send':
            # An empty diff means only the full health status is reported.
            status = json.loads(self.get('health')['json'])
            self._send_alert(status, {})
        return HandleCommandResult(
            retval=0,    # exit code
            stdout='',   # stdout
            stderr='')   # stderr

    def _diff(self, last, new):
        """
        Compare two health reports.

        :param last: previously seen health report (may be empty)
        :param new: current health report
        :return: dict with any of the keys 'new' (checks absent from
                 ``last``), 'updated' (checks whose summary count grew)
                 and 'cleared' (checks absent from ``new``), each mapping
                 check code to alert; empty when nothing of that kind
                 changed
        """
        last_checks = last.get('checks', {})
        new_checks = new.get('checks', {})
        changes = {}

        for code, alert in new_checks.items():
            self.log.debug('new code %s alert %s', code, alert)
            if code not in last_checks:
                changes.setdefault('new', {})[code] = alert
                continue
            old_count = last_checks[code]['summary'].get('count', 0)
            if alert['summary'].get('count', 0) > old_count:
                changes.setdefault('updated', {})[code] = alert

        for code, alert in last_checks.items():
            self.log.debug('old code %s alert %s', code, alert)
            if code not in new_checks:
                changes.setdefault('cleared', {})[code] = alert

        return changes

    def _send_alert(self, status, diff):
        """Dispatch an alert to every configured transport (SMTP only)."""
        if self.smtp_host:
            self._send_alert_smtp(status, diff)

    def serve(self):
        """
        This method is called by the mgr when the module starts and can be
        used for any background activity.

        Polls the health report every ``interval`` seconds until
        shutdown() is called, sending an alert whenever _diff() reports a
        change.
        """
        self.log.info("Starting")
        last_status = {}
        while self.run:
            new_status = json.loads(self.get('health')['json'])
            if new_status != last_status:
                self.log.debug('last_status %s', last_status)
                self.log.debug('new_status %s', new_status)
                diff = self._diff(last_status, new_status)
                self.log.debug('diff %s', diff)
                delivered = True
                if diff:
                    try:
                        self._send_alert(new_status, diff)
                    except Exception:
                        # Boundary of the background loop: a failed
                        # delivery (e.g. SMTP server unreachable) must
                        # not end alerting for good.  last_status is left
                        # untouched so the same change is retried on the
                        # next iteration.
                        self.log.exception(
                            'Failed to send alert; will retry')
                        delivered = False
                if delivered:
                    last_status = new_status

            self.log.debug('Sleeping for %d seconds', self.interval)
            # wait() returns early when shutdown() sets the event
            self.event.wait(self.interval)
            self.event.clear()

    def shutdown(self):
        """
        This method is called by the mgr when the module needs to shut
        down (i.e., when the serve() function needs to exit).
        """
        self.log.info('Stopping')
        self.run = False
        # wake serve() out of its sleep so it notices self.run
        self.event.set()

    # SMTP
    def _smtp_format_alert(self, code, alert):
        """
        Render one health check as text.

        :param code: health check code
        :param alert: check dict with 'severity', 'summary' and 'detail'
        :return: a '[SEV] code: summary' line followed by one line per
                 detail message
        """
        lines = ['[{sev}] {code}: {summary}\n'.format(
            code=code,
            # severity looks like '<prefix>_<level>'; keep the level only
            sev=alert['severity'].split('_')[1],
            summary=alert['summary']['message'])]
        lines.extend(
            '        {message}\n'.format(message=detail['message'])
            for detail in alert['detail'])
        return ''.join(lines)

    def _send_alert_smtp(self, status, diff):
        """
        Compose and send the alert e-mail.

        :param status: current health report; 'status' is used as the
                       subject and 'checks' as the full listing
        :param diff: result of _diff(); may be empty
        :raises smtplib.SMTPException: (or a socket error) when the
                message cannot be delivered
        """
        # message
        self.log.debug('_send_alert_smtp')
        parts = ['From: {from_name} <{sender}>\n'
                 'Subject: {status}\n'
                 'To: {target}\n'
                 '\n'
                 '{status}\n'.format(
                     sender=self.smtp_sender,
                     from_name=self.smtp_from_name,
                     status=status['status'],
                     target=self.smtp_destination)]

        sections = (
            ('new', '\n--- New ---\n'),
            ('updated', '\n--- Updated ---\n'),
            ('cleared', '\n--- Cleared ---\n'),
        )
        for key, header in sections:
            if key in diff:
                parts.append(header)
                for code, alert in diff[key].items():
                    parts.append(self._smtp_format_alert(code, alert))

        parts.append('\n\n=== Full health status ===\n')
        # tolerate a report without 'checks', as _diff() does
        for code, alert in status.get('checks', {}).items():
            parts.append(self._smtp_format_alert(code, alert))

        message = ''.join(parts)
        self.log.debug('message: %s', message)

        # send
        if self.smtp_ssl:
            server = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port)
        else:
            server = smtplib.SMTP(self.smtp_host, self.smtp_port)
        try:
            if self.smtp_password:
                server.login(self.smtp_user, self.smtp_password)
            server.sendmail(self.smtp_sender, self.smtp_destination, message)
        finally:
            # Always release the connection, including when login() or
            # sendmail() raised.
            try:
                server.quit()
            except smtplib.SMTPServerDisconnected:
                # Connection already gone; just drop the socket.
                server.close()
        self.log.debug('Sent email to %s', self.smtp_destination)