mirror of
https://github.com/ceph/ceph
synced 2025-03-25 11:48:05 +00:00
mgr/alerts: simple module to send health alerts
Initially only SMTP support is implemented; nothing else. This is just smart enough for me to get emails from my home cluster when something goes wrong. No bells and whistles at this point. Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
parent
bbc7bb5a22
commit
d7223938f8
@ -1609,6 +1609,7 @@ fi
|
||||
%files mgr
|
||||
%{_bindir}/ceph-mgr
|
||||
%dir %{_datadir}/ceph/mgr
|
||||
%{_datadir}/ceph/mgr/alerts
|
||||
%{_datadir}/ceph/mgr/ansible
|
||||
%{_datadir}/ceph/mgr/balancer
|
||||
%{_datadir}/ceph/mgr/crash
|
||||
|
1
debian/ceph-mgr.install
vendored
1
debian/ceph-mgr.install
vendored
@ -1,5 +1,6 @@
|
||||
lib/systemd/system/ceph-mgr*
|
||||
usr/bin/ceph-mgr
|
||||
usr/share/ceph/mgr/alerts
|
||||
usr/share/ceph/mgr/ansible
|
||||
usr/share/ceph/mgr/balancer
|
||||
usr/share/ceph/mgr/crash
|
||||
|
58
doc/mgr/alerts.rst
Normal file
58
doc/mgr/alerts.rst
Normal file
@ -0,0 +1,58 @@
|
||||
Alerts module
|
||||
=============
|
||||
|
||||
The alerts module can send simple alert messages about cluster health
|
||||
via e-mail. In the future, it will support other notification methods
|
||||
as well.
|
||||
|
||||
:note: This module is *not* intended to be a robust monitoring
|
||||
solution. The fact that it is run as part of the Ceph cluster
|
||||
itself is fundamentally limiting in that a failure of the
|
||||
ceph-mgr daemon prevents alerts from being sent. This module
|
||||
can, however, be useful for standalone clusters that exist in
|
||||
environments where monitoring infrastructure does not
|
||||
exist.
|
||||
|
||||
Enabling
|
||||
--------
|
||||
|
||||
The *alerts* module is enabled with::
|
||||
|
||||
ceph mgr module enable alerts
|
||||
|
||||
Configuration
|
||||
-------------
|
||||
|
||||
To configure SMTP, all of the following config options must be set::
|
||||
|
||||
ceph config set mgr mgr/alerts/smtp_host *<smtp-server>*
|
||||
ceph config set mgr mgr/alerts/smtp_destination *<email-address-to-send-to>*
|
||||
ceph config set mgr mgr/alerts/smtp_sender *<from-email-address>*
|
||||
|
||||
By default, the module will use SSL and port 465. To change that,::
|
||||
|
||||
ceph config set mgr mgr/alerts/smtp_ssl false # if not SSL
|
||||
ceph config set mgr mgr/alerts/smtp_port *<port-number>* # if not 465
|
||||
|
||||
To authenticate to the SMTP server, you must set the user and password::
|
||||
|
||||
ceph config set mgr mgr/alerts/smtp_user *<username>*
|
||||
ceph config set mgr mgr/alerts/smtp_password *<password>*
|
||||
|
||||
By default, the name in the ``From:`` line is simply ``Ceph``. To
|
||||
change that (e.g., to identify which cluster this is),::
|
||||
|
||||
ceph config set mgr mgr/alerts/smtp_from_name 'Ceph Cluster Foo'
|
||||
|
||||
By default, the module will check the cluster health once per minute
|
||||
and, if there is a change, send a message. To change that
|
||||
frequency,::
|
||||
|
||||
ceph config set mgr mgr/alerts/interval *<interval>* # e.g., "5m" for 5 minutes
|
||||
|
||||
Commands
|
||||
--------
|
||||
|
||||
To force an alert to be sent immediately,::
|
||||
|
||||
ceph alerts send
|
@ -29,6 +29,7 @@ sensible.
|
||||
Writing modules <modules>
|
||||
Writing orchestrator plugins <orchestrator_modules>
|
||||
Dashboard module <dashboard>
|
||||
Alerts module <alerts>
|
||||
DiskPrediction module <diskprediction>
|
||||
Local pool module <localpool>
|
||||
RESTful module <restful>
|
||||
|
1
src/pybind/mgr/alerts/__init__.py
Normal file
1
src/pybind/mgr/alerts/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .module import Alerts
|
240
src/pybind/mgr/alerts/module.py
Normal file
240
src/pybind/mgr/alerts/module.py
Normal file
@ -0,0 +1,240 @@
|
||||
|
||||
"""
|
||||
A simple cluster health alerting module.
|
||||
"""
|
||||
|
||||
from mgr_module import MgrModule, HandleCommandResult
|
||||
from threading import Event
|
||||
import errno
|
||||
import json
|
||||
import smtplib
|
||||
|
||||
class Alerts(MgrModule):
    """
    ceph-mgr module that e-mails a summary of cluster health changes.

    A background loop (``serve``) polls the cluster health report, diffs
    it against the previously seen report and, when checks appear, grow or
    clear, sends a message through the configured SMTP server.  The
    ``alerts send`` command sends the current health report on demand.
    """

    COMMANDS = [
        {
            "cmd": "alerts send",
            "desc": "(re)send alerts immediately",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {
            'name': 'interval',
            # NOTE(review): other Ceph options appear to spell this type
            # 'secs'; confirm that 'seconds' is accepted by the mgr.
            'type': 'seconds',
            'default': 60,
            'desc': 'How frequently to reexamine health status',
            'runtime': True,
        },
        # smtp
        {
            'name': 'smtp_host',
            # An empty host means "SMTP not configured"; see _send_alert().
            'default': '',
            'desc': 'SMTP server',
            'runtime': True,
        },
        {
            'name': 'smtp_destination',
            'default': '',
            'desc': 'Email address to send alerts to',
            'runtime': True,
        },
        {
            'name': 'smtp_port',
            'type': 'int',
            'default': 465,
            'desc': 'SMTP port',
            'runtime': True,
        },
        {
            'name': 'smtp_ssl',
            'type': 'bool',
            'default': True,
            'desc': 'Use SSL to connect to SMTP server',
            'runtime': True,
        },
        {
            'name': 'smtp_user',
            'default': '',
            'desc': 'User to authenticate as',
            'runtime': True,
        },
        {
            'name': 'smtp_password',
            'default': '',
            'desc': 'Password to authenticate with',
            'runtime': True,
        },
        {
            'name': 'smtp_sender',
            'default': '',
            'desc': 'SMTP envelope sender',
            'runtime': True,
        },
        {
            'name': 'smtp_from_name',
            'default': 'Ceph',
            'desc': 'Email From: name',
            'runtime': True,
        },
    ]

    # These are "native" Ceph options that this module cares about.
    NATIVE_OPTIONS = [
    ]

    def __init__(self, *args, **kwargs):
        """Initialize the module and load the current option values."""
        super(Alerts, self).__init__(*args, **kwargs)

        # set up some members to enable the serve() method and shutdown()
        self.run = True
        self.event = Event()

        # ensure config options members are initialized; see config_notify()
        self.config_notify()

        self.log.info("Init")

    def config_notify(self):
        """
        This method is called whenever one of our config options is changed.

        Every entry of MODULE_OPTIONS is mirrored into an instance
        attribute of the same name, so that, for instance, the 'smtp_host'
        option is always available as 'self.smtp_host'.
        """
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            value = self.get_module_option(name)
            # Fall back to the declared default only when nothing was
            # returned.  Using "or" here would discard explicit falsy
            # settings such as smtp_ssl=False.
            if value is None:
                value = opt.get('default')
            setattr(self, name, value)
            # Never write credentials to the log.
            shown = '********' if ('password' in name and value) else value
            self.log.debug(' mgr option %s = %s', name, shown)
        # Do the same for the native options.
        for opt in self.NATIVE_OPTIONS:
            setattr(self, opt, self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))

    def handle_command(self, inbuf, cmd):
        """
        Handle the commands declared in COMMANDS.

        :param inbuf: command input buffer (unused)
        :param cmd: command dict; only ``cmd['prefix']`` is examined
        :return: HandleCommandResult; ``retval`` is 0 on success,
                 ``-errno.EINVAL`` for an unrecognized prefix and
                 ``-errno.EIO`` when sending the alert failed
        """
        if cmd['prefix'] != 'alerts send':
            return HandleCommandResult(
                retval=-errno.EINVAL,
                stdout='',
                stderr='unrecognized command: %s' % cmd['prefix'])

        status = json.loads(self.get('health')['json'])
        try:
            self._send_alert(status, {})
        except (smtplib.SMTPException, EnvironmentError) as e:
            # Report delivery problems to the caller instead of letting
            # the exception escape the command handler.
            self.log.exception('Failed to send alert')
            return HandleCommandResult(
                retval=-errno.EIO,
                stdout='',
                stderr='failed to send alert: %s' % e)
        return HandleCommandResult(
            retval=0,   # exit code
            stdout='',  # stdout
            stderr='')

    def _diff(self, last, new):
        """
        Compare two health reports.

        :param last: previously seen health report (may be ``{}``)
        :param new: current health report
        :return: dict with any of the keys 'new', 'updated' and 'cleared',
                 each mapping check code -> alert.  'updated' holds checks
                 whose summary count increased.  Empty when nothing
                 relevant changed.
        """
        d = {}
        old_checks = last.get('checks', {})
        new_checks = new.get('checks', {})
        for code, alert in new_checks.items():
            self.log.debug('new code %s alert %s', code, alert)
            if code not in old_checks:
                d.setdefault('new', {})[code] = alert
            elif alert['summary'].get('count', 0) > \
                    old_checks[code]['summary'].get('count', 0):
                d.setdefault('updated', {})[code] = alert
        for code, alert in old_checks.items():
            self.log.debug('old code %s alert %s', code, alert)
            if code not in new_checks:
                d.setdefault('cleared', {})[code] = alert
        return d

    def _send_alert(self, status, diff):
        """
        Dispatch an alert to every configured notification method.

        Only SMTP exists; it is used when 'smtp_host' is set.  Exceptions
        raised while sending propagate to the caller.
        """
        if self.smtp_host:
            self._send_alert_smtp(status, diff)

    def serve(self):
        """
        This method is called by the mgr when the module starts and can be
        used for any background activity.

        Polls cluster health every 'interval' seconds until shutdown() is
        called, sending an alert whenever _diff() reports a change.
        """
        self.log.info("Starting")
        last_status = {}
        while self.run:
            new_status = json.loads(self.get('health')['json'])
            if new_status != last_status:
                self.log.debug('last_status %s', last_status)
                self.log.debug('new_status %s', new_status)
                diff = self._diff(last_status, new_status)
                self.log.debug('diff %s', diff)
                delivered = True
                if diff:
                    try:
                        self._send_alert(new_status, diff)
                    except Exception:
                        # A mail server outage must not end the serve
                        # loop; log it and carry on.
                        self.log.exception('Failed to send alert')
                        delivered = False
                # Keep the old baseline after a failed send so the same
                # change is detected, and the alert retried, next time.
                if delivered:
                    last_status = new_status

            self.log.debug('Sleeping for %d seconds', self.interval)
            self.event.wait(self.interval)
            self.event.clear()

    def shutdown(self):
        """
        This method is called by the mgr when the module needs to shut
        down (i.e., when the serve() function needs to exit).
        """
        self.log.info('Stopping')
        self.run = False
        # wake serve() if it is sleeping so it notices self.run
        self.event.set()

    # SMTP
    def _smtp_format_alert(self, code, alert):
        """
        Render one health check as plain text.

        :param code: health check code
        :param alert: check dict with 'severity', 'summary' and 'detail'
        :return: a '[SEV] CODE: summary' line followed by one indented
                 line per detail message
        """
        lines = ['[{sev}] {code}: {summary}\n'.format(
            code=code,
            # second '_'-separated field of the severity string
            sev=alert['severity'].split('_')[1],
            summary=alert['summary']['message'])]
        for detail in alert['detail']:
            lines.append('        {message}\n'.format(
                message=detail['message']))
        return ''.join(lines)

    def _send_alert_smtp(self, status, diff):
        """
        Compose the alert e-mail and send it via the configured server.

        :param status: current health report; 'status' is used for the
                       subject and body, 'checks' for the full listing
        :param diff: output of _diff(); may be empty for a plain resend
        :raises smtplib.SMTPException, EnvironmentError: presumably on
                connection, login or delivery failure -- confirm against
                the smtplib documentation
        """
        # message
        self.log.debug('_send_alert_smtp')
        parts = ['From: {from_name} <{sender}>\n'
                 'Subject: {status}\n'
                 'To: {target}\n'
                 '\n'
                 '{status}\n'.format(
                     sender=self.smtp_sender,
                     from_name=self.smtp_from_name,
                     status=status['status'],
                     target=self.smtp_destination)]

        for key, title in (('new', '\n--- New ---\n'),
                           ('updated', '\n--- Updated ---\n'),
                           ('cleared', '\n--- Cleared ---\n')):
            if key in diff:
                parts.append(title)
                for code, alert in diff[key].items():
                    parts.append(self._smtp_format_alert(code, alert))

        parts.append('\n\n=== Full health status ===\n')
        for code, alert in status.get('checks', {}).items():
            parts.append(self._smtp_format_alert(code, alert))

        message = ''.join(parts)
        self.log.debug('message: %s', message)

        # send
        if self.smtp_ssl:
            server = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port)
        else:
            server = smtplib.SMTP(self.smtp_host, self.smtp_port)
        try:
            if self.smtp_password:
                server.login(self.smtp_user, self.smtp_password)
            server.sendmail(self.smtp_sender, self.smtp_destination, message)
        finally:
            # Always release the connection, even when login or sendmail
            # failed.  If QUIT itself fails, just drop the socket so the
            # original error is not masked.
            try:
                server.quit()
            except smtplib.SMTPException:
                server.close()
        self.log.debug('Sent email to %s', self.smtp_destination)
|
Loading…
Reference in New Issue
Block a user