mirror of https://github.com/ceph/ceph
369 lines
12 KiB
Plaintext
369 lines
12 KiB
Plaintext
CEPH-MIB DEFINITIONS ::= BEGIN
|
|
|
|
IMPORTS
|
|
MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
|
|
FROM SNMPv2-SMI
|
|
MODULE-COMPLIANCE, NOTIFICATION-GROUP
|
|
FROM SNMPv2-CONF
|
|
;
|
|
|
|
-- Linting information:
|
|
--
|
|
-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
|
|
--
|
|
-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
|
|
--
|
|
|
|
-- Module identity: roots the whole CEPH-MIB at enterprises.50495
-- (numeric OID 1.3.6.1.4.1.50495) and carries contact and revision metadata.
-- NOTE(review): SMIv2 (RFC 2578) expects LAST-UPDATED and a matching REVISION
-- clause to be bumped whenever definitions are added or changed. Confirm that
-- the date below still covers every notification defined in this module.
ceph MODULE-IDENTITY
    LAST-UPDATED
        "202111010000Z" -- Nov 01, 2021
    ORGANIZATION
        "The Ceph Project
         https://ceph.io"
    CONTACT-INFO
        "Email: <dev@ceph.io>

         Send comments to: <dev@ceph.io>"
    DESCRIPTION
        "The MIB module for Ceph. In its current form it only
         supports Notifications, since Ceph itself doesn't provide
         any SNMP agent functionality.

         Notifications are provided through a Prometheus/Alertmanager
         webhook passing alerts to an external gateway service that is
         responsible for formatting, forwarding and authenticating to
         the SNMP receiver.
        "
    REVISION
        "202111010000Z" -- Nov 01, 2021
    DESCRIPTION
        "Latest version including the following updates:

         - MIB restructure to align with linting
         - names shortened and simplified (less verbose)
         - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
         - objects removed
         - notifications updated
         - Added module compliance
         - Updated to latest prometheus alert rule definitions
        "
    ::= { enterprises 50495 }
|
|
|
|
-- Top-level arcs directly beneath the ceph module root (1.3.6.1.4.1.50495).
cephCluster         OBJECT IDENTIFIER ::= { ceph 1 }              -- .50495.1
cephConformance     OBJECT IDENTIFIER ::= { ceph 2 }              -- .50495.2

-- cephMetadata is a placeholder for possible future expansion via an agent,
-- where we could provide an overview of the cluster's configuration.
cephMetadata        OBJECT IDENTIFIER ::= { cephCluster 1 }       -- .50495.1.1
cephNotifications   OBJECT IDENTIFIER ::= { cephCluster 2 }       -- .50495.1.2

-- Parent arc for the notification branches defined further down.
prometheus          OBJECT IDENTIFIER ::= { cephNotifications 1 } -- .50495.1.2.1
|
|
|
|
--
|
|
-- Notifications: one branch per category of notification / alert.
-- Every arc below hangs off prometheus (1.3.6.1.4.1.50495.1.2.1).
-- NOTE(review): arc 13 is not assigned anywhere in this module; presumably it
-- is reserved or retired. Confirm before reusing it.
promGeneric         OBJECT IDENTIFIER ::= { prometheus 1 }
promHealthStatus    OBJECT IDENTIFIER ::= { prometheus 2 }
promMon             OBJECT IDENTIFIER ::= { prometheus 3 }
promOsd             OBJECT IDENTIFIER ::= { prometheus 4 }
promMds             OBJECT IDENTIFIER ::= { prometheus 5 }
promMgr             OBJECT IDENTIFIER ::= { prometheus 6 }
promPGs             OBJECT IDENTIFIER ::= { prometheus 7 }
promNode            OBJECT IDENTIFIER ::= { prometheus 8 }
promPool            OBJECT IDENTIFIER ::= { prometheus 9 }
promRados           OBJECT IDENTIFIER ::= { prometheus 10 }
promCephadm         OBJECT IDENTIFIER ::= { prometheus 11 }
promPrometheus      OBJECT IDENTIFIER ::= { prometheus 12 }
promNVMeGateway     OBJECT IDENTIFIER ::= { prometheus 14 }
|
|
|
|
-- promGenericNotification: notification under the promGeneric branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.1.1
promGenericNotification NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Generic alert issued when the Prometheus rule doesn't provide an OID."
    ::= { promGeneric 1 }
|
|
|
|
-- promGenericDaemonCrash: notification under the promGeneric branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.1.2
promGenericDaemonCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more daemons have crashed recently, and are yet to be archived"
    ::= { promGeneric 2 }
|
|
|
|
-- promHealthStatusError: notification under the promHealthStatus branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.2.1
promHealthStatusError NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph in health_error state for too long."
    ::= { promHealthStatus 1 }
|
|
|
|
-- promHealthStatusWarning: notification under the promHealthStatus branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.2.2
promHealthStatusWarning NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph in health_warn for too long."
    ::= { promHealthStatus 2 }
|
|
|
|
-- promMonLowQuorum: notification under the promMon branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.3.1
promMonLowQuorum NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Monitor count in quorum is low."
    ::= { promMon 1 }
|
|
|
|
-- promMonDiskSpaceCritical: notification under the promMon branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.3.2
promMonDiskSpaceCritical NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Monitor diskspace is critically low."
    ::= { promMon 2 }
|
|
|
|
-- promOsdDownHigh: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.1
promOsdDownHigh NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A high number of OSDs are down."
    ::= { promOsd 1 }
|
|
|
|
-- promOsdDown: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.2
promOsdDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more OSDs are down."
    ::= { promOsd 2 }
|
|
|
|
-- promOsdNearFull: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.3
promOsdNearFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD is dangerously full."
    ::= { promOsd 3 }
|
|
|
|
-- promOsdFlapping: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.4
promOsdFlapping NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD was marked down and back up at least once a minute for 5 minutes."
    ::= { promOsd 4 }
|
|
|
|
-- promOsdHighPgDeviation: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.5
promOsdHighPgDeviation NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD deviates by more than 30% from average PG count."
    ::= { promOsd 5 }
|
|
|
|
-- promOsdFull: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.6
promOsdFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD has reached its full threshold."
    ::= { promOsd 6 }
|
|
|
|
-- promOsdHighPredictedFailures: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.7
promOsdHighPredictedFailures NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Normal self healing unable to cope with the number of devices predicted to fail."
    ::= { promOsd 7 }
|
|
|
|
-- promOsdHostDown: notification under the promOsd branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.4.8
promOsdHostDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph OSD host is down."
    ::= { promOsd 8 }
|
|
|
|
-- promMdsDamaged: notification under the promMds branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.5.1
promMdsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is damaged."
    ::= { promMds 1 }
|
|
|
|
-- promMdsReadOnly: notification under the promMds branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.5.2
promMdsReadOnly NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem marked as READ-ONLY"
    ::= { promMds 2 }
|
|
|
|
-- promMdsOffline: notification under the promMds branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.5.3
promMdsOffline NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is unavailable/offline."
    ::= { promMds 3 }
|
|
|
|
-- promMdsDegraded: notification under the promMds branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.5.4
promMdsDegraded NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is in a degraded state."
    ::= { promMds 4 }
|
|
|
|
-- promMdsNoStandby: notification under the promMds branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.5.5
promMdsNoStandby NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs MDS daemon failure, no standby available"
    ::= { promMds 5 }
|
|
|
|
-- promMgrModuleCrash: notification under the promMgr branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.6.1
promMgrModuleCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph mgr module has crashed recently"
    ::= { promMgr 1 }
|
|
|
|
-- promMgrPrometheusInactive: notification under the promMgr branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.6.2
promMgrPrometheusInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph mgr prometheus module not responding"
    ::= { promMgr 2 }
|
|
|
|
-- promPGsInactive: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.1
promPGsInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are inactive for more than 5 minutes."
    ::= { promPGs 1 }
|
|
|
|
-- promPGsUnclean: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.2
promPGsUnclean NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are not clean for more than 15 minutes."
    ::= { promPGs 2 }
|
|
|
|
-- promPGsUnavailable: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.3
promPGsUnavailable NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are unavailable, blocking I/O to those objects."
    ::= { promPGs 3 }
|
|
|
|
-- promPGsDamaged: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.4
promPGsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are damaged."
    ::= { promPGs 4 }
|
|
|
|
-- promPGsRecoveryFull: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.5
promPGsRecoveryFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "PG recovery is impaired due to full OSDs."
    ::= { promPGs 5 }
|
|
|
|
-- promPGsBackfillFull: notification under the promPGs branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.7.6
promPGsBackfillFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "PG backfill is impaired due to full OSDs."
    ::= { promPGs 6 }
|
|
|
|
-- promNodeRootVolumeFull: notification under the promNode branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.8.1
promNodeRootVolumeFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Root volume (OSD and MON store) is dangerously full (< 5% free)."
    ::= { promNode 1 }
|
|
|
|
-- promNodeNetworkPacketDrops: notification under the promNode branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.8.2
promNodeNetworkPacketDrops NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A node experiences packet drop > 1 packet/s on an interface."
    ::= { promNode 2 }
|
|
|
|
-- promNodeNetworkPacketErrors: notification under the promNode branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.8.3
promNodeNetworkPacketErrors NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A node experiences packet errors > 1 packet/s on an interface."
    ::= { promNode 3 }
|
|
|
|
-- promNodeStorageFilling: notification under the promNode branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.8.4
promNodeStorageFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promNode 4 }
|
|
|
|
-- promPoolFull: notification under the promPool branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.9.1
promPoolFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A pool is at 90% capacity or over."
    ::= { promPool 1 }
|
|
|
|
-- promPoolFilling: notification under the promPool branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.9.2
promPoolFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promPool 2 }
|
|
|
|
-- promRadosUnfound: notification under the promRados branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.10.1
promRadosUnfound NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A RADOS object can not be found, even though all OSDs are online."
    ::= { promRados 1 }
|
|
|
|
-- promRadosRBDMirrorImagesVeryHigh: notification under the promRados branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.10.2
promRadosRBDMirrorImagesVeryHigh NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Number of RBD image replications is very high."
    ::= { promRados 2 }
|
|
|
|
-- promRadosRBDMirrorUnsyncImages: notification under the promRados branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.10.3
promRadosRBDMirrorUnsyncImages NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Local RBD images are not in sync with the remote counterparts"
    ::= { promRados 3 }
|
|
|
|
-- promRadosRBDMirrorUnsyncImagesHigh: notification under the promRados branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.10.4
promRadosRBDMirrorUnsyncImagesHigh NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "There is a high percentage of un-sync RBD images."
    ::= { promRados 4 }
|
|
|
|
-- promRadosRBDMirrorHighBandwidth: notification under the promRados branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.10.5
promRadosRBDMirrorHighBandwidth NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A high bandwidth usage is detected during RBD image transfers."
    ::= { promRados 5 }
|
|
|
|
-- promCephadmDaemonDown: notification under the promCephadm branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.11.1
promCephadmDaemonDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephadm has determined that a daemon is down."
    ::= { promCephadm 1 }
|
|
|
|
-- promCephadmUpgradeFailure: notification under the promCephadm branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.11.2
promCephadmUpgradeFailure NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephadm attempted to upgrade the cluster and encountered a problem."
    ::= { promCephadm 2 }
|
|
|
|
-- promPrometheusJobMissing: notification under the promPrometheus branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.12.1
promPrometheusJobMissing NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "The prometheus scrape job is not defined."
    ::= { promPrometheus 1 }
|
|
|
|
-- promNVMeGatewayNicDown: notification under the promNVMeGateway branch.
-- Numeric OID: 1.3.6.1.4.1.50495.1.2.1.14.1
promNVMeGatewayNicDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A NIC used for NVMe gateway client traffic is down."
    ::= { promNVMeGateway 1 }
|
|
-- ---------------------------------------------------------- --
|
|
-- Ceph MIB - Conformance Information
|
|
-- ---------------------------------------------------------- --
|
|
|
|
-- Conformance sub-arcs beneath cephConformance (1.3.6.1.4.1.50495.2):
-- arc 1 holds group definitions, arc 2 holds compliance statements.
cephAlertGroups     OBJECT IDENTIFIER ::= { cephConformance 1 }   -- .50495.2.1
cephCompliances     OBJECT IDENTIFIER ::= { cephConformance 2 }   -- .50495.2.2
|
|
|
|
-- ---------------------------------------------------------- --
|
|
-- units of conformance
|
|
-- ---------------------------------------------------------- --
|
|
|
|
-- ---------------------------------------------------------- --
|
|
-- The Trap Notification Group
|
|
-- ---------------------------------------------------------- --
|
|
|
|
-- Single notification group listing every NOTIFICATION-TYPE defined in this
-- module, so the compliance statement can require them as one unit.
-- Members are kept in definition order, grouped by their parent branch.
-- Numeric OID: 1.3.6.1.4.1.50495.2.1.2
cephNotificationGroup NOTIFICATION-GROUP
    NOTIFICATIONS {
        -- promGeneric
        promGenericNotification, promGenericDaemonCrash,
        -- promHealthStatus
        promHealthStatusError, promHealthStatusWarning,
        -- promMon
        promMonLowQuorum, promMonDiskSpaceCritical,
        -- promOsd
        promOsdDownHigh, promOsdDown, promOsdNearFull, promOsdFlapping,
        promOsdHighPgDeviation, promOsdFull, promOsdHighPredictedFailures,
        promOsdHostDown,
        -- promMds
        promMdsDamaged, promMdsReadOnly, promMdsOffline, promMdsDegraded,
        promMdsNoStandby,
        -- promMgr
        promMgrModuleCrash, promMgrPrometheusInactive,
        -- promPGs
        promPGsInactive, promPGsUnclean, promPGsUnavailable, promPGsDamaged,
        promPGsRecoveryFull, promPGsBackfillFull,
        -- promNode
        promNodeRootVolumeFull, promNodeNetworkPacketDrops,
        promNodeNetworkPacketErrors, promNodeStorageFilling,
        -- promPool
        promPoolFull, promPoolFilling,
        -- promRados
        promRadosUnfound, promRadosRBDMirrorImagesVeryHigh,
        promRadosRBDMirrorUnsyncImages, promRadosRBDMirrorUnsyncImagesHigh,
        promRadosRBDMirrorHighBandwidth,
        -- promCephadm
        promCephadmDaemonDown, promCephadmUpgradeFailure,
        -- promPrometheus
        promPrometheusJobMissing,
        -- promNVMeGateway
        promNVMeGatewayNicDown
    }
    STATUS      current
    DESCRIPTION
        "A collection of notifications triggered by the Prometheus
         rules to convey Ceph cluster state"
    ::= { cephAlertGroups 2 }
|
|
|
|
-- ---------------------------------------------------------- --
|
|
-- compliance statements
|
|
-- ---------------------------------------------------------- --
|
|
|
|
-- Compliance statement for this MIB: cephNotificationGroup is the only
-- mandatory group.
-- Numeric OID: 1.3.6.1.4.1.50495.2.2.1
cephCompliance MODULE-COMPLIANCE
    STATUS      current
    DESCRIPTION
        "The Compliance statement for the Ceph MIB"
    MODULE      -- no module name given, so this refers to the enclosing module
        MANDATORY-GROUPS { cephNotificationGroup }
    ::= { cephCompliances 1 }
|
|
|
|
END
|