Merge pull request #43274 from pcuzner/add-mib

monitoring:Adding the Ceph MIB

Reviewed-by: Sebastian Wagner <sewagner@redhat.com>
This commit is contained in:
Sebastian Wagner 2021-10-12 22:29:06 +02:00 committed by GitHub
commit 53382d70eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 408 additions and 0 deletions

View File

@ -5,3 +5,10 @@ In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that
should provide a decent set of default alerts for a Ceph cluster. Just put this
file in a place according to your Prometheus configuration (wherever the `rules`
configuration stanza points).
### SNMP
Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending Prometheus
alerts through to an SNMP management platform. The translation from Prometheus
alert to SNMP trap requires the Prometheus alert to contain an OID that maps to
a definition within the MIB. When making changes to the Prometheus alert rules
file, developers should include any necessary changes to the MIB.

View File

@ -0,0 +1,377 @@
CEPH-PROMETHEUS-ALERT-MIB DEFINITIONS ::= BEGIN
-- SMIv2 module describing the notifications (traps) and the varbind objects
-- used when Prometheus alerts are sent on to an SNMP management platform.
IMPORTS
MODULE-IDENTITY, OBJECT-TYPE, NOTIFICATION-TYPE, enterprises, TimeTicks
FROM SNMPv2-SMI
DisplayString
FROM SNMPv2-TC
;
-- Root of the Ceph subtree: 1.3.6.1.4.1.50495
ceph OBJECT IDENTIFIER ::= { enterprises 50495 }
-- Prometheus related definitions: 1.3.6.1.4.1.50495.15
prometheus OBJECT IDENTIFIER ::= { ceph 15 }
-- Module identity, registered at 1.3.6.1.4.1.50495.15.1
prometheusAlert MODULE-IDENTITY
LAST-UPDATED "201904010000Z" -- 1. Apr 2019
ORGANIZATION "The Ceph Project"
CONTACT-INFO "https://ceph.com"
DESCRIPTION "Prometheus Alert SNMP MIB"
REVISION "201904010000Z" -- 1. Apr 2019
DESCRIPTION "Initial version."
::= { prometheus 1 }
-- Varbind objects live under 1.3.6.1.4.1.50495.15.1.1
prometheusAlertObjects OBJECT IDENTIFIER ::= { prometheusAlert 1 }
-- Notifications (traps) live under 1.3.6.1.4.1.50495.15.1.2
prometheusAlertTraps OBJECT IDENTIFIER ::= { prometheusAlert 2 }
--
-- Objects
--
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.1; only sent as a notification varbind.
prometheusAlertNotificationAlertName OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The name of the Prometheus alert."
::= { prometheusAlertObjects 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.2; only sent as a notification varbind.
prometheusAlertNotificationStatus OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The status of the Prometheus alert."
::= { prometheusAlertObjects 2 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.3; only sent as a notification varbind.
prometheusAlertNotificationSeverity OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The severity of the Prometheus alert."
::= { prometheusAlertObjects 3 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.4; only sent as a notification varbind.
prometheusAlertNotificationInstance OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "Unique identifier for the Prometheus instance."
::= { prometheusAlertObjects 4 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.5; only sent as a notification varbind.
prometheusAlertNotificationJob OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The name of the Prometheus job."
::= { prometheusAlertObjects 5 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.6; only sent as a notification varbind.
prometheusAlertNotificationDescription OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The Prometheus alert description field."
::= { prometheusAlertObjects 6 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.7; only sent as a notification varbind.
-- NOTE(review): DisplayString is presumably limited to 255 octets by SNMPv2-TC,
-- and a JSON label set may be longer. Confirm how senders and receivers handle that.
prometheusAlertNotificationLabels OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "Additional Prometheus alert labels as JSON string."
::= { prometheusAlertObjects 7 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.8; only sent as a notification varbind.
-- NOTE(review): monitoring/snmp/README.md lists this object as a "Unix timestamp",
-- but the SYNTAX here is TimeTicks. Confirm which encoding the trap sender uses.
prometheusAlertNotificationTimestamp OBJECT-TYPE
SYNTAX TimeTicks
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The time when the Prometheus alert occurred."
::= { prometheusAlertObjects 8 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.1.9; only sent as a notification varbind.
-- NOTE(review): DisplayString is presumably limited to 255 octets by SNMPv2-TC,
-- and a raw JSON alert may be longer. Confirm how senders and receivers handle that.
prometheusAlertNotificationRawData OBJECT-TYPE
SYNTAX DisplayString
MAX-ACCESS accessible-for-notify
STATUS current
DESCRIPTION "The raw Prometheus alert as JSON string."
::= { prometheusAlertObjects 9 }
--
-- Traps
--
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.1.
-- Per monitoring/snmp/README.md this trap is used if no OID is specified in the
-- alert labels.
prometheusAlertTrapDefault NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "Default trap."
::= { prometheusAlertTraps 1 }
-- Per-component trap subtrees under 1.3.6.1.4.1.50495.15.1.2 (arcs 2 through 9).
-- The Mds and Mgr subtrees are declared here, but this module defines no
-- notifications under them.
prometheusAlertClusterHealthTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 2 }
prometheusAlertMonTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 3 }
prometheusAlertOsdTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 4 }
prometheusAlertMdsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 5 }
prometheusAlertMgrTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 6 }
prometheusAlertPgsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 7 }
prometheusAlertNodesTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 8 }
prometheusAlertPoolsTraps OBJECT IDENTIFIER ::= { prometheusAlertTraps 9 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.2.1.
prometheusAlertClusterHealthTrapHealthError NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "Ceph in health_error state for more than 5m."
::= { prometheusAlertClusterHealthTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.2.2.
prometheusAlertClusterHealthTrapHealthWarn NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "Ceph in health_warn for more than 15m."
::= { prometheusAlertClusterHealthTraps 2 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.3.1.
prometheusAlertMonTrapLowMonitorQuorumCount NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "Monitor count in quorum is low."
::= { prometheusAlertMonTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.4.1.
prometheusAlertOsdTrap10PercentOsdsDown NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "More than 10% of OSDs are down."
::= { prometheusAlertOsdTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.4.2.
prometheusAlertOsdTrapOsdDown NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "One or more OSDs down for more than 15 minutes."
::= { prometheusAlertOsdTraps 2 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.4.3.
prometheusAlertOsdTrapOsdsNearFull NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "An OSD is dangerously full, over 80%."
::= { prometheusAlertOsdTraps 3 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.4.4.
-- Sent for a flapping OSD, i.e. one repeatedly marked down and then back up.
prometheusAlertOsdTrapFlapOsd NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "An OSD was marked down and back up at least once a minute for 5 minutes."
::= { prometheusAlertOsdTraps 4 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.4.5.
prometheusAlertOsdTrapHighPgCountDeviation NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "An OSD deviates by more than 30% from average PG count."
::= { prometheusAlertOsdTraps 5 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.7.1.
prometheusAlertPgsTrapPgsInactive NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "One or more PGs are inactive for more than 5 minutes."
::= { prometheusAlertPgsTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.7.2.
prometheusAlertPgsTrapPgsUnclean NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "One or more PGs are not clean for more than 15 minutes."
::= { prometheusAlertPgsTraps 2 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.8.1.
prometheusAlertNodesTrapRootVolumeFull NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)."
::= { prometheusAlertNodesTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.8.2.
prometheusAlertNodesTrapNetworkPacketsDropped NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
::= { prometheusAlertNodesTraps 2 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.8.3.
prometheusAlertNodesTrapNetworkPacketErrors NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface."
::= { prometheusAlertNodesTraps 3 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.8.4.
prometheusAlertNodesTrapStorageFilling NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
::= { prometheusAlertNodesTraps 4 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.9.1.
prometheusAlertPoolsTrapPoolFull NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "A pool is at 90% capacity or over."
::= { prometheusAlertPoolsTraps 1 }
-- Resolves to OID 1.3.6.1.4.1.50495.15.1.2.9.2.
prometheusAlertPoolsTrapPoolFillingUp NOTIFICATION-TYPE
OBJECTS {
prometheusAlertNotificationAlertName,
prometheusAlertNotificationStatus,
prometheusAlertNotificationSeverity,
prometheusAlertNotificationInstance,
prometheusAlertNotificationJob,
prometheusAlertNotificationDescription,
prometheusAlertNotificationLabels,
prometheusAlertNotificationTimestamp,
prometheusAlertNotificationRawData
}
STATUS current
DESCRIPTION "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
::= { prometheusAlertPoolsTraps 2 }
END

24
monitoring/snmp/README.md Normal file
View File

@ -0,0 +1,24 @@
# SNMP schema
## Traps
| OID | Description |
| :--- | :--- |
| 1.3.6.1.4.1.50495.15.1.2.1 | The default trap. This is used if no OID is specified in the alert labels. |
| 1.3.6.1.4.1.50495.15.1.2.[2...N] | Custom traps. |
## Objects
The following objects are appended as variable bindings (varbinds) to each SNMP trap.
| OID | Type | Description |
| :--- | :---: | :--- |
| 1.3.6.1.4.1.50495.15.1.1.1 | String | The name of the Prometheus alert. |
| 1.3.6.1.4.1.50495.15.1.1.2 | String | The status of the Prometheus alert. |
| 1.3.6.1.4.1.50495.15.1.1.3 | String | The severity of the Prometheus alert. |
| 1.3.6.1.4.1.50495.15.1.1.4 | String | Unique identifier for the Prometheus instance. |
| 1.3.6.1.4.1.50495.15.1.1.5 | String | The name of the Prometheus job. |
| 1.3.6.1.4.1.50495.15.1.1.6 | String | The Prometheus alert description field. |
| 1.3.6.1.4.1.50495.15.1.1.7 | String | Additional Prometheus alert labels as JSON string. |
| 1.3.6.1.4.1.50495.15.1.1.8 | Unix timestamp | The time when the Prometheus alert occurred. |
| 1.3.6.1.4.1.50495.15.1.1.9 | String | The raw Prometheus alert as JSON string. |