feat: custom SEL events metrics

Signed-off-by: Aleksandr Aleksandrov <aleks.aleksandrov@corp.mail.ru>
This commit is contained in:
Aleksandr Aleksandrov 2024-01-12 11:35:50 +03:00
parent 3853e45ee9
commit 45ff1b4947
6 changed files with 325 additions and 94 deletions

141
collector_sel_events.go Normal file
View File

@ -0,0 +1,141 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"time"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus-community/ipmi_exporter/freeipmi"
)
const (
// SELEventsCollectorName is the name ("sel-events") under which
// SELEventsCollector is registered; it is the value returned by
// SELEventsCollector.Name.
SELEventsCollectorName CollectorName = "sel-events"
)
// Metric descriptors emitted by SELEventsCollector.Collect.
var (
// selEventsCountByStateDesc describes the number of SEL entries, one series
// per distinct value of the entry's state column ("state" label).
selEventsCountByStateDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "count_by_state"),
"Current number of log entries in the SEL by state.",
[]string{"state"},
nil,
)
// selEventsCountByNameDesc describes the number of SEL entries matching a
// user-configured event, one series per configured name ("name" label).
selEventsCountByNameDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "count_by_name"),
"Current number of custom log entries in the SEL by name.",
[]string{"name"},
nil,
)
// selEventsLatestTimestampDesc describes the Unix timestamp (seconds) of the
// most recent SEL entry matching a user-configured event; Collect reports 0
// when no entry matched.
selEventsLatestTimestampDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "latest_timestamp"),
"Latest timestamp of custom log entries in the SEL by name.",
[]string{"name"},
nil,
)
)
// SELEventsCollector exports metrics derived from the individual entries of
// the system event log (SEL): a count of entries per state, plus a count and
// latest timestamp for every event configured in the target's SELEvents list.
// It carries no state of its own.
type SELEventsCollector struct{}
// Name returns the collector's identifier, SELEventsCollectorName.
func (c SELEventsCollector) Name() CollectorName {
return SELEventsCollectorName
}
// Cmd returns the command name for this collector, "ipmi-sel".
// NOTE(review): presumably the FreeIPMI binary that is executed with Args to
// produce the output handed to Collect; confirm against the caller.
func (c SELEventsCollector) Cmd() string {
return "ipmi-sel"
}
// Args returns the command-line flags passed along with Cmd.
//
// The output-shaping flags (comma-separated, no header, event state column)
// produce the line format that freeipmi.GetSELEvents parses. A fresh slice is
// returned on every call, so callers may append to it freely.
func (c SELEventsCollector) Args() []string {
	const flagCount = 7

	flags := make([]string, 0, flagCount)
	flags = append(flags,
		"-Q",
		"--comma-separated-output",
		"--no-header-output",
		"--sdr-cache-recreate",
	)
	flags = append(flags,
		"--output-event-state",
		"--interpret-oem-data",
		"--entity-sensor-names",
	)
	return flags
}
// Collect converts the `ipmi-sel` output in result into SEL event metrics and
// sends them on ch.
//
// Emitted metrics:
//   - selEventsCountByStateDesc: number of SEL entries per state, counting
//     every parsed entry.
//   - selEventsCountByNameDesc: for each event in target.config.SELEvents, the
//     number of entries whose event text matches the event's regex.
//   - selEventsLatestTimestampDesc: for each configured event, the Unix
//     timestamp of the most recent matching entry, or 0 if there is none.
//
// Every configured event is always exported (with zeros when nothing
// matched), so the series do not appear and disappear between scrapes.
//
// It returns (1, nil) on success and (0, err) when the SEL output could not
// be obtained or parsed at all. An individual entry whose date/time cannot be
// parsed is still counted but does not contribute to the latest-timestamp
// metric; it no longer fails the whole collection.
func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) {
	// Layout of the date and time columns joined by a single space.
	const selTimeLayout = "Jan-02-2006 15:04:05"

	selEventConfigs := target.config.SELEvents

	events, err := freeipmi.GetSELEvents(result)
	if err != nil {
		level.Error(logger).Log("msg", "Failed to collect SEL events", "target", targetName(target.host), "error", err)
		return 0, err
	}

	countByState := map[string]float64{}
	countByName := make(map[string]float64, len(selEventConfigs))
	latestByName := make(map[string]float64, len(selEventConfigs))

	// Seed the per-name maps so that configured events without any matching
	// entry are still exported with a value of zero.
	for _, cfg := range selEventConfigs {
		if cfg == nil {
			continue
		}
		countByName[cfg.Name] = 0
		latestByName[cfg.Name] = 0
	}

	for _, event := range events {
		// Incrementing a missing key starts from the zero value.
		countByState[event.State]++

		// The timestamp is parsed lazily, and at most once per entry, no
		// matter how many configured events match it.
		var (
			timestamp   float64
			parseTried  bool
			parseFailed bool
		)
		for _, cfg := range selEventConfigs {
			// Skip entries that were never given a compiled pattern instead
			// of dereferencing a nil pointer.
			if cfg == nil || cfg.Regex == nil {
				continue
			}
			if !cfg.Regex.MatchString(event.Event) {
				continue
			}
			countByName[cfg.Name]++

			if !parseTried {
				parseTried = true
				t, err := time.Parse(selTimeLayout, event.Date+" "+event.Time)
				if err != nil {
					// A single odd record must not blank out all SEL event
					// metrics for the target. NOTE(review): presumably this
					// happens for records logged without a real timestamp -
					// verify against actual ipmi-sel output.
					parseFailed = true
					level.Debug(logger).Log("msg", "Ignoring unparsable SEL event timestamp", "target", targetName(target.host), "id", event.ID, "error", err)
				} else {
					timestamp = float64(t.Unix())
				}
			}
			if !parseFailed && timestamp > latestByName[cfg.Name] {
				latestByName[cfg.Name] = timestamp
			}
		}
	}

	for state, value := range countByState {
		ch <- prometheus.MustNewConstMetric(
			selEventsCountByStateDesc,
			prometheus.GaugeValue,
			value,
			state,
		)
	}
	for name, value := range countByName {
		ch <- prometheus.MustNewConstMetric(
			selEventsCountByNameDesc,
			prometheus.GaugeValue,
			value,
			name,
		)
		ch <- prometheus.MustNewConstMetric(
			selEventsLatestTimestampDesc,
			prometheus.GaugeValue,
			latestByName[name],
			name,
		)
	}
	return 1, nil
}

View File

@ -16,6 +16,7 @@ package main
import (
"fmt"
"os"
"regexp"
"strings"
"sync"
@ -80,6 +81,8 @@ func (c CollectorName) GetInstance() (collector, error) {
return BMCWatchdogCollector{}, nil
case SELCollectorName:
return SELCollector{}, nil
case SELEventsCollectorName:
return SELEventsCollector{}, nil
case DCMICollectorName:
return DCMICollector{}, nil
case ChassisCollectorName:
@ -124,10 +127,17 @@ type IPMIConfig struct {
CollectorArgs map[CollectorName][]string `yaml:"default_args"`
CustomArgs map[CollectorName][]string `yaml:"custom_args"`
SELEvents []*IpmiSELEvent `yaml:"sel_events,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// IpmiSELEvent is one entry of the `sel_events` list in a module's
// configuration. It defines a named pattern for SEL entries.
type IpmiSELEvent struct {
// Name is read from the `name` key; the sel-events collector uses it as the
// value of the "name" metric label.
Name string `yaml:"name"`
// RegexRaw is the pattern source as written under the `regex` key.
RegexRaw string `yaml:"regex"`
// Regex is the compiled form of RegexRaw. It is never read from YAML
// (`yaml:"-"`); IPMIConfig.UnmarshalYAML fills it in after parsing.
Regex *regexp.Regexp `yaml:"-"`
}
// defaultConfig enables the ipmi, dcmi, bmc and chassis collectors and leaves
// every other field at its zero value.
// NOTE(review): presumably the baseline applied when a module does not set
// its own collectors - confirm against UnmarshalYAML.
var defaultConfig = IPMIConfig{
Collectors: []CollectorName{IPMICollectorName, DCMICollectorName, BMCCollectorName, ChassisCollectorName},
}
@ -170,6 +180,9 @@ func (s *IPMIConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
return err
}
}
// Compile the user-supplied patterns here so that an invalid regex is
// reported as a configuration error instead of panicking the process.
for i, selEvent := range s.SELEvents {
	if selEvent == nil {
		return fmt.Errorf("sel_events[%d]: empty entry", i)
	}
	compiled, err := regexp.Compile(selEvent.RegexRaw)
	if err != nil {
		return fmt.Errorf("sel_events[%d] (%q): invalid regex: %w", i, selEvent.Name, err)
	}
	selEvent.Regex = compiled
}
return nil
}

View File

@ -19,6 +19,8 @@ These metrics provide data about the scrape itself:
the chassis power state metric (see below) will not be available
- `sel`: collects system event log (SEL) details. If it fails, SEL metrics
(see below) will not be available
- `sel-events`: collects metrics for user-defined events in the system event
log (SEL). If it fails, the SEL entries metrics (see below) will not be available
- `sm-lan-mode`: collects the "LAN mode" setting in the current BMC config.
If it fails, the LAN mode metric (see below) will not be available
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
@ -87,7 +89,6 @@ countdown in seconds.
The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current
countdown in seconds.
## Chassis Power State
This metric is only provided if the `chassis` collector is enabled.
@ -116,6 +117,23 @@ no labels.
The metric `ipmi_sel_free_space_bytes` contains the current number of free
space for new SEL entries, in bytes. This metric has no labels.
## System event log (SEL) entries metrics
These metrics are only provided if the `sel-events` collector is enabled (it
isn't by default).
For each event defined in the `sel_events` field of the configuration file, the
exporter generates metrics with the number of SEL entries whose event text
matches the event's `regex` and the Unix timestamp of the most recent such
entry. Example:
ipmi_sel_events_count_by_name{name="my_custom_event_from_config"} 77
ipmi_sel_events_latest_timestamp{name="my_custom_event_from_config"} 1.703613275e+09
In addition, the following aggregated metrics, counting all SEL entries by state, are exported:
ipmi_sel_events_count_by_state{state="Nominal"} 10
ipmi_sel_events_count_by_state{state="Warning"} 5
## Supermicro LAN mode setting
This metric is only provided if the `sm-lan-mode` collector is enabled (it

View File

@ -14,6 +14,7 @@
package freeipmi
import (
"bufio"
"bytes"
"crypto/rand"
"encoding/csv"
@ -40,6 +41,7 @@ var (
ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)
ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)
ipmiSELEventRegex = regexp.MustCompile(`^(?P<id>[0-9]+),\s*(?P<date>[^,]*),(?P<time>[^,]*),(?P<name>[^,]*),(?P<type>[^,]*),(?P<state>[^,]*),(?P<event>[^,]*)$`)
bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
@ -71,6 +73,17 @@ type SensorData struct {
Event string
}
// SELEventData represents a single log line from the SEL, as parsed by
// GetSELEvents from the comma-separated `ipmi-sel` output. Apart from ID, the
// fields hold the raw text of the corresponding column.
type SELEventData struct {
// ID is the numeric value of the first column.
ID int64
// Date and Time are the unparsed date and time columns.
Date string
Time string
// Name and Type are the columns following the time column.
Name string
Type string
// State is the event state column.
State string
// Event is the last column, the event description.
Event string
}
// EscapePassword escapes a password so that the result is suitable for usage in a
// FreeIPMI config file.
func EscapePassword(password string) string {
@ -417,3 +430,44 @@ func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) {
}
return strconv.ParseFloat(value, 64)
}
// GetSELEvents parses the comma-separated output of `ipmi-sel` into one
// SELEventData per log entry, in output order.
//
// Lines that do not match ipmiSELEventRegex are skipped, as are lines whose
// ID does not fit into an int64. If the command itself failed, its error and
// raw output are returned as an error. An error is also returned when the
// output cannot be scanned to the end, so a truncated event list is never
// reported as complete. On success the returned slice is non-nil, even when
// it is empty.
func GetSELEvents(ipmiOutput Result) ([]SELEventData, error) {
	if ipmiOutput.err != nil {
		return nil, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}

	// Resolve the named capture groups once, rather than rebuilding a
	// name-to-value map for every line.
	group := make(map[string]int)
	for i, name := range ipmiSELEventRegex.SubexpNames() {
		if i != 0 && name != "" {
			group[name] = i
		}
	}

	events := []SELEventData{}
	scanner := bufio.NewScanner(bytes.NewReader(ipmiOutput.output))
	for scanner.Scan() {
		match := ipmiSELEventRegex.FindStringSubmatch(scanner.Text())
		// Ignore lines that are not SEL entries.
		if match == nil {
			continue
		}
		// The regex only admits digits here, so this can fail only when the
		// number is out of range for int64.
		id, err := strconv.ParseInt(match[group["id"]], 10, 64)
		if err != nil {
			continue
		}
		events = append(events, SELEventData{
			ID:    id,
			Date:  match[group["date"]],
			Time:  match[group["time"]],
			Name:  match[group["name"]],
			Type:  match[group["type"]],
			State: match[group["state"]],
			Event: match[group["event"]],
		})
	}
	// Scan returning false can mean failure (e.g. an over-long line), not
	// just end of input.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading ipmi-sel output: %w", err)
	}
	return events, nil
}

View File

@ -3,16 +3,21 @@
# This is an example config for scraping the local host.
# In most cases, this should work without using a config file at all.
modules:
default:
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, and sm-lan-mode
collectors:
- bmc
- ipmi
- dcmi
- chassis
- sel
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
default:
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, sel-events and sm-lan-mode
collectors:
- bmc
- ipmi
- dcmi
- chassis
- sel
- sel-events
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
# Define custom metrics for SEL entries
sel_events:
- name: correctable_memory_error
regex: Correctable memory error.*

View File

@ -6,83 +6,83 @@
# setting the `module` URL parameter.
modules:
default:
# These settings are used if no module is specified, the
# specified module doesn't exist, or of course if
# module=default is specified.
user: "default_user"
pass: "example_pw"
# The below settings correspond to driver-type, privilege-level, and
# session-timeout respectively, see `man 5 freeipmi.conf` (and e.g.
# `man 8 ipmi-sensors` for a list of driver types).
driver: "LAN_2_0"
privilege: "user"
# The session timeout is in milliseconds. Note that a scrape can take up
# to (session-timeout * #-of-collectors) milliseconds, so set the scrape
# timeout in Prometheus accordingly.
# Must be larger than the retransmission timeout, which defaults to 1000.
timeout: 10000
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel,
# and sm-lan-mode
# If _not_ specified, bmc, ipmi, chassis, and dcmi are used
collectors:
- bmc
- ipmi
- chassis
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
- 50
- 52
- 55
dcmi:
# Use these settings when scraped with module=dcmi.
user: "admin_user"
pass: "another_pw"
privilege: "admin"
driver: "LAN_2_0"
collectors:
- dcmi
thatspecialhost:
# Use these settings when scraped with module=thatspecialhost.
user: "some_user"
pass: "secret_pw"
privilege: "admin"
driver: "LAN"
collectors:
- ipmi
- sel
# Need any special workaround flags set? Add them here.
# Workaround flags might be needed to address issues with specific vendor implementations
# e.g. https://www.gnu.org/software/freeipmi/freeipmi-faq.html#Why-is-the-output-from-FreeIPMI-different-than-another-software_003f
# For a full list of flags, refer to:
# https://www.gnu.org/software/freeipmi/manpages/man8/ipmi-sensors.8.html#lbAL
workaround_flags:
- discretereading
# If you require additional command line arguments (e.g. --bridge-sensors for ipmimonitoring),
# you can specify them per collector - BE CAREFUL, you can easily break the exporter with this!
custom_args:
ipmi:
- "--bridge-sensors"
advanced:
# Use these settings when scraped with module=advanced.
user: "some_user"
pass: "secret_pw"
privilege: "admin"
driver: "LAN"
collectors:
- ipmi
- sel
# USING ANY OF THE BELOW VOIDS YOUR WARRANTY! YOU MAY GET BITTEN BY SHARKS!
# You can override the command to be executed for a collector. Paired with
# custom_args, this can be used to e.g. execute the IPMI tools with sudo:
collector_cmd:
ipmi: sudo
sel: sudo
custom_args:
ipmi:
- "ipmimonitoring"
sel:
- "ipmi-sel"
default:
# These settings are used if no module is specified, the
# specified module doesn't exist, or of course if
# module=default is specified.
user: "default_user"
pass: "example_pw"
# The below settings correspond to driver-type, privilege-level, and
# session-timeout respectively, see `man 5 freeipmi.conf` (and e.g.
# `man 8 ipmi-sensors` for a list of driver types).
driver: "LAN_2_0"
privilege: "user"
# The session timeout is in milliseconds. Note that a scrape can take up
# to (session-timeout * #-of-collectors) milliseconds, so set the scrape
# timeout in Prometheus accordingly.
# Must be larger than the retransmission timeout, which defaults to 1000.
timeout: 10000
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel,
# sel-events, and sm-lan-mode
# If _not_ specified, bmc, ipmi, chassis, and dcmi are used
collectors:
- bmc
- ipmi
- chassis
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
- 50
- 52
- 55
dcmi:
# Use these settings when scraped with module=dcmi.
user: "admin_user"
pass: "another_pw"
privilege: "admin"
driver: "LAN_2_0"
collectors:
- dcmi
thatspecialhost:
# Use these settings when scraped with module=thatspecialhost.
user: "some_user"
pass: "secret_pw"
privilege: "admin"
driver: "LAN"
collectors:
- ipmi
- sel
# Need any special workaround flags set? Add them here.
# Workaround flags might be needed to address issues with specific vendor implementations
# e.g. https://www.gnu.org/software/freeipmi/freeipmi-faq.html#Why-is-the-output-from-FreeIPMI-different-than-another-software_003f
# For a full list of flags, refer to:
# https://www.gnu.org/software/freeipmi/manpages/man8/ipmi-sensors.8.html#lbAL
workaround_flags:
- discretereading
# If you require additional command line arguments (e.g. --bridge-sensors for ipmimonitoring),
# you can specify them per collector - BE CAREFUL, you can easily break the exporter with this!
custom_args:
ipmi:
- "--bridge-sensors"
advanced:
# Use these settings when scraped with module=advanced.
user: "some_user"
pass: "secret_pw"
privilege: "admin"
driver: "LAN"
collectors:
- ipmi
- sel
# USING ANY OF THE BELOW VOIDS YOUR WARRANTY! YOU MAY GET BITTEN BY SHARKS!
# You can override the command to be executed for a collector. Paired with
# custom_args, this can be used to e.g. execute the IPMI tools with sudo:
collector_cmd:
ipmi: sudo
sel: sudo
custom_args:
ipmi:
- "ipmimonitoring"
sel:
- "ipmi-sel"