feat: custom SEL events metrics
Signed-off-by: Aleksandr Aleksandrov <aleks.aleksandrov@corp.mail.ru>
This commit is contained in:
parent
3853e45ee9
commit
45ff1b4947
|
@ -0,0 +1,141 @@
|
|||
// Copyright 2021 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/go-kit/log/level"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"github.com/prometheus-community/ipmi_exporter/freeipmi"
|
||||
)
|
||||
|
||||
const (
|
||||
SELEventsCollectorName CollectorName = "sel-events"
|
||||
)
|
||||
|
||||
var (
|
||||
selEventsCountByStateDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "sel_events", "count_by_state"),
|
||||
"Current number of log entries in the SEL by state.",
|
||||
[]string{"state"},
|
||||
nil,
|
||||
)
|
||||
selEventsCountByNameDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "sel_events", "count_by_name"),
|
||||
"Current number of custom log entries in the SEL by name.",
|
||||
[]string{"name"},
|
||||
nil,
|
||||
)
|
||||
selEventsLatestTimestampDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "sel_events", "latest_timestamp"),
|
||||
"Latest timestamp of custom log entries in the SEL by name.",
|
||||
[]string{"name"},
|
||||
nil,
|
||||
)
|
||||
)
|
||||
|
||||
// SELEventsCollector turns ipmi-sel output into per-state and per-configured-
// event metrics. It carries no state of its own.
type SELEventsCollector struct{}
|
||||
|
||||
func (c SELEventsCollector) Name() CollectorName {
|
||||
return SELEventsCollectorName
|
||||
}
|
||||
|
||||
func (c SELEventsCollector) Cmd() string {
|
||||
return "ipmi-sel"
|
||||
}
|
||||
|
||||
func (c SELEventsCollector) Args() []string {
|
||||
return []string{
|
||||
"-Q",
|
||||
"--comma-separated-output",
|
||||
"--no-header-output",
|
||||
"--sdr-cache-recreate",
|
||||
"--output-event-state",
|
||||
"--interpret-oem-data",
|
||||
"--entity-sensor-names",
|
||||
}
|
||||
}
|
||||
|
||||
func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) {
|
||||
selEventConfigs := target.config.SELEvents
|
||||
|
||||
events, err := freeipmi.GetSELEvents(result)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Failed to collect SEL events", "target", targetName(target.host), "error", err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
selEventByStateCount := map[string]float64{}
|
||||
selEventByNameCount := map[string]float64{}
|
||||
selEventByNameTimestamp := map[string]float64{}
|
||||
|
||||
// initialize sel event metrics by zero
|
||||
for _, metricConfig := range selEventConfigs {
|
||||
selEventByNameTimestamp[metricConfig.Name] = 0
|
||||
selEventByNameCount[metricConfig.Name] = 0
|
||||
}
|
||||
|
||||
for _, data := range events {
|
||||
for _, metricConfig := range selEventConfigs {
|
||||
match := metricConfig.Regex.FindStringSubmatch(data.Event)
|
||||
if match != nil {
|
||||
t, err := time.Parse("Jan-02-2006 15:04:05", data.Date+" "+data.Time)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Failed to collect SEL event metrics", "target", targetName(target.host), "error", err)
|
||||
return 0, err
|
||||
}
|
||||
newTimestamp := float64(t.Unix())
|
||||
// save latest timestamp by name metrics
|
||||
if newTimestamp > selEventByNameTimestamp[metricConfig.Name] {
|
||||
selEventByNameTimestamp[metricConfig.Name] = newTimestamp
|
||||
}
|
||||
// save count by name metrics
|
||||
selEventByNameCount[metricConfig.Name]++
|
||||
}
|
||||
}
|
||||
// save count by state metrics
|
||||
_, ok := selEventByStateCount[data.State]
|
||||
if !ok {
|
||||
selEventByStateCount[data.State] = 0
|
||||
}
|
||||
selEventByStateCount[data.State]++
|
||||
}
|
||||
|
||||
for state, value := range selEventByStateCount {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
selEventsCountByStateDesc,
|
||||
prometheus.GaugeValue,
|
||||
value,
|
||||
state,
|
||||
)
|
||||
}
|
||||
|
||||
for name, value := range selEventByNameCount {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
selEventsCountByNameDesc,
|
||||
prometheus.GaugeValue,
|
||||
value,
|
||||
name,
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
selEventsLatestTimestampDesc,
|
||||
prometheus.GaugeValue,
|
||||
selEventByNameTimestamp[name],
|
||||
name,
|
||||
)
|
||||
}
|
||||
return 1, nil
|
||||
}
|
13
config.go
13
config.go
|
@ -16,6 +16,7 @@ package main
|
|||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
|
@ -80,6 +81,8 @@ func (c CollectorName) GetInstance() (collector, error) {
|
|||
return BMCWatchdogCollector{}, nil
|
||||
case SELCollectorName:
|
||||
return SELCollector{}, nil
|
||||
case SELEventsCollectorName:
|
||||
return SELEventsCollector{}, nil
|
||||
case DCMICollectorName:
|
||||
return DCMICollector{}, nil
|
||||
case ChassisCollectorName:
|
||||
|
@ -124,10 +127,17 @@ type IPMIConfig struct {
|
|||
CollectorArgs map[CollectorName][]string `yaml:"default_args"`
|
||||
CustomArgs map[CollectorName][]string `yaml:"custom_args"`
|
||||
|
||||
SELEvents []*IpmiSELEvent `yaml:"sel_events,omitempty"`
|
||||
// Catches all undefined fields and must be empty after parsing.
|
||||
XXX map[string]interface{} `yaml:",inline"`
|
||||
}
|
||||
|
||||
// IpmiSELEvent is one entry of the `sel_events` configuration list: a named
// pattern that SEL event texts are matched against.
type IpmiSELEvent struct {
	// Name identifies the event; it is read from the YAML key "name".
	Name string `yaml:"name"`
	// RegexRaw is the pattern exactly as written under the YAML key "regex".
	RegexRaw string `yaml:"regex"`
	// Regex is the compiled form of RegexRaw. It is excluded from YAML and is
	// filled in after the configuration has been unmarshalled.
	Regex *regexp.Regexp `yaml:"-"`
}
|
||||
|
||||
var defaultConfig = IPMIConfig{
|
||||
Collectors: []CollectorName{IPMICollectorName, DCMICollectorName, BMCCollectorName, ChassisCollectorName},
|
||||
}
|
||||
|
@ -170,6 +180,9 @@ func (s *IPMIConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|||
return err
|
||||
}
|
||||
}
|
||||
for _, selEvent := range s.SELEvents {
|
||||
selEvent.Regex = regexp.MustCompile(selEvent.RegexRaw)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,8 @@ These metrics provide data about the scrape itself:
|
|||
the chassis power state metric (see below) will not be available
|
||||
- `sel`: collects system event log (SEL) details. If it fails, SEL metrics
|
||||
(see below) will not be available
|
||||
- `sel-events`: collects metrics for user-defined events in system event log
|
||||
(SEL). If it fails, SEL entries metrics (see below) will not be available
|
||||
- `sm-lan-mode`: collects the "LAN mode" setting in the current BMC config.
|
||||
If it fails, the LAN mode metric (see below) will not be available
|
||||
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
|
||||
|
@ -87,7 +89,6 @@ countdown in seconds.
|
|||
The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current
|
||||
countdown in seconds.
|
||||
|
||||
|
||||
## Chassis Power State
|
||||
|
||||
This metric is only provided if the `chassis` collector is enabled.
|
||||
|
@ -116,6 +117,23 @@ no labels.
|
|||
The metric `ipmi_sel_free_space_bytes` contains the current number of free
|
||||
space for new SEL entries, in bytes. This metric has no labels.
|
||||
|
||||
## System event log (SEL) entries metrics
|
||||
|
||||
These metrics are only provided if the `sel-events` collector is enabled (it
|
||||
isn't by default).
|
||||
|
||||
For each event specified in the configuration file (`sel_events` field), metrics
are generated containing the number of such events and the timestamp of their
last occurrence. Example:
|
||||
|
||||
ipmi_sel_events_count_by_name{name="my_custom_event_from_config"} 77
|
||||
ipmi_sel_events_latest_timestamp{name="my_custom_event_from_config"} 1.703613275e+09
|
||||
|
||||
The following aggregated metrics are also exported:
|
||||
|
||||
ipmi_sel_events_count_by_state{state="Nominal"} 10
|
||||
ipmi_sel_events_count_by_state{state="Warning"} 5
|
||||
|
||||
## Supermicro LAN mode setting
|
||||
|
||||
This metric is only provided if the `sm-lan-mode` collector is enabled (it
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
package freeipmi
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"crypto/rand"
|
||||
"encoding/csv"
|
||||
|
@ -40,6 +41,7 @@ var (
|
|||
ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)
|
||||
ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
|
||||
ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)
|
||||
ipmiSELEventRegex = regexp.MustCompile(`^(?P<id>[0-9]+),\s*(?P<date>[^,]*),(?P<time>[^,]*),(?P<name>[^,]*),(?P<type>[^,]*),(?P<state>[^,]*),(?P<event>[^,]*)$`)
|
||||
bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
|
||||
bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
|
||||
bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
|
||||
|
@ -71,6 +73,17 @@ type SensorData struct {
|
|||
Event string
|
||||
}
|
||||
|
||||
// SELEventData represents one log line from the SEL, split into the
// comma-separated columns matched by ipmiSELEventRegex.
type SELEventData struct {
	// ID is the numeric record ID from the first column.
	ID int64
	// Date is the date column, kept as the raw text printed by ipmi-sel.
	Date string
	// Time is the time column, kept as the raw text printed by ipmi-sel.
	Time string
	// Name is the name column.
	Name string
	// Type is the type column.
	Type string
	// State is the event state column.
	State string
	// Event is the event description column.
	Event string
}
|
||||
|
||||
// EscapePassword escapes a password so that the result is suitable for usage in a
|
||||
// FreeIPMI config file.
|
||||
func EscapePassword(password string) string {
|
||||
|
@ -417,3 +430,44 @@ func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) {
|
|||
}
|
||||
return strconv.ParseFloat(value, 64)
|
||||
}
|
||||
|
||||
func GetSELEvents(ipmiOutput Result) ([]SELEventData, error) {
|
||||
if ipmiOutput.err != nil {
|
||||
return nil, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(bytes.NewReader(ipmiOutput.output))
|
||||
events := []SELEventData{}
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
match := ipmiSELEventRegex.FindStringSubmatch(line)
|
||||
// ignore lines which does not matches event regexp
|
||||
if match == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
result := make(map[string]string)
|
||||
for i, name := range ipmiSELEventRegex.SubexpNames() {
|
||||
if i != 0 && name != "" {
|
||||
result[name] = match[i]
|
||||
}
|
||||
}
|
||||
id, err := strconv.ParseInt(result["id"], 10, 64)
|
||||
|
||||
// ignore lines which does not starts with number
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
events = append(events, SELEventData{
|
||||
ID: id,
|
||||
Date: result["date"],
|
||||
Time: result["time"],
|
||||
Name: result["name"],
|
||||
Type: result["type"],
|
||||
State: result["state"],
|
||||
Event: result["event"],
|
||||
})
|
||||
}
|
||||
return events, nil
|
||||
}
|
||||
|
|
|
@ -3,16 +3,21 @@
|
|||
# This is an example config for scraping the local host.
|
||||
# In most cases, this should work without using a config file at all.
|
||||
modules:
|
||||
default:
|
||||
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, and sm-lan-mode
|
||||
collectors:
|
||||
- bmc
|
||||
- ipmi
|
||||
- dcmi
|
||||
- chassis
|
||||
- sel
|
||||
# Got any sensors you don't care about? Add them here.
|
||||
exclude_sensor_ids:
|
||||
- 2
|
||||
- 29
|
||||
- 32
|
||||
default:
|
||||
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, sel-events and sm-lan-mode
|
||||
collectors:
|
||||
- bmc
|
||||
- ipmi
|
||||
- dcmi
|
||||
- chassis
|
||||
- sel
|
||||
- sel-events
|
||||
# Got any sensors you don't care about? Add them here.
|
||||
exclude_sensor_ids:
|
||||
- 2
|
||||
- 29
|
||||
- 32
|
||||
# Define custom metrics for SEL entries
|
||||
sel_events:
|
||||
- name: correctable_memory_error
|
||||
regex: Correctable memory error.*
|
||||
|
|
160
ipmi_remote.yml
160
ipmi_remote.yml
|
@ -6,83 +6,83 @@
|
|||
# setting the `module` URL parameter.
|
||||
|
||||
modules:
|
||||
default:
|
||||
# These settings are used if no module is specified, the
|
||||
# specified module doesn't exist, or of course if
|
||||
# module=default is specified.
|
||||
user: "default_user"
|
||||
pass: "example_pw"
|
||||
# The below settings correspond to driver-type, privilege-level, and
|
||||
# session-timeout respectively, see `man 5 freeipmi.conf` (and e.g.
|
||||
# `man 8 ipmi-sensors` for a list of driver types).
|
||||
driver: "LAN_2_0"
|
||||
privilege: "user"
|
||||
# The session timeout is in milliseconds. Note that a scrape can take up
|
||||
# to (session-timeout * #-of-collectors) milliseconds, so set the scrape
|
||||
# timeout in Prometheus accordingly.
|
||||
# Must be larger than the retransmission timeout, which defaults to 1000.
|
||||
timeout: 10000
|
||||
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel,
|
||||
# and sm-lan-mode
|
||||
# If _not_ specified, bmc, ipmi, chassis, and dcmi are used
|
||||
collectors:
|
||||
- bmc
|
||||
- ipmi
|
||||
- chassis
|
||||
# Got any sensors you don't care about? Add them here.
|
||||
exclude_sensor_ids:
|
||||
- 2
|
||||
- 29
|
||||
- 32
|
||||
- 50
|
||||
- 52
|
||||
- 55
|
||||
dcmi:
|
||||
# Use these settings when scraped with module=dcmi.
|
||||
user: "admin_user"
|
||||
pass: "another_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN_2_0"
|
||||
collectors:
|
||||
- dcmi
|
||||
thatspecialhost:
|
||||
# Use these settings when scraped with module=thatspecialhost.
|
||||
user: "some_user"
|
||||
pass: "secret_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN"
|
||||
collectors:
|
||||
- ipmi
|
||||
- sel
|
||||
# Need any special workaround flags set? Add them here.
|
||||
# Workaround flags might be needed to address issues with specific vendor implementations
|
||||
# e.g. https://www.gnu.org/software/freeipmi/freeipmi-faq.html#Why-is-the-output-from-FreeIPMI-different-than-another-software_003f
|
||||
# For a full list of flags, refer to:
|
||||
# https://www.gnu.org/software/freeipmi/manpages/man8/ipmi-sensors.8.html#lbAL
|
||||
workaround_flags:
|
||||
- discretereading
|
||||
# If you require additional command line arguments (e.g. --bridge-sensors for ipmimonitoring),
|
||||
# you can specify them per collector - BE CAREFUL, you can easily break the exporter with this!
|
||||
custom_args:
|
||||
ipmi:
|
||||
- "--bridge-sensors"
|
||||
advanced:
|
||||
# Use these settings when scraped with module=advanced.
|
||||
user: "some_user"
|
||||
pass: "secret_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN"
|
||||
collectors:
|
||||
- ipmi
|
||||
- sel
|
||||
# USING ANY OF THE BELOW VOIDS YOUR WARRANTY! YOU MAY GET BITTEN BY SHARKS!
|
||||
# You can override the command to be executed for a collector. Paired with
|
||||
# custom_args, this can be used to e.g. execute the IPMI tools with sudo:
|
||||
collector_cmd:
|
||||
ipmi: sudo
|
||||
sel: sudo
|
||||
custom_args:
|
||||
ipmi:
|
||||
- "ipmimonitoring"
|
||||
sel:
|
||||
- "ipmi-sel"
|
||||
default:
|
||||
# These settings are used if no module is specified, the
|
||||
# specified module doesn't exist, or of course if
|
||||
# module=default is specified.
|
||||
user: "default_user"
|
||||
pass: "example_pw"
|
||||
# The below settings correspond to driver-type, privilege-level, and
|
||||
# session-timeout respectively, see `man 5 freeipmi.conf` (and e.g.
|
||||
# `man 8 ipmi-sensors` for a list of driver types).
|
||||
driver: "LAN_2_0"
|
||||
privilege: "user"
|
||||
# The session timeout is in milliseconds. Note that a scrape can take up
|
||||
# to (session-timeout * #-of-collectors) milliseconds, so set the scrape
|
||||
# timeout in Prometheus accordingly.
|
||||
# Must be larger than the retransmission timeout, which defaults to 1000.
|
||||
timeout: 10000
|
||||
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel,
# sel-events, and sm-lan-mode
|
||||
# If _not_ specified, bmc, ipmi, chassis, and dcmi are used
|
||||
collectors:
|
||||
- bmc
|
||||
- ipmi
|
||||
- chassis
|
||||
# Got any sensors you don't care about? Add them here.
|
||||
exclude_sensor_ids:
|
||||
- 2
|
||||
- 29
|
||||
- 32
|
||||
- 50
|
||||
- 52
|
||||
- 55
|
||||
dcmi:
|
||||
# Use these settings when scraped with module=dcmi.
|
||||
user: "admin_user"
|
||||
pass: "another_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN_2_0"
|
||||
collectors:
|
||||
- dcmi
|
||||
thatspecialhost:
|
||||
# Use these settings when scraped with module=thatspecialhost.
|
||||
user: "some_user"
|
||||
pass: "secret_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN"
|
||||
collectors:
|
||||
- ipmi
|
||||
- sel
|
||||
# Need any special workaround flags set? Add them here.
|
||||
# Workaround flags might be needed to address issues with specific vendor implementations
|
||||
# e.g. https://www.gnu.org/software/freeipmi/freeipmi-faq.html#Why-is-the-output-from-FreeIPMI-different-than-another-software_003f
|
||||
# For a full list of flags, refer to:
|
||||
# https://www.gnu.org/software/freeipmi/manpages/man8/ipmi-sensors.8.html#lbAL
|
||||
workaround_flags:
|
||||
- discretereading
|
||||
# If you require additional command line arguments (e.g. --bridge-sensors for ipmimonitoring),
|
||||
# you can specify them per collector - BE CAREFUL, you can easily break the exporter with this!
|
||||
custom_args:
|
||||
ipmi:
|
||||
- "--bridge-sensors"
|
||||
advanced:
|
||||
# Use these settings when scraped with module=advanced.
|
||||
user: "some_user"
|
||||
pass: "secret_pw"
|
||||
privilege: "admin"
|
||||
driver: "LAN"
|
||||
collectors:
|
||||
- ipmi
|
||||
- sel
|
||||
# USING ANY OF THE BELOW VOIDS YOUR WARRANTY! YOU MAY GET BITTEN BY SHARKS!
|
||||
# You can override the command to be executed for a collector. Paired with
|
||||
# custom_args, this can be used to e.g. execute the IPMI tools with sudo:
|
||||
collector_cmd:
|
||||
ipmi: sudo
|
||||
sel: sudo
|
||||
custom_args:
|
||||
ipmi:
|
||||
- "ipmimonitoring"
|
||||
sel:
|
||||
- "ipmi-sel"
|
||||
|
|
Loading…
Reference in New Issue