diff --git a/collector_sel_events.go b/collector_sel_events.go new file mode 100644 index 0000000..4b7c28d --- /dev/null +++ b/collector_sel_events.go @@ -0,0 +1,141 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "time" + + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" + + "github.com/prometheus-community/ipmi_exporter/freeipmi" +) + +const ( +/* SELEventsCollectorName is the CollectorName value (sel-events) that identifies SELEventsCollector. */ + SELEventsCollectorName CollectorName = "sel-events" +) + +var ( +/* Descriptors for the SEL event gauges: entry count labelled by state, and count and latest timestamp labelled by the configured event name. */ + selEventsCountByStateDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sel_events", "count_by_state"), + "Current number of log entries in the SEL by state.", + []string{"state"}, + nil, + ) + selEventsCountByNameDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sel_events", "count_by_name"), + "Current number of custom log entries in the SEL by name.", + []string{"name"}, + nil, + ) + selEventsLatestTimestampDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "sel_events", "latest_timestamp"), + "Latest timestamp of custom log entries in the SEL by name.", + []string{"name"}, + nil, + ) +) + +/* SELEventsCollector turns SEL events into per-state and per-name gauge metrics; the struct has no fields. */ +type SELEventsCollector struct{} + +/* Name returns SELEventsCollectorName. */ +func (c SELEventsCollector) Name() CollectorName { + return SELEventsCollectorName +} + +/* Cmd returns ipmi-sel, the command name associated with this collector. */ +func (c SELEventsCollector) Cmd() string { + return "ipmi-sel" +} + +/* Args returns the ipmi-sel arguments for this collector; the long options ask for comma-separated output without a header and with the event state included. */ +func (c SELEventsCollector) Args() []string { + return []string{ + "-Q", + "--comma-separated-output", + "--no-header-output", 
+ "--sdr-cache-recreate", + "--output-event-state", + "--interpret-oem-data", + "--entity-sensor-names", + } +} + +/* Collect reads the SEL events from result and sends gauges on ch: one entry count per event state and, for every entry of target.config.SELEvents, the number of events whose text matches its regex plus the newest matching timestamp in Unix seconds (0 when nothing matched). It returns 1 and nil on success, or 0 and the error when the events cannot be read or the date/time of a matching event cannot be parsed. */ +func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) { + selEventConfigs := target.config.SELEvents + + events, err := freeipmi.GetSELEvents(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect SEL events", "target", targetName(target.host), "error", err) + return 0, err + } + + selEventByStateCount := map[string]float64{} + selEventByNameCount := map[string]float64{} + selEventByNameTimestamp := map[string]float64{} + + // initialize sel event metrics by zero + for _, metricConfig := range selEventConfigs { + selEventByNameTimestamp[metricConfig.Name] = 0 + selEventByNameCount[metricConfig.Name] = 0 + } + + for _, data := range events { + for _, metricConfig := range selEventConfigs { + match := metricConfig.Regex.FindStringSubmatch(data.Event) + if match != nil { + /* The layout carries no zone, so time.Parse yields a UTC time. NOTE(review): one unparsable date/time returns before anything is sent on ch, so this scrape emits no SEL event metrics at all -- confirm this is intended. */ + t, err := time.Parse("Jan-02-2006 15:04:05", data.Date+" "+data.Time) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect SEL event metrics", "target", targetName(target.host), "error", err) + return 0, err + } + newTimestamp := float64(t.Unix()) + // save latest timestamp by name metrics + if newTimestamp > selEventByNameTimestamp[metricConfig.Name] { + selEventByNameTimestamp[metricConfig.Name] = newTimestamp + } + // save count by name metrics + selEventByNameCount[metricConfig.Name]++ + } + } + // save count by state metrics + _, ok := selEventByStateCount[data.State] + if !ok { + selEventByStateCount[data.State] = 0 + } + selEventByStateCount[data.State]++ + } + + for state, value := range selEventByStateCount { + ch <- prometheus.MustNewConstMetric( + selEventsCountByStateDesc, + prometheus.GaugeValue, + value, + state, + ) + } + + for name, value := range selEventByNameCount { + ch <- prometheus.MustNewConstMetric( + selEventsCountByNameDesc, + prometheus.GaugeValue, + 
value, + name, + ) + ch <- prometheus.MustNewConstMetric( + selEventsLatestTimestampDesc, + prometheus.GaugeValue, + selEventByNameTimestamp[name], + name, + ) + } + return 1, nil +} diff --git a/config.go b/config.go index 494e47e..8994a1c 100644 --- a/config.go +++ b/config.go @@ -16,6 +16,7 @@ package main import ( "fmt" "os" + "regexp" "strings" "sync" @@ -80,6 +81,8 @@ func (c CollectorName) GetInstance() (collector, error) { return BMCWatchdogCollector{}, nil case SELCollectorName: return SELCollector{}, nil + case SELEventsCollectorName: + return SELEventsCollector{}, nil case DCMICollectorName: return DCMICollector{}, nil case ChassisCollectorName: @@ -124,10 +127,17 @@ type IPMIConfig struct { CollectorArgs map[CollectorName][]string `yaml:"default_args"` CustomArgs map[CollectorName][]string `yaml:"custom_args"` + SELEvents []*IpmiSELEvent `yaml:"sel_events,omitempty"` // Catches all undefined fields and must be empty after parsing. XXX map[string]interface{} `yaml:",inline"` } +/* IpmiSELEvent is one entry of the sel_events config list: Name is used as the name label of the per-event metrics, RegexRaw is the pattern as written in YAML, and Regex is its compiled form (excluded from YAML and filled in by UnmarshalYAML). */ +type IpmiSELEvent struct { + Name string `yaml:"name"` + RegexRaw string `yaml:"regex"` + Regex *regexp.Regexp `yaml:"-"` +} + var defaultConfig = IPMIConfig{ Collectors: []CollectorName{IPMICollectorName, DCMICollectorName, BMCCollectorName, ChassisCollectorName}, } @@ -170,6 +180,9 @@ func (s *IPMIConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { return err } } + for _, selEvent := range s.SELEvents { + /* NOTE(review): MustCompile panics on an invalid pattern, so a bad regex value in the config panics here instead of being returned as an error -- consider regexp.Compile. */ + selEvent.Regex = regexp.MustCompile(selEvent.RegexRaw) + } return nil } diff --git a/docs/metrics.md b/docs/metrics.md index 4daacc5..3e524d7 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -19,6 +19,8 @@ These metrics provide data about the scrape itself: the chassis power state metric (see below) will not be available - `sel`: collects system event log (SEL) details. If it fails, SEL metrics (see below) will not be available + - `sel-events`: collects metrics for user-defined events in system event log + (SEL). 
+ If it fails, SEL entries metrics (see below) will not be available - `sm-lan-mode`: collects the "LAN mode" setting in the current BMC config. If it fails, the LAN mode metric (see below) will not be available - `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the @@ -87,7 +89,6 @@ countdown in seconds. The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current countdown in seconds. - ## Chassis Power State This metric is only provided if the `chassis` collector is enabled. @@ -116,6 +117,23 @@ no labels. The metric `ipmi_sel_free_space_bytes` contains the current number of free space for new SEL entries, in bytes. This metric has no labels. +## System event log (SEL) entries metrics + +These metrics are only provided if the `sel-events` collector is enabled (it +isn't by default). + +For each event specified in the configuration file (`sel_events` field), metrics +are generated containing the number of such events and the timestamp of their +last occurrence. 
Example: + + ipmi_sel_events_count_by_name{name="my_custom_event_from_config"} 77 + ipmi_sel_events_latest_timestamp{name="my_custom_event_from_config"} 1.703613275e+09 + +The following aggregated metrics are also exported: + + ipmi_sel_events_count_by_state{state="Nominal"} 10 + ipmi_sel_events_count_by_state{state="Warning"} 5 + ## Supermicro LAN mode setting This metric is only provided if the `sm-lan-mode` collector is enabled (it diff --git a/freeipmi/freeipmi.go b/freeipmi/freeipmi.go index 0b717b8..2570240 100644 --- a/freeipmi/freeipmi.go +++ b/freeipmi/freeipmi.go @@ -14,6 +14,7 @@ package freeipmi import ( + "bufio" "bytes" "crypto/rand" "encoding/csv" @@ -40,6 +41,7 @@ var ( ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P.*)`) ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P[0-9.]*)`) ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P[0-9.]*)\s*bytes.*`) + ipmiSELEventRegex = regexp.MustCompile(`^(?P[0-9]+),\s*(?P[^,]*),(?P