Add collector for bmc-watchdog
Some BMCs provide watchdog functionality, i.e. taking a specified action if a timer is not reset within a specified time. The freeipmi tools include a bmc-watchdog command to control this function and to report its current status. This collector reports that information. Signed-off-by: Erdem Agaoglu <erdem.agaoglu@gmail.com>
This commit is contained in:
parent
d4c9372727
commit
5a4e594b68
|
@ -0,0 +1,164 @@
|
||||||
|
// Copyright 2021 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/go-kit/log/level"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
|
||||||
|
"github.com/prometheus-community/ipmi_exporter/freeipmi"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
BMCWatchdogCollectorName CollectorName = "bmc-watchdog"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
bmcWatchdogTimerDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_state"),
|
||||||
|
"Watchdog timer running (1: running, 0: stopped)",
|
||||||
|
[]string{},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
watchdogTimerUses = []string{"BIOS FRB2", "BIOS POST", "OS LOAD", "SMS/OS", "OEM"}
|
||||||
|
bmcWatchdogTimerUseDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_use_state"),
|
||||||
|
"Watchdog timer use (1: active, 0: inactive)",
|
||||||
|
[]string{"name"},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
bmcWatchdogLoggingDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "logging_state"),
|
||||||
|
"Watchdog log flag (1: Enabled, 0: Disabled / note: reverse of freeipmi)",
|
||||||
|
[]string{},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
watchdogTimeoutActions = []string{"None", "Hard Reset", "Power Down", "Power Cycle"}
|
||||||
|
bmcWatchdogTimeoutActionDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "timeout_action_state"),
|
||||||
|
"Watchdog timeout action (1: active, 0: inactive)",
|
||||||
|
[]string{"action"},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
watchdogPretimeoutInterrupts = []string{"None", "SMI", "NMI / Diagnostic Interrupt", "Messaging Interrupt"}
|
||||||
|
bmcWatchdogPretimeoutInterruptDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interrupt_state"),
|
||||||
|
"Watchdog pre-timeout interrupt (1: active, 0: inactive)",
|
||||||
|
[]string{"interrupt"},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
bmcWatchdogPretimeoutIntervalDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interval_seconds"),
|
||||||
|
"Watchdog pre-timeout interval in seconds",
|
||||||
|
[]string{},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
bmcWatchdogInitialCountdownDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "initial_countdown_seconds"),
|
||||||
|
"Watchdog initial countdown in seconds",
|
||||||
|
[]string{},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
bmcWatchdogCurrentCountdownDesc = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "bmc_watchdog", "current_countdown_seconds"),
|
||||||
|
"Watchdog initial countdown in seconds",
|
||||||
|
[]string{},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
// BMCWatchdogCollector is the collector for the bmc-watchdog command. It
// carries no state of its own.
type BMCWatchdogCollector struct{}
|
||||||
|
|
||||||
|
func (c BMCWatchdogCollector) Name() CollectorName {
|
||||||
|
return BMCWatchdogCollectorName
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c BMCWatchdogCollector) Cmd() string {
|
||||||
|
return "bmc-watchdog"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c BMCWatchdogCollector) Args() []string {
|
||||||
|
return []string{"--get"}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c BMCWatchdogCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) {
|
||||||
|
timerState, err := freeipmi.GetBMCWatchdogTimerState(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog timer", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
currentTimerUse, err := freeipmi.GetBMCWatchdogTimerUse(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog timer use", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
loggingState, err := freeipmi.GetBMCWatchdogLoggingState(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog logging", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
currentTimeoutAction, err := freeipmi.GetBMCWatchdogTimeoutAction(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog timeout action", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
currentPretimeoutInterrupt, err := freeipmi.GetBMCWatchdogPretimeoutInterrupt(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog pretimeout interrupt", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
pretimeoutInterval, err := freeipmi.GetBMCWatchdogPretimeoutInterval(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog pretimeout interval", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
initialCountdown, err := freeipmi.GetBMCWatchdogInitialCountdown(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog initial countdown", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
currentCountdown, err := freeipmi.GetBMCWatchdogCurrentCountdown(result)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(logger).Log("msg", "Failed to collect BMC watchdog current countdown", "target", targetName(target.host), "error", err)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerDesc, prometheus.GaugeValue, timerState)
|
||||||
|
for _, timerUse := range watchdogTimerUses {
|
||||||
|
if currentTimerUse == timerUse {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerUseDesc, prometheus.GaugeValue, 1, timerUse)
|
||||||
|
} else {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerUseDesc, prometheus.GaugeValue, 0, timerUse)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogLoggingDesc, prometheus.GaugeValue, loggingState)
|
||||||
|
for _, timeoutAction := range watchdogTimeoutActions {
|
||||||
|
if currentTimeoutAction == timeoutAction {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogTimeoutActionDesc, prometheus.GaugeValue, 1, timeoutAction)
|
||||||
|
} else {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogTimeoutActionDesc, prometheus.GaugeValue, 0, timeoutAction)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, pretimeoutInterrupt := range watchdogPretimeoutInterrupts {
|
||||||
|
if currentPretimeoutInterrupt == pretimeoutInterrupt {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutInterruptDesc, prometheus.GaugeValue, 1, pretimeoutInterrupt)
|
||||||
|
} else {
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutInterruptDesc, prometheus.GaugeValue, 0, pretimeoutInterrupt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutIntervalDesc, prometheus.GaugeValue, pretimeoutInterval)
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogInitialCountdownDesc, prometheus.GaugeValue, initialCountdown)
|
||||||
|
ch <- prometheus.MustNewConstMetric(bmcWatchdogCurrentCountdownDesc, prometheus.GaugeValue, currentCountdown)
|
||||||
|
return 1, nil
|
||||||
|
}
|
|
@ -76,6 +76,8 @@ func (c CollectorName) GetInstance() (collector, error) {
|
||||||
return IPMICollector{}, nil
|
return IPMICollector{}, nil
|
||||||
case BMCCollectorName:
|
case BMCCollectorName:
|
||||||
return BMCCollector{}, nil
|
return BMCCollector{}, nil
|
||||||
|
case BMCWatchdogCollectorName:
|
||||||
|
return BMCWatchdogCollector{}, nil
|
||||||
case SELCollectorName:
|
case SELCollectorName:
|
||||||
return SELCollector{}, nil
|
return SELCollector{}, nil
|
||||||
case DCMICollectorName:
|
case DCMICollectorName:
|
||||||
|
|
|
@ -33,16 +33,24 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
// Patterns for single "<label> : <value>" lines of tool output. Each pattern
// is anchored at the start of a line and exposes the interesting part as the
// named group "value".
var (
	// Power measurement state: Active or Not Available.
	ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`)
	// Current power draw, the number in front of "Watts".
	ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`)

	// Free-text values of the chassis status lines.
	ipmiChassisPowerRegex        = regexp.MustCompile(`^System Power\s*:\s(?P<value>.*)`)
	ipmiChassisDriveFaultRegex   = regexp.MustCompile(`^Drive Fault\s*:\s(?P<value>.*)`)
	ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)

	// Number of log entries and the free space, the number in front of "bytes".
	ipmiSELEntriesRegex   = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
	ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)

	// Firmware revision, system firmware version and manufacturer ID.
	bmcInfoFirmwareRevisionRegex      = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
	bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
	bmcInfoManufacturerIDRegex        = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
)

// Patterns for the watchdog lines. Unlike the patterns above, the colon here
// must follow the label directly.
var (
	// Timer state: Running or Stopped.
	bmcWatchdogTimerStateRegex = regexp.MustCompile(`^Timer:\s*(?P<value>Running|Stopped)`)
	// Timer use, captured as free text.
	bmcWatchdogTimerUseRegex = regexp.MustCompile(`^Timer Use:\s*(?P<value>.*)`)
	// Logging flag: Enabled or Disabled.
	bmcWatchdogTimerLoggingRegex = regexp.MustCompile(`^Logging:\s*(?P<value>Enabled|Disabled)`)
	// Timeout action and pre-timeout interrupt, captured as free text.
	bmcWatchdogTimeoutActionRegex       = regexp.MustCompile(`^Timeout Action:\s*(?P<value>.*)`)
	bmcWatchdogPretimeoutInterruptRegex = regexp.MustCompile(`^Pre-Timeout Interrupt:\s*(?P<value>.*)`)
	// Durations, the number in front of "seconds".
	bmcWatchdogPretimeoutIntervalRegex = regexp.MustCompile(`^Pre-Timeout Interval:\s*(?P<value>[0-9.]*)\s*seconds.*`)
	bmcWatchdogInitialCountdownRegex   = regexp.MustCompile(`^Initial Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`)
	bmcWatchdogCurrentCountdownRegex   = regexp.MustCompile(`^Current Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`)
)
|
||||||
|
|
||||||
// Result represents the outcome of a call to one of the FreeIPMI tools.
|
// Result represents the outcome of a call to one of the FreeIPMI tools.
|
||||||
|
@ -327,3 +335,85 @@ func GetRawOctets(ipmiOutput Result) ([]string, error) {
|
||||||
octets := strings.Split(strOutput[6:], " ")
|
octets := strings.Split(strOutput[6:], " ")
|
||||||
return octets, nil
|
return octets, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogTimerState(ipmiOutput Result) (float64, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
value, err := getValue(ipmiOutput.output, bmcWatchdogTimerStateRegex)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
if value == "Running" {
|
||||||
|
return 1, err
|
||||||
|
}
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogTimerUse(ipmiOutput Result) (string, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
return getValue(ipmiOutput.output, bmcWatchdogTimerUseRegex)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogLoggingState(ipmiOutput Result) (float64, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
value, err := getValue(ipmiOutput.output, bmcWatchdogTimerLoggingRegex)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
if value == "Enabled" {
|
||||||
|
return 1, err
|
||||||
|
}
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogTimeoutAction(ipmiOutput Result) (string, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
return getValue(ipmiOutput.output, bmcWatchdogTimeoutActionRegex)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogPretimeoutInterrupt(ipmiOutput Result) (string, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
return getValue(ipmiOutput.output, bmcWatchdogPretimeoutInterruptRegex)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogPretimeoutInterval(ipmiOutput Result) (float64, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
value, err := getValue(ipmiOutput.output, bmcWatchdogPretimeoutIntervalRegex)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
return strconv.ParseFloat(value, 64)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogInitialCountdown(ipmiOutput Result) (float64, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
value, err := getValue(ipmiOutput.output, bmcWatchdogInitialCountdownRegex)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
return strconv.ParseFloat(value, 64)
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) {
|
||||||
|
if ipmiOutput.err != nil {
|
||||||
|
return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
|
||||||
|
}
|
||||||
|
value, err := getValue(ipmiOutput.output, bmcWatchdogCurrentCountdownRegex)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
return strconv.ParseFloat(value, 64)
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue