Handle tool-specific failures more gracefully
Instead of failing hard and not returning any metrics at all if just one (or two) of the three calls to IPMI tools fail, return whatever data was properly received and add a `collector` label to the `ipmi_up` metric indicating which tools failed. This is only a small step towards the concept of "collectors" like they exist e.g. in the node exporter, but it should help solve #1. Additional functionality, like disabling certain collectors, can be built on top of this. Currently, an error in the `ipmi` collector is always logged as an error. In the `dcmi` and `bmc` collectors, an error retrieving the data is only logged as debug output, but an error processing retrieved data is logged as an error. This should cover most use cases and will be improved upon once more work is done to make the collectors selectable per scrape.
This commit is contained in:
parent
a22a7c65c6
commit
9fb5f7296c
22
README.md
22
README.md
|
@ -5,8 +5,8 @@ This is an IPMI over LAN exporter for [Prometheus](https://prometheus.io).
|
|||
|
||||
An instance running on one host can be used to monitor a large number of IPMI
|
||||
interfaces by passing the `target` parameter to a scrape. It uses tools from
|
||||
the [FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring)
|
||||
suite for the actual IPMI communication.
|
||||
the [FreeIPMI](https://www.gnu.org/software/freeipmi/) suite for the actual
|
||||
IPMI communication.
|
||||
|
||||
## Installation
|
||||
|
||||
|
@ -27,9 +27,8 @@ Supported parameters include:
|
|||
- `config.file`: path to the configuration file (default: `ipmi.yml`)
|
||||
- `path`: path to the FreeIPMI executables (default: rely on `$PATH`)
|
||||
|
||||
Make sure you have at least the following tools from the
|
||||
[FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring) suite
|
||||
installed:
|
||||
Make sure you have the following tools from the
|
||||
[FreeIPMI](https://www.gnu.org/software/freeipmi/) suite installed:
|
||||
|
||||
- `ipmimonitoring`
|
||||
- `ipmi-dcmi`
|
||||
|
@ -126,10 +125,17 @@ documentation](https://prometheus.io/docs).
|
|||
|
||||
### Scrape meta data
|
||||
|
||||
There are two metrics providing data about the scrape itself:
|
||||
These metrics provide data about the scrape itself:
|
||||
|
||||
- `ipmi_up` is `1` if all data could successfully be retrieved from the remote
|
||||
host, `0` otherwise
|
||||
- `ipmi_up{collector="<NAME>"}` is `1` if the data for this collector could
|
||||
successfully be retrieved from the remote host, `0` otherwise. The following
|
||||
collectors are available:
|
||||
- `ipmi`: collects IPMI sensor data. If it fails, sensor metrics (see below)
|
||||
will not be available
|
||||
- `dcmi`: collects DCMI data, currently only power consumption. If it fails,
|
||||
power consumption metrics (see below) will not be available
|
||||
- `bmc`: collects BMC details. If it fails, BMC info metrics (see below)
|
||||
will not be available
|
||||
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
|
||||
data
|
||||
|
||||
|
|
114
collector.go
114
collector.go
|
@ -146,7 +146,7 @@ var (
|
|||
upDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "up"),
|
||||
"'1' if a scrape of the IPMI device was successful, '0' otherwise.",
|
||||
nil,
|
||||
[]string{"collector"},
|
||||
nil,
|
||||
)
|
||||
|
||||
|
@ -348,17 +348,17 @@ func collectGenericSensor(ch chan<- prometheus.Metric, state float64, data senso
|
|||
)
|
||||
}
|
||||
|
||||
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) error {
|
||||
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := ipmiMonitoringOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return err
|
||||
log.Errorf("Failed to collect ipmimonitoring data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
excludeIds := c.config.ExcludeSensorIDs()
|
||||
results, err := splitMonitoringOutput(output, excludeIds)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return err
|
||||
log.Errorf("Failed to parse ipmimonitoring data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
for _, data := range results {
|
||||
var state float64
|
||||
|
@ -394,41 +394,71 @@ func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credenti
|
|||
collectGenericSensor(ch, state, data)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return 1, nil
|
||||
}
|
||||
|
||||
func (c collector) getPowerConsumption(creds Credentials) (float64, error) {
|
||||
func (c collector) collectDCMI(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := ipmiDCMIOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return float64(-1), err
|
||||
log.Debugf("Failed to collect ipmi-dcmi data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
return getCurrentPowerConsumption(output)
|
||||
currentPowerConsumption, err := getCurrentPowerConsumption(output)
|
||||
if err != nil {
|
||||
log.Errorf("Failed to parse ipmi-dcmi data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
powerConsumption,
|
||||
prometheus.GaugeValue,
|
||||
currentPowerConsumption,
|
||||
)
|
||||
return 1, nil
|
||||
}
|
||||
|
||||
func (c collector) getBmcInfo(creds Credentials) (string, string, error) {
|
||||
func (c collector) collectBmcInfo(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := bmcInfoOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return "", "", err
|
||||
log.Debugf("Failed to collect bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
firmwareRevision, err := getBMCInfoFirmwareRevision(output)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
log.Errorf("Failed to parse bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
manufacturerID, err := getBMCInfoManufacturerID(output)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
log.Errorf("Failed to parse bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
bmcInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
firmwareRevision, manufacturerID,
|
||||
)
|
||||
return 1, nil
|
||||
}
|
||||
|
||||
return firmwareRevision, manufacturerID, nil
|
||||
}
|
||||
|
||||
func (c collector) markAsDown(ch chan<- prometheus.Metric) {
|
||||
func (c collector) markCollectorsUp(ch chan<- prometheus.Metric, bmc, dcmi, ipmi int) {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(0),
|
||||
float64(bmc),
|
||||
"bmc",
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(dcmi),
|
||||
"dcmi",
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(ipmi),
|
||||
"ipmi",
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -448,47 +478,15 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
|
|||
creds, err := c.config.CredentialsForTarget(c.target)
|
||||
if err != nil {
|
||||
log.Errorf("No credentials available for target %s.", c.target)
|
||||
c.markAsDown(ch)
|
||||
c.markCollectorsUp(ch, 0, 0, 0)
|
||||
return
|
||||
}
|
||||
|
||||
firmwareRevision, manufacturerID, err := c.getBmcInfo(creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect bmc-info metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
ipmiUp, _ := c.collectMonitoring(ch, creds)
|
||||
dcmiUp, _ := c.collectDCMI(ch, creds)
|
||||
bmcUp, _ := c.collectBmcInfo(ch, creds)
|
||||
|
||||
currentPowerConsumption, err := c.getPowerConsumption(creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect ipmi-dcmi power metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
|
||||
err = c.collectMonitoring(ch, creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect ipmimonitoring sensor metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
bmcInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
firmwareRevision, manufacturerID,
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
powerConsumption,
|
||||
prometheus.GaugeValue,
|
||||
currentPowerConsumption,
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
)
|
||||
c.markCollectorsUp(ch, bmcUp, dcmiUp, ipmiUp)
|
||||
}
|
||||
|
||||
func contains(s []int64, elm int64) bool {
|
||||
|
|
Loading…
Reference in New Issue