Handle tool-specific failures more gracefully

Instead of failing hard and not returning any metrics at all if just one
(or two) of the three calls to IPMI tools fail, return whatever data was
properly received and add a `collector` label to the `ipmi_up` metric
indicating which tools failed.

This is only a small step towards the concept of "collectors" like they
exist e.g. in the node exporter, but it should help solve #1. Additional
functionality, like disabling certain collectors, can be built on top of
this.

Currently, an error in the `ipmi` collector is always logged as an error.
In the `dcmi` and `bmc` collectors, an error retrieving the data is only
logged as debug output, but an error processing retrieved data is logged
as an error. This should cover most use cases and will be improved upon
once more work is done to make the collectors selectable per scrape.
This commit is contained in:
Conrad Hoffmann 2018-07-30 12:32:54 +02:00
parent a22a7c65c6
commit 9fb5f7296c
2 changed files with 69 additions and 65 deletions

View File

@ -5,8 +5,8 @@ This is an IPMI over LAN exporter for [Prometheus](https://prometheus.io).
An instance running on one host can be used to monitor a large number of IPMI
interfaces by passing the `target` parameter to a scrape. It uses tools from
the [FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring)
suite for the actual IPMI communication.
the [FreeIPMI](https://www.gnu.org/software/freeipmi/) suite for the actual
IPMI communication.
## Installation
@ -27,9 +27,8 @@ Supported parameters include:
- `config.file`: path to the configuration file (default: `ipmi.yml`)
- `path`: path to the FreeIPMI executables (default: rely on `$PATH`)
Make sure you have at least the following tools from the
[FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring) suite
installed:
Make sure you have the following tools from the
[FreeIPMI](https://www.gnu.org/software/freeipmi/) suite installed:
- `ipmimonitoring`
- `ipmi-dcmi`
@ -126,10 +125,17 @@ documentation](https://prometheus.io/docs).
### Scrape meta data
There are two metrics providing data about the scrape itself:
These metrics provide data about the scrape itself:
- `ipmi_up` is `1` if all data could successfully be retrieved from the remote
host, `0` otherwise
- `ipmi_up{collector="<NAME>"}` is `1` if the data for this collector could
successfully be retrieved from the remote host, `0` otherwise. The following
collectors are available:
- `ipmi`: collects IPMI sensor data. If it fails, sensor metrics (see below)
will not be available
- `dcmi`: collects DCMI data, currently only power consumption. If it fails,
power consumption metrics (see below) will not be available
- `bmc`: collects BMC details. If it fails, BMC info metrics (see below)
will not be available
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
data

View File

@ -146,7 +146,7 @@ var (
upDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "up"),
"'1' if a scrape of the IPMI device was successful, '0' otherwise.",
nil,
[]string{"collector"},
nil,
)
@ -348,17 +348,17 @@ func collectGenericSensor(ch chan<- prometheus.Metric, state float64, data senso
)
}
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) error {
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := ipmiMonitoringOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return err
log.Errorf("Failed to collect ipmimonitoring data: %s", err)
return 0, err
}
excludeIds := c.config.ExcludeSensorIDs()
results, err := splitMonitoringOutput(output, excludeIds)
if err != nil {
log.Errorln(err)
return err
log.Errorf("Failed to parse ipmimonitoring data: %s", err)
return 0, err
}
for _, data := range results {
var state float64
@ -394,41 +394,71 @@ func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credenti
collectGenericSensor(ch, state, data)
}
}
return nil
return 1, nil
}
func (c collector) getPowerConsumption(creds Credentials) (float64, error) {
func (c collector) collectDCMI(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := ipmiDCMIOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return float64(-1), err
log.Debugf("Failed to collect ipmi-dcmi data: %s", err)
return 0, err
}
return getCurrentPowerConsumption(output)
currentPowerConsumption, err := getCurrentPowerConsumption(output)
if err != nil {
log.Errorf("Failed to parse ipmi-dcmi data: %s", err)
return 0, err
}
ch <- prometheus.MustNewConstMetric(
powerConsumption,
prometheus.GaugeValue,
currentPowerConsumption,
)
return 0, nil
}
func (c collector) getBmcInfo(creds Credentials) (string, string, error) {
func (c collector) collectBmcInfo(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := bmcInfoOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return "", "", err
log.Debugf("Failed to collect bmc-info data: %s", err)
return 0, err
}
firmwareRevision, err := getBMCInfoFirmwareRevision(output)
if err != nil {
return "", "", err
log.Errorf("Failed to parse bmc-info data: %s", err)
return 0, err
}
manufacturerID, err := getBMCInfoManufacturerID(output)
if err != nil {
return "", "", err
log.Errorf("Failed to parse bmc-info data: %s", err)
return 0, err
}
return firmwareRevision, manufacturerID, nil
ch <- prometheus.MustNewConstMetric(
bmcInfo,
prometheus.GaugeValue,
1,
firmwareRevision, manufacturerID,
)
return 1, nil
}
func (c collector) markAsDown(ch chan<- prometheus.Metric) {
func (c collector) markCollectorsUp(ch chan<- prometheus.Metric, bmc, dcmi, ipmi int) {
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(0),
float64(bmc),
"bmc",
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(dcmi),
"dcmi",
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(ipmi),
"ipmi",
)
}
@ -448,47 +478,15 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
creds, err := c.config.CredentialsForTarget(c.target)
if err != nil {
log.Errorf("No credentials available for target %s.", c.target)
c.markAsDown(ch)
c.markCollectorsUp(ch, 0, 0, 0)
return
}
firmwareRevision, manufacturerID, err := c.getBmcInfo(creds)
if err != nil {
log.Errorf("Could not collect bmc-info metrics: %s", err)
c.markAsDown(ch)
return
}
ipmiUp, _ := c.collectMonitoring(ch, creds)
dcmiUp, _ := c.collectDCMI(ch, creds)
bmcUp, _ := c.collectBmcInfo(ch, creds)
currentPowerConsumption, err := c.getPowerConsumption(creds)
if err != nil {
log.Errorf("Could not collect ipmi-dcmi power metrics: %s", err)
c.markAsDown(ch)
return
}
err = c.collectMonitoring(ch, creds)
if err != nil {
log.Errorf("Could not collect ipmimonitoring sensor metrics: %s", err)
c.markAsDown(ch)
return
}
ch <- prometheus.MustNewConstMetric(
bmcInfo,
prometheus.GaugeValue,
1,
firmwareRevision, manufacturerID,
)
ch <- prometheus.MustNewConstMetric(
powerConsumption,
prometheus.GaugeValue,
currentPowerConsumption,
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
1,
)
c.markCollectorsUp(ch, bmcUp, dcmiUp, ipmiUp)
}
func contains(s []int64, elm int64) bool {