Merge pull request #6 from soundcloud/bitfehler/collectors

Handle tool-specific failures more gracefully
This commit is contained in:
Conrad Hoffmann 2018-08-02 13:58:17 +02:00 committed by GitHub
commit 109c7ca99c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 69 additions and 65 deletions

View File

@ -5,8 +5,8 @@ This is an IPMI over LAN exporter for [Prometheus](https://prometheus.io).
An instance running on one host can be used to monitor a large number of IPMI
interfaces by passing the `target` parameter to a scrape. It uses tools from
the [FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring)
suite for the actual IPMI communication.
the [FreeIPMI](https://www.gnu.org/software/freeipmi/) suite for the actual
IPMI communication.
## Installation
@ -27,9 +27,8 @@ Supported parameters include:
- `config.file`: path to the configuration file (default: `ipmi.yml`)
- `path`: path to the FreeIPMI executables (default: rely on `$PATH`)
Make sure you have at least the following tools from the
[FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring) suite
installed:
Make sure you have the following tools from the
[FreeIPMI](https://www.gnu.org/software/freeipmi/) suite installed:
- `ipmimonitoring`
- `ipmi-dcmi`
@ -126,10 +125,17 @@ documentation](https://prometheus.io/docs).
### Scrape meta data
There are two metrics providing data about the scrape itself:
These metrics provide data about the scrape itself:
- `ipmi_up` is `1` if all data could successfully be retrieved from the remote
host, `0` otherwise
- `ipmi_up{collector="<NAME>"}` is `1` if the data for this collector could
successfully be retrieved from the remote host, `0` otherwise. The following
collectors are available:
- `ipmi`: collects IPMI sensor data. If it fails, sensor metrics (see below)
will not be available
- `dcmi`: collects DCMI data, currently only power consumption. If it fails,
power consumption metrics (see below) will not be available
- `bmc`: collects BMC details. If it fails, BMC info metrics (see below)
will not be available
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
data

View File

@ -146,7 +146,7 @@ var (
upDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "up"),
"'1' if a scrape of the IPMI device was successful, '0' otherwise.",
nil,
[]string{"collector"},
nil,
)
@ -348,17 +348,17 @@ func collectGenericSensor(ch chan<- prometheus.Metric, state float64, data senso
)
}
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) error {
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := ipmiMonitoringOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return err
log.Errorf("Failed to collect ipmimonitoring data: %s", err)
return 0, err
}
excludeIds := c.config.ExcludeSensorIDs()
results, err := splitMonitoringOutput(output, excludeIds)
if err != nil {
log.Errorln(err)
return err
log.Errorf("Failed to parse ipmimonitoring data: %s", err)
return 0, err
}
for _, data := range results {
var state float64
@ -394,41 +394,71 @@ func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credenti
collectGenericSensor(ch, state, data)
}
}
return nil
return 1, nil
}
func (c collector) getPowerConsumption(creds Credentials) (float64, error) {
func (c collector) collectDCMI(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := ipmiDCMIOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return float64(-1), err
log.Debugf("Failed to collect ipmi-dcmi data: %s", err)
return 0, err
}
return getCurrentPowerConsumption(output)
currentPowerConsumption, err := getCurrentPowerConsumption(output)
if err != nil {
log.Errorf("Failed to parse ipmi-dcmi data: %s", err)
return 0, err
}
ch <- prometheus.MustNewConstMetric(
powerConsumption,
prometheus.GaugeValue,
currentPowerConsumption,
)
return 1, nil
}
func (c collector) getBmcInfo(creds Credentials) (string, string, error) {
func (c collector) collectBmcInfo(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
output, err := bmcInfoOutput(c.target, creds.User, creds.Password)
if err != nil {
log.Errorln(err)
return "", "", err
log.Debugf("Failed to collect bmc-info data: %s", err)
return 0, err
}
firmwareRevision, err := getBMCInfoFirmwareRevision(output)
if err != nil {
return "", "", err
log.Errorf("Failed to parse bmc-info data: %s", err)
return 0, err
}
manufacturerID, err := getBMCInfoManufacturerID(output)
if err != nil {
return "", "", err
log.Errorf("Failed to parse bmc-info data: %s", err)
return 0, err
}
return firmwareRevision, manufacturerID, nil
ch <- prometheus.MustNewConstMetric(
bmcInfo,
prometheus.GaugeValue,
1,
firmwareRevision, manufacturerID,
)
return 1, nil
}
func (c collector) markAsDown(ch chan<- prometheus.Metric) {
func (c collector) markCollectorsUp(ch chan<- prometheus.Metric, bmc, dcmi, ipmi int) {
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(0),
float64(bmc),
"bmc",
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(dcmi),
"dcmi",
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
float64(ipmi),
"ipmi",
)
}
@ -448,47 +478,15 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
creds, err := c.config.CredentialsForTarget(c.target)
if err != nil {
log.Errorf("No credentials available for target %s.", c.target)
c.markAsDown(ch)
c.markCollectorsUp(ch, 0, 0, 0)
return
}
firmwareRevision, manufacturerID, err := c.getBmcInfo(creds)
if err != nil {
log.Errorf("Could not collect bmc-info metrics: %s", err)
c.markAsDown(ch)
return
}
ipmiUp, _ := c.collectMonitoring(ch, creds)
dcmiUp, _ := c.collectDCMI(ch, creds)
bmcUp, _ := c.collectBmcInfo(ch, creds)
currentPowerConsumption, err := c.getPowerConsumption(creds)
if err != nil {
log.Errorf("Could not collect ipmi-dcmi power metrics: %s", err)
c.markAsDown(ch)
return
}
err = c.collectMonitoring(ch, creds)
if err != nil {
log.Errorf("Could not collect ipmimonitoring sensor metrics: %s", err)
c.markAsDown(ch)
return
}
ch <- prometheus.MustNewConstMetric(
bmcInfo,
prometheus.GaugeValue,
1,
firmwareRevision, manufacturerID,
)
ch <- prometheus.MustNewConstMetric(
powerConsumption,
prometheus.GaugeValue,
currentPowerConsumption,
)
ch <- prometheus.MustNewConstMetric(
upDesc,
prometheus.GaugeValue,
1,
)
c.markCollectorsUp(ch, bmcUp, dcmiUp, ipmiUp)
}
func contains(s []int64, elm int64) bool {