Merge pull request #6 from soundcloud/bitfehler/collectors
Handle tool-specific failures more gracefully
This commit is contained in:
commit
109c7ca99c
22
README.md
22
README.md
|
@ -5,8 +5,8 @@ This is an IPMI over LAN exporter for [Prometheus](https://prometheus.io).
|
|||
|
||||
An instance running on one host can be used to monitor a large number of IPMI
|
||||
interfaces by passing the `target` parameter to a scrape. It uses tools from
|
||||
the [FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring)
|
||||
suite for the actual IPMI communication.
|
||||
the [FreeIPMI](https://www.gnu.org/software/freeipmi/) suite for the actual
|
||||
IPMI communication.
|
||||
|
||||
## Installation
|
||||
|
||||
|
@ -27,9 +27,8 @@ Supported parameters include:
|
|||
- `config.file`: path to the configuration file (default: `ipmi.yml`)
|
||||
- `path`: path to the FreeIPMI executables (default: rely on `$PATH`)
|
||||
|
||||
Make sure you have at least the following tools from the
|
||||
[FreeIPMI](https://www.thomas-krenn.com/en/wiki/FreeIPMI_ipmimonitoring) suite
|
||||
installed:
|
||||
Make sure you have the following tools from the
|
||||
[FreeIPMI](https://www.gnu.org/software/freeipmi/) suite installed:
|
||||
|
||||
- `ipmimonitoring`
|
||||
- `ipmi-dcmi`
|
||||
|
@ -126,10 +125,17 @@ documentation](https://prometheus.io/docs).
|
|||
|
||||
### Scrape meta data
|
||||
|
||||
There are two metrics providing data about the scrape itself:
|
||||
These metrics provide data about the scrape itself:
|
||||
|
||||
- `ipmi_up` is `1` if all data could successfully be retrieved from the remote
|
||||
host, `0` otherwise
|
||||
- `ipmi_up{collector="<NAME>"}` is `1` if the data for this collector could
|
||||
successfully be retrieved from the remote host, `0` otherwise. The following
|
||||
collectors are available:
|
||||
- `ipmi`: collects IPMI sensor data. If it fails, sensor metrics (see below)
|
||||
will not be available
|
||||
- `dcmi`: collects DCMI data, currently only power consumption. If it fails,
|
||||
power consumption metrics (see below) will not be available
|
||||
- `bmc`: collects BMC details. If it fails, BMC info metrics (see below)
|
||||
will not be available
|
||||
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
|
||||
data
|
||||
|
||||
|
|
112
collector.go
112
collector.go
|
@ -146,7 +146,7 @@ var (
|
|||
upDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, "", "up"),
|
||||
"'1' if a scrape of the IPMI device was successful, '0' otherwise.",
|
||||
nil,
|
||||
[]string{"collector"},
|
||||
nil,
|
||||
)
|
||||
|
||||
|
@ -348,17 +348,17 @@ func collectGenericSensor(ch chan<- prometheus.Metric, state float64, data senso
|
|||
)
|
||||
}
|
||||
|
||||
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) error {
|
||||
func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := ipmiMonitoringOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return err
|
||||
log.Errorf("Failed to collect ipmimonitoring data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
excludeIds := c.config.ExcludeSensorIDs()
|
||||
results, err := splitMonitoringOutput(output, excludeIds)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return err
|
||||
log.Errorf("Failed to parse ipmimonitoring data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
for _, data := range results {
|
||||
var state float64
|
||||
|
@ -394,41 +394,71 @@ func (c collector) collectMonitoring(ch chan<- prometheus.Metric, creds Credenti
|
|||
collectGenericSensor(ch, state, data)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return 1, nil
|
||||
}
|
||||
|
||||
func (c collector) getPowerConsumption(creds Credentials) (float64, error) {
|
||||
func (c collector) collectDCMI(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := ipmiDCMIOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return float64(-1), err
|
||||
log.Debugf("Failed to collect ipmi-dcmi data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
return getCurrentPowerConsumption(output)
|
||||
currentPowerConsumption, err := getCurrentPowerConsumption(output)
|
||||
if err != nil {
|
||||
log.Errorf("Failed to parse ipmi-dcmi data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
powerConsumption,
|
||||
prometheus.GaugeValue,
|
||||
currentPowerConsumption,
|
||||
)
|
||||
// Success: report the dcmi collector as up (1), consistent with collectMonitoring and collectBmcInfo.
return 1, nil
|
||||
}
|
||||
|
||||
func (c collector) getBmcInfo(creds Credentials) (string, string, error) {
|
||||
func (c collector) collectBmcInfo(ch chan<- prometheus.Metric, creds Credentials) (int, error) {
|
||||
output, err := bmcInfoOutput(c.target, creds.User, creds.Password)
|
||||
if err != nil {
|
||||
log.Errorln(err)
|
||||
return "", "", err
|
||||
log.Debugf("Failed to collect bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
firmwareRevision, err := getBMCInfoFirmwareRevision(output)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
log.Errorf("Failed to parse bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
manufacturerID, err := getBMCInfoManufacturerID(output)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
log.Errorf("Failed to parse bmc-info data: %s", err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return firmwareRevision, manufacturerID, nil
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
bmcInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
firmwareRevision, manufacturerID,
|
||||
)
|
||||
return 1, nil
|
||||
}
|
||||
|
||||
func (c collector) markAsDown(ch chan<- prometheus.Metric) {
|
||||
func (c collector) markCollectorsUp(ch chan<- prometheus.Metric, bmc, dcmi, ipmi int) {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(0),
|
||||
float64(bmc),
|
||||
"bmc",
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(dcmi),
|
||||
"dcmi",
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(ipmi),
|
||||
"ipmi",
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -448,47 +478,15 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
|
|||
creds, err := c.config.CredentialsForTarget(c.target)
|
||||
if err != nil {
|
||||
log.Errorf("No credentials available for target %s.", c.target)
|
||||
c.markAsDown(ch)
|
||||
c.markCollectorsUp(ch, 0, 0, 0)
|
||||
return
|
||||
}
|
||||
|
||||
firmwareRevision, manufacturerID, err := c.getBmcInfo(creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect bmc-info metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
ipmiUp, _ := c.collectMonitoring(ch, creds)
|
||||
dcmiUp, _ := c.collectDCMI(ch, creds)
|
||||
bmcUp, _ := c.collectBmcInfo(ch, creds)
|
||||
|
||||
currentPowerConsumption, err := c.getPowerConsumption(creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect ipmi-dcmi power metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
|
||||
err = c.collectMonitoring(ch, creds)
|
||||
if err != nil {
|
||||
log.Errorf("Could not collect ipmimonitoring sensor metrics: %s", err)
|
||||
c.markAsDown(ch)
|
||||
return
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
bmcInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
firmwareRevision, manufacturerID,
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
powerConsumption,
|
||||
prometheus.GaugeValue,
|
||||
currentPowerConsumption,
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
upDesc,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
)
|
||||
c.markCollectorsUp(ch, bmcUp, dcmiUp, ipmiUp)
|
||||
}
|
||||
|
||||
func contains(s []int64, elm int64) bool {
|
||||
|
|
Loading…
Reference in New Issue