mirror of
https://github.com/prometheus/node_exporter
synced 2025-01-10 15:39:30 +00:00
bdc0e7e678
* Collect additional common Infiniband counters Signed-off-by: Patrick Freeman <will.pat.free@gmail.com>
251 lines
9.3 KiB
Go
251 lines
9.3 KiB
Go
// Copyright 2017 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// +build linux
|
|
// +build !noinfiniband
|
|
|
|
package collector
|
|
|
|
import (
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/common/log"
|
|
)
|
|
|
|
// infinibandPath is the InfiniBand class directory. It is handed to
// sysFilePath before being used to look up devices and ports.
const infinibandPath = "class/infiniband"

// Sentinel errors that let Update tell "nothing to collect" apart from a
// genuine failure.
var (
	// errInfinibandNoDevicesFound is returned by infinibandDevices when the
	// class directory contains no entries.
	errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")

	// errInfinibandNoPortsFound is returned by infinibandPorts when a device
	// has no entries under its ports directory.
	errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)
|
|
|
|
type infinibandCollector struct {
|
|
metricDescs map[string]*prometheus.Desc
|
|
counters map[string]infinibandMetric
|
|
legacyCounters map[string]infinibandMetric
|
|
}
|
|
|
|
// infinibandMetric describes a single counter file exposed for a port.
// The field order matters: the collector initialises values positionally.
type infinibandMetric struct {
	File, Help string // File is the counter's file name; Help is the metric help text.
}
|
|
|
|
func init() {
|
|
registerCollector("infiniband", defaultEnabled, NewInfiniBandCollector)
|
|
}
|
|
|
|
// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
|
|
func NewInfiniBandCollector() (Collector, error) {
|
|
var i infinibandCollector
|
|
|
|
// Filenames of all InfiniBand counter metrics including a detailed description.
|
|
i.counters = map[string]infinibandMetric{
|
|
"link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"},
|
|
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
|
|
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
|
|
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
|
|
"port_constraint_errors_received_total": {"port_rcv_constraint_errors", "Number of packets received on the switch physical port that are discarded"},
|
|
"port_constraint_errors_transmitted_total": {"port_xmit_constraint_errors", "Number of packets not transmitted from the switch physical port"},
|
|
"port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"},
|
|
"port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"},
|
|
"port_discards_received_total": {"port_rcv_discards", "Number of inbound packets discarded by the port because the port is down or congested"},
|
|
"port_discards_transmitted_total": {"port_xmit_discards", "Number of outbound packets discarded by the port because the port is down or congested"},
|
|
"port_errors_received_total": {"port_rcv_errors", "Number of packets containing an error that were received on this port"},
|
|
"port_packets_received_total": {"port_rcv_packets", "Number of packets received on all VLs by this port (including errors)"},
|
|
"port_packets_transmitted_total": {"port_xmit_packets", "Number of packets transmitted on all VLs from this port (including errors)"},
|
|
"port_transmit_wait_total": {"port_xmit_wait", "Number of ticks during which the port had data to transmit but no data was sent during the entire tick"},
|
|
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
|
|
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
|
|
}
|
|
|
|
// Deprecated counters for some older versions of InfiniBand drivers.
|
|
i.legacyCounters = map[string]infinibandMetric{
|
|
"legacy_multicast_packets_received_total": {"port_multicast_rcv_packets", "Number of multicast packets received"},
|
|
"legacy_multicast_packets_transmitted_total": {"port_multicast_xmit_packets", "Number of multicast packets transmitted"},
|
|
"legacy_data_received_bytes_total": {"port_rcv_data_64", "Number of data octets received on all links"},
|
|
"legacy_packets_received_total": {"port_rcv_packets_64", "Number of data packets received on all links"},
|
|
"legacy_unicast_packets_received_total": {"port_unicast_rcv_packets", "Number of unicast packets received"},
|
|
"legacy_unicast_packets_transmitted_total": {"port_unicast_xmit_packets", "Number of unicast packets transmitted"},
|
|
"legacy_data_transmitted_bytes_total": {"port_xmit_data_64", "Number of data octets transmitted on all links"},
|
|
"legacy_packets_transmitted_total": {"port_xmit_packets_64", "Number of data packets received on all links"},
|
|
}
|
|
|
|
subsystem := "infiniband"
|
|
i.metricDescs = make(map[string]*prometheus.Desc)
|
|
|
|
for metricName, infinibandMetric := range i.counters {
|
|
i.metricDescs[metricName] = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, metricName),
|
|
infinibandMetric.Help,
|
|
[]string{"device", "port"},
|
|
nil,
|
|
)
|
|
}
|
|
|
|
for metricName, infinibandMetric := range i.legacyCounters {
|
|
i.metricDescs[metricName] = prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, metricName),
|
|
infinibandMetric.Help,
|
|
[]string{"device", "port"},
|
|
nil,
|
|
)
|
|
}
|
|
|
|
return &i, nil
|
|
}
|
|
|
|
// infinibandDevices retrieves a list of InfiniBand devices.
|
|
func infinibandDevices(infinibandPath string) ([]string, error) {
|
|
devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(devices) < 1 {
|
|
log.Debugf("Unable to detect InfiniBand devices")
|
|
err = errInfinibandNoDevicesFound
|
|
return nil, err
|
|
}
|
|
|
|
// Extract just the filenames which equate to the device names.
|
|
for i, device := range devices {
|
|
devices[i] = filepath.Base(device)
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// Retrieve a list of ports for the InfiniBand device.
|
|
func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
|
ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(ports) < 1 {
|
|
log.Debugf("Unable to detect ports for %s", device)
|
|
err = errInfinibandNoPortsFound
|
|
return nil, err
|
|
}
|
|
|
|
// Extract just the filenames which equates to the port numbers.
|
|
for i, port := range ports {
|
|
ports[i] = filepath.Base(port)
|
|
}
|
|
|
|
return ports, nil
|
|
}
|
|
|
|
func readMetric(directory, metricFile string) (uint64, error) {
|
|
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
|
if err != nil {
|
|
// Ugly workaround for handling #966, when counters are
|
|
// `N/A (not available)`.
|
|
// This was already patched and submitted, see
|
|
// https://www.spinics.net/lists/linux-rdma/msg68596.html
|
|
// Remove this as soon as the fix lands in the enterprise distros.
|
|
if strings.Contains(err.Error(), "N/A (no PMA)") {
|
|
log.Debugf("%q value is N/A", metricFile)
|
|
return 0, nil
|
|
}
|
|
log.Debugf("Error reading %q file", metricFile)
|
|
return 0, err
|
|
}
|
|
|
|
// According to Mellanox, the following metrics "are divided by 4 unconditionally"
|
|
// as they represent the amount of data being transmitted and received per lane.
|
|
// Mellanox cards have 4 lanes per port, so all values must be multiplied by 4
|
|
// to get the expected value.
|
|
switch metricFile {
|
|
case "port_rcv_data", "port_xmit_data", "port_rcv_data_64", "port_xmit_data_64":
|
|
metric *= 4
|
|
}
|
|
|
|
return metric, nil
|
|
}
|
|
|
|
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
|
|
devices, err := infinibandDevices(sysFilePath(infinibandPath))
|
|
|
|
// If no devices are found or another error is raised while attempting to find devices,
|
|
// InfiniBand is likely not installed and the collector should be skipped.
|
|
switch err {
|
|
case nil:
|
|
case errInfinibandNoDevicesFound:
|
|
return nil
|
|
default:
|
|
return err
|
|
}
|
|
|
|
for _, device := range devices {
|
|
ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
|
|
|
|
// If no ports are found for the specified device, skip to the next device.
|
|
switch err {
|
|
case nil:
|
|
case errInfinibandNoPortsFound:
|
|
continue
|
|
default:
|
|
return err
|
|
}
|
|
|
|
for _, port := range ports {
|
|
portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
|
|
|
|
// Add metrics for the InfiniBand counters.
|
|
for metricName, infinibandMetric := range c.counters {
|
|
if _, err := os.Stat(filepath.Join(portFiles, "counters", infinibandMetric.File)); os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.metricDescs[metricName],
|
|
prometheus.CounterValue,
|
|
float64(metric),
|
|
device,
|
|
port,
|
|
)
|
|
}
|
|
|
|
// Add metrics for the legacy InfiniBand counters.
|
|
for metricName, infinibandMetric := range c.legacyCounters {
|
|
if _, err := os.Stat(filepath.Join(portFiles, "counters_ext", infinibandMetric.File)); os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
metric, err := readMetric(filepath.Join(portFiles, "counters_ext"), infinibandMetric.File)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.metricDescs[metricName],
|
|
prometheus.CounterValue,
|
|
float64(metric),
|
|
device,
|
|
port,
|
|
)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|