Use InfiniBandClass from procfs library (#1396)
Parsing the sysfs files for InfiniBand was added to the procfs library (see https://github.com/prometheus/procfs/pull/164). Therefore use `InfiniBandClass` from the procfs library instead of parsing sysfs itself. If a port counter returns `N/A (no PMA)`, no metric will be returned for it (instead of returning 0 for this metric). Signed-off-by: Benjamin Drung <benjamin.drung@cloud.ionos.com>
This commit is contained in:
parent
f3538e1fc6
commit
27b8c93a5a
|
@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
|
|||
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
|
@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
|||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
|
||||
# TYPE node_infiniband_port_constraint_errors_received_total counter
|
||||
node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port
|
||||
# TYPE node_infiniband_port_constraint_errors_transmitted_total counter
|
||||
node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes_total counter
|
||||
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes_total counter
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested
|
||||
|
@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
|||
node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested
|
||||
# TYPE node_infiniband_port_discards_transmitted_total counter
|
||||
node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5
|
||||
# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port
|
||||
# TYPE node_infiniband_port_errors_received_total counter
|
||||
node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors)
|
||||
# TYPE node_infiniband_port_packets_received_total counter
|
||||
node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09
|
||||
# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors)
|
||||
# TYPE node_infiniband_port_packets_transmitted_total counter
|
||||
node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06
|
||||
# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
|
||||
# TYPE node_infiniband_port_transmit_wait_total counter
|
||||
node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_received_total counter
|
||||
|
|
|
@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
|
|||
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
|
@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
|||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
|
||||
# TYPE node_infiniband_port_constraint_errors_received_total counter
|
||||
node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port
|
||||
# TYPE node_infiniband_port_constraint_errors_transmitted_total counter
|
||||
node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes_total counter
|
||||
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes_total counter
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested
|
||||
|
@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
|||
node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested
|
||||
# TYPE node_infiniband_port_discards_transmitted_total counter
|
||||
node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5
|
||||
# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port
|
||||
# TYPE node_infiniband_port_errors_received_total counter
|
||||
node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0
|
||||
# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors)
|
||||
# TYPE node_infiniband_port_packets_received_total counter
|
||||
node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09
|
||||
# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors)
|
||||
# TYPE node_infiniband_port_packets_transmitted_total counter
|
||||
node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06
|
||||
# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
|
||||
# TYPE node_infiniband_port_transmit_wait_total counter
|
||||
node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_received_total counter
|
||||
|
|
|
@ -112,6 +112,21 @@ Mode: 755
|
|||
Directory: sys/class/infiniband/i40iw0
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/board_id
|
||||
Lines: 1
|
||||
I40IW Board ID
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/fw_ver
|
||||
Lines: 1
|
||||
0.2
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/hca_type
|
||||
Lines: 1
|
||||
I40IW
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/i40iw0/ports
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
@ -206,9 +221,39 @@ Lines: 1
|
|||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/phys_state
|
||||
Lines: 1
|
||||
5: LinkUp
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/rate
|
||||
Lines: 1
|
||||
10 Gb/sec (4X)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/state
|
||||
Lines: 1
|
||||
4: ACTIVE
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/mlx4_0
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/board_id
|
||||
Lines: 1
|
||||
SM_1141000001000
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/fw_ver
|
||||
Lines: 1
|
||||
2.31.5050
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/hca_type
|
||||
Lines: 1
|
||||
MT4099
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/mlx4_0/ports
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
@ -346,6 +391,21 @@ Lines: 1
|
|||
0
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/1/phys_state
|
||||
Lines: 1
|
||||
5: LinkUp
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/1/rate
|
||||
Lines: 1
|
||||
40 Gb/sec (4X QDR)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/1/state
|
||||
Lines: 1
|
||||
4: ACTIVE
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/mlx4_0/ports/2
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
@ -435,6 +495,21 @@ Lines: 1
|
|||
0
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/2/phys_state
|
||||
Lines: 1
|
||||
5: LinkUp
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/2/rate
|
||||
Lines: 1
|
||||
40 Gb/sec (4X QDR)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/mlx4_0/ports/2/state
|
||||
Lines: 1
|
||||
4: ACTIVE
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/net
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2017 The Prometheus Authors
|
||||
// Copyright 2017-2019 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
@ -17,31 +17,16 @@
|
|||
package collector
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/log"
|
||||
)
|
||||
|
||||
const infinibandPath = "class/infiniband"
|
||||
|
||||
var (
|
||||
errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
|
||||
errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
|
||||
"github.com/prometheus/procfs/sysfs"
|
||||
)
|
||||
|
||||
type infinibandCollector struct {
|
||||
metricDescs map[string]*prometheus.Desc
|
||||
counters map[string]infinibandMetric
|
||||
legacyCounters map[string]infinibandMetric
|
||||
}
|
||||
|
||||
type infinibandMetric struct {
|
||||
File string
|
||||
Help string
|
||||
fs sysfs.FS
|
||||
metricDescs map[string]*prometheus.Desc
|
||||
}
|
||||
|
||||
func init() {
|
||||
|
@ -51,55 +36,47 @@ func init() {
|
|||
// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
|
||||
func NewInfiniBandCollector() (Collector, error) {
|
||||
var i infinibandCollector
|
||||
var err error
|
||||
|
||||
// Filenames of all InfiniBand counter metrics including a detailed description.
|
||||
i.counters = map[string]infinibandMetric{
|
||||
"link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"},
|
||||
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
|
||||
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
|
||||
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
|
||||
"port_constraint_errors_received_total": {"port_rcv_constraint_errors", "Number of packets received on the switch physical port that are discarded"},
|
||||
"port_constraint_errors_transmitted_total": {"port_xmit_constraint_errors", "Number of packets not transmitted from the switch physical port"},
|
||||
"port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"},
|
||||
"port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"},
|
||||
"port_discards_received_total": {"port_rcv_discards", "Number of inbound packets discarded by the port because the port is down or congested"},
|
||||
"port_discards_transmitted_total": {"port_xmit_discards", "Number of outbound packets discarded by the port because the port is down or congested"},
|
||||
"port_errors_received_total": {"port_rcv_errors", "Number of packets containing an error that were received on this port"},
|
||||
"port_packets_received_total": {"port_rcv_packets", "Number of packets received on all VLs by this port (including errors)"},
|
||||
"port_packets_transmitted_total": {"port_xmit_packets", "Number of packets transmitted on all VLs from this port (including errors)"},
|
||||
"port_transmit_wait_total": {"port_xmit_wait", "Number of ticks during which the port had data to transmit but no data was sent during the entire tick"},
|
||||
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
|
||||
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
|
||||
i.fs, err = sysfs.NewFS(*sysPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open sysfs: %v", err)
|
||||
}
|
||||
|
||||
// Deprecated counters for some older versions of InfiniBand drivers.
|
||||
i.legacyCounters = map[string]infinibandMetric{
|
||||
"legacy_multicast_packets_received_total": {"port_multicast_rcv_packets", "Number of multicast packets received"},
|
||||
"legacy_multicast_packets_transmitted_total": {"port_multicast_xmit_packets", "Number of multicast packets transmitted"},
|
||||
"legacy_data_received_bytes_total": {"port_rcv_data_64", "Number of data octets received on all links"},
|
||||
"legacy_packets_received_total": {"port_rcv_packets_64", "Number of data packets received on all links"},
|
||||
"legacy_unicast_packets_received_total": {"port_unicast_rcv_packets", "Number of unicast packets received"},
|
||||
"legacy_unicast_packets_transmitted_total": {"port_unicast_xmit_packets", "Number of unicast packets transmitted"},
|
||||
"legacy_data_transmitted_bytes_total": {"port_xmit_data_64", "Number of data octets transmitted on all links"},
|
||||
"legacy_packets_transmitted_total": {"port_xmit_packets_64", "Number of data packets received on all links"},
|
||||
// Detailed description for all metrics.
|
||||
descriptions := map[string]string{
|
||||
"legacy_multicast_packets_received_total": "Number of multicast packets received",
|
||||
"legacy_multicast_packets_transmitted_total": "Number of multicast packets transmitted",
|
||||
"legacy_data_received_bytes_total": "Number of data octets received on all links",
|
||||
"legacy_packets_received_total": "Number of data packets received on all links",
|
||||
"legacy_unicast_packets_received_total": "Number of unicast packets received",
|
||||
"legacy_unicast_packets_transmitted_total": "Number of unicast packets transmitted",
|
||||
"legacy_data_transmitted_bytes_total": "Number of data octets transmitted on all links",
|
||||
"legacy_packets_transmitted_total": "Number of data packets received on all links",
|
||||
"link_downed_total": "Number of times the link failed to recover from an error state and went down",
|
||||
"link_error_recovery_total": "Number of times the link successfully recovered from an error state",
|
||||
"multicast_packets_received_total": "Number of multicast packets received (including errors)",
|
||||
"multicast_packets_transmitted_total": "Number of multicast packets transmitted (including errors)",
|
||||
"port_constraint_errors_received_total": "Number of packets received on the switch physical port that are discarded",
|
||||
"port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port",
|
||||
"port_data_received_bytes_total": "Number of data octets received on all links",
|
||||
"port_data_transmitted_bytes_total": "Number of data octets transmitted on all links",
|
||||
"port_discards_received_total": "Number of inbound packets discarded by the port because the port is down or congested",
|
||||
"port_discards_transmitted_total": "Number of outbound packets discarded by the port because the port is down or congested",
|
||||
"port_errors_received_total": "Number of packets containing an error that were received on this port",
|
||||
"port_packets_received_total": "Number of packets received on all VLs by this port (including errors)",
|
||||
"port_packets_transmitted_total": "Number of packets transmitted on all VLs from this port (including errors)",
|
||||
"port_transmit_wait_total": "Number of ticks during which the port had data to transmit but no data was sent during the entire tick",
|
||||
"unicast_packets_received_total": "Number of unicast packets received (including errors)",
|
||||
"unicast_packets_transmitted_total": "Number of unicast packets transmitted (including errors)",
|
||||
}
|
||||
|
||||
subsystem := "infiniband"
|
||||
i.metricDescs = make(map[string]*prometheus.Desc)
|
||||
|
||||
for metricName, infinibandMetric := range i.counters {
|
||||
for metricName, description := range descriptions {
|
||||
i.metricDescs[metricName] = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, subsystem, metricName),
|
||||
infinibandMetric.Help,
|
||||
[]string{"device", "port"},
|
||||
nil,
|
||||
)
|
||||
}
|
||||
|
||||
for metricName, infinibandMetric := range i.legacyCounters {
|
||||
i.metricDescs[metricName] = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(namespace, subsystem, metricName),
|
||||
infinibandMetric.Help,
|
||||
prometheus.BuildFQName(namespace, "infiniband", metricName),
|
||||
description,
|
||||
[]string{"device", "port"},
|
||||
nil,
|
||||
)
|
||||
|
@ -108,141 +85,50 @@ func NewInfiniBandCollector() (Collector, error) {
|
|||
return &i, nil
|
||||
}
|
||||
|
||||
// infinibandDevices retrieves a list of InfiniBand devices.
|
||||
func infinibandDevices(infinibandPath string) ([]string, error) {
|
||||
devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(devices) < 1 {
|
||||
log.Debugf("Unable to detect InfiniBand devices")
|
||||
err = errInfinibandNoDevicesFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equate to the device names.
|
||||
for i, device := range devices {
|
||||
devices[i] = filepath.Base(device)
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
func (c *infinibandCollector) pushMetric(ch chan<- prometheus.Metric, name string, value uint64, deviceName string, port string, valueType prometheus.ValueType) {
|
||||
ch <- prometheus.MustNewConstMetric(c.metricDescs[name], valueType, float64(value), deviceName, port)
|
||||
}
|
||||
|
||||
// Retrieve a list of ports for the InfiniBand device.
|
||||
func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
||||
ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
func (c *infinibandCollector) pushCounter(ch chan<- prometheus.Metric, name string, value *uint64, deviceName string, port string) {
|
||||
if value != nil {
|
||||
c.pushMetric(ch, name, *value, deviceName, port, prometheus.CounterValue)
|
||||
}
|
||||
|
||||
if len(ports) < 1 {
|
||||
log.Debugf("Unable to detect ports for %s", device)
|
||||
err = errInfinibandNoPortsFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equates to the port numbers.
|
||||
for i, port := range ports {
|
||||
ports[i] = filepath.Base(port)
|
||||
}
|
||||
|
||||
return ports, nil
|
||||
}
|
||||
|
||||
func readMetric(directory, metricFile string) (uint64, error) {
|
||||
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
||||
if err != nil {
|
||||
// Ugly workaround for handling #966, when counters are
|
||||
// `N/A (not available)`.
|
||||
// This was already patched and submitted, see
|
||||
// https://www.spinics.net/lists/linux-rdma/msg68596.html
|
||||
// Remove this as soon as the fix lands in the enterprise distros.
|
||||
if strings.Contains(err.Error(), "N/A (no PMA)") {
|
||||
log.Debugf("%q value is N/A", metricFile)
|
||||
return 0, nil
|
||||
}
|
||||
log.Debugf("Error reading %q file", metricFile)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// According to Mellanox, the following metrics "are divided by 4 unconditionally"
|
||||
// as they represent the amount of data being transmitted and received per lane.
|
||||
// Mellanox cards have 4 lanes per port, so all values must be multiplied by 4
|
||||
// to get the expected value.
|
||||
switch metricFile {
|
||||
case "port_rcv_data", "port_xmit_data", "port_rcv_data_64", "port_xmit_data_64":
|
||||
metric *= 4
|
||||
}
|
||||
|
||||
return metric, nil
|
||||
}
|
||||
|
||||
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
|
||||
devices, err := infinibandDevices(sysFilePath(infinibandPath))
|
||||
|
||||
// If no devices are found or another error is raised while attempting to find devices,
|
||||
// InfiniBand is likely not installed and the collector should be skipped.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoDevicesFound:
|
||||
return nil
|
||||
default:
|
||||
return err
|
||||
devices, err := c.fs.InfiniBandClass()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error obtaining InfiniBand class info: %s", err)
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
|
||||
for _, port := range device.Ports {
|
||||
portStr := strconv.FormatUint(uint64(port.Port), 10)
|
||||
|
||||
// If no ports are found for the specified device, skip to the next device.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoPortsFound:
|
||||
continue
|
||||
default:
|
||||
return err
|
||||
}
|
||||
|
||||
for _, port := range ports {
|
||||
portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
|
||||
|
||||
// Add metrics for the InfiniBand counters.
|
||||
for metricName, infinibandMetric := range c.counters {
|
||||
if _, err := os.Stat(filepath.Join(portFiles, "counters", infinibandMetric.File)); os.IsNotExist(err) {
|
||||
continue
|
||||
}
|
||||
metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.metricDescs[metricName],
|
||||
prometheus.CounterValue,
|
||||
float64(metric),
|
||||
device,
|
||||
port,
|
||||
)
|
||||
}
|
||||
|
||||
// Add metrics for the legacy InfiniBand counters.
|
||||
for metricName, infinibandMetric := range c.legacyCounters {
|
||||
if _, err := os.Stat(filepath.Join(portFiles, "counters_ext", infinibandMetric.File)); os.IsNotExist(err) {
|
||||
continue
|
||||
}
|
||||
metric, err := readMetric(filepath.Join(portFiles, "counters_ext"), infinibandMetric.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.metricDescs[metricName],
|
||||
prometheus.CounterValue,
|
||||
float64(metric),
|
||||
device,
|
||||
port,
|
||||
)
|
||||
}
|
||||
c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_packets_received_total", port.Counters.LegacyPortRcvPackets64, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_unicast_packets_received_total", port.Counters.LegacyPortUnicastRcvPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_unicast_packets_transmitted_total", port.Counters.LegacyPortUnicastXmitPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_data_transmitted_bytes_total", port.Counters.LegacyPortXmitData64, port.Name, portStr)
|
||||
c.pushCounter(ch, "legacy_packets_transmitted_total", port.Counters.LegacyPortXmitPackets64, port.Name, portStr)
|
||||
c.pushCounter(ch, "link_downed_total", port.Counters.LinkDowned, port.Name, portStr)
|
||||
c.pushCounter(ch, "link_error_recovery_total", port.Counters.LinkErrorRecovery, port.Name, portStr)
|
||||
c.pushCounter(ch, "multicast_packets_received_total", port.Counters.MulticastRcvPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "multicast_packets_transmitted_total", port.Counters.MulticastXmitPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_constraint_errors_received_total", port.Counters.PortRcvConstraintErrors, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_constraint_errors_transmitted_total", port.Counters.PortXmitConstraintErrors, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_data_received_bytes_total", port.Counters.PortRcvData, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_data_transmitted_bytes_total", port.Counters.PortXmitData, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_discards_received_total", port.Counters.PortRcvDiscards, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_discards_transmitted_total", port.Counters.PortXmitDiscards, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_errors_received_total", port.Counters.PortRcvErrors, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_packets_received_total", port.Counters.PortRcvPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_packets_transmitted_total", port.Counters.PortXmitPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "port_transmit_wait_total", port.Counters.PortXmitWait, port.Name, portStr)
|
||||
c.pushCounter(ch, "unicast_packets_received_total", port.Counters.UnicastRcvPackets, port.Name, portStr)
|
||||
c.pushCounter(ch, "unicast_packets_transmitted_total", port.Counters.UnicastXmitPackets, port.Name, portStr)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
// Copyright 2017 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInfiniBandDevices(t *testing.T) {
|
||||
devices, err := infinibandDevices("fixtures/sys/class/infiniband")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if l := len(devices); l != 2 {
|
||||
t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInfiniBandPorts(t *testing.T) {
|
||||
ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if l := len(ports); l != 2 {
|
||||
t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue