infiniband: Handle iWARP* RDMA modules N/A (#974)
* infiniband: Add not connected i40iw0/ports/1 fixtures
* infiniband: Handle issue when iWARP* RDMA modules are not available

This is related to #966 and handles the following error:

Jun 07 13:33:24 hostname node_exporter[81888]: time="2018-06-07T13:33:24+02:00" level=error msg="ERROR: infiniband collector failed after 0.000929s: strconv.ParseUint: parsing \"N/A (no PMA)\": invalid syntax" source="collector.go:132"

Signed-off-by: Mario Trangoni <mjtrangoni@gmail.com>
This commit is contained in:
parent
0f9842f20a
commit
3659260b66
|
@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
|
|||
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
|
@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
|||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes_total counter
|
||||
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes_total counter
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
|
|
|
@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
|
|||
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
|
@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
|||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes_total counter
|
||||
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
|
||||
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes_total counter
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
|
||||
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
|
|
|
@ -109,6 +109,103 @@ Mode: 644
|
|||
Directory: sys/class/infiniband
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/i40iw0
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/i40iw0/ports
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/i40iw0/ports/1
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/i40iw0/ports/1/counters
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/excessive_buffer_overrun_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/link_downed
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/link_error_recovery
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/local_link_integrity_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_constraint_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_data
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_packets
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_remote_physical_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_switch_relay_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_constraint_errors
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_data
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_discards
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_packets
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_wait
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/symbol_error
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Path: sys/class/infiniband/i40iw0/ports/1/counters/VL15_dropped
|
||||
Lines: 1
|
||||
N/A (no PMA)
|
||||
Mode: 644
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
Directory: sys/class/infiniband/mlx4_0
|
||||
Mode: 755
|
||||
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
|
|
@ -20,6 +20,7 @@ import (
|
|||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/log"
|
||||
|
@ -144,6 +145,15 @@ func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
|||
func readMetric(directory, metricFile string) (uint64, error) {
|
||||
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
||||
if err != nil {
|
||||
// Ugly workaround for handling #966, when counters are
|
||||
// `N/A (no PMA)`, i.e. not available.
|
||||
// This was already patched and submitted, see
|
||||
// https://www.spinics.net/lists/linux-rdma/msg68596.html
|
||||
// Remove this as soon as the fix lands in the enterprise distros.
|
||||
if strings.Contains(err.Error(), "N/A (no PMA)") {
|
||||
log.Debugf("%q value is N/A", metricFile)
|
||||
return 0, nil
|
||||
}
|
||||
log.Debugf("Error reading %q file", metricFile)
|
||||
return 0, err
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ func TestInfiniBandDevices(t *testing.T) {
|
|||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if l := len(devices); l != 1 {
|
||||
if l := len(devices); l != 2 {
|
||||
t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue