Make metrics better follow guidelines (#787)

* Improve stat linux metric names.

cpu is no longer used.

* node_cpu -> node_cpu_seconds_total for Linux

* Improve filesystem metric names with units

* Improve units and names of linux disk stats

Remove sector metrics, the bytes metrics cover those already.

* Infiniband counters should end in _total

* Improve timex metric names, convert to more normal units.

See kernel/time/ntp.c (line 909) in the Linux kernel source at commit 3c073991eb
for what stabil means; it looks like a moving average of some form.

* Update test fixture

* For meminfo metrics that had "kB" units, add _bytes

* Interrupts counter should have _total
This commit is contained in:
Brian Brazil 2018-01-17 16:55:55 +00:00 committed by Ben Kochie
parent b4d7ba119a
commit a98067a294
12 changed files with 547 additions and 604 deletions

View File

@ -54,7 +54,7 @@ func init() {
func NewCPUCollector() (Collector, error) {
return &cpuCollector{
cpu: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", cpuCollectorSubsystem),
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "seconds_total"),
"Seconds the cpus spent in each mode.",
[]string{"cpu", "mode"}, nil,
),

View File

@ -30,17 +30,30 @@ import (
)
const (
diskSubsystem = "disk"
diskSectorSize uint64 = 512
diskSubsystem = "disk"
diskSectorSize = 512
)
// ignoredDevices is the --collector.diskstats.ignored-devices flag: a regular
// expression of device names to ignore for diskstats. It is compiled with
// regexp.MustCompile when the diskstats collector is constructed.
//
// The default pattern matches, anchored at both ends:
//   - ram, loop and fd devices followed by digits (e.g. ram0, loop3),
//   - hd/sd/vd/xvd names with one letter followed by digits (e.g. sda1),
//   - NVMe names of the form nvme<N>n<N>p<N> (e.g. nvme0n1p2).
var (
ignoredDevices = kingpin.Flag("collector.diskstats.ignored-devices", "Regexp of devices to ignore for diskstats.").Default("^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$").String()
)
// typedFactorDesc bundles a metric descriptor with the value type it is
// exported as and an optional multiplier applied to raw values.
type typedFactorDesc struct {
// desc is the descriptor passed to prometheus.MustNewConstMetric.
desc *prometheus.Desc
// valueType is the Prometheus value type used when the metric is built.
valueType prometheus.ValueType
// factor is multiplied into the raw value by mustNewConstMetric; the zero
// value means "no scaling", so entries that omit it export the raw value.
factor float64
}
// mustNewConstMetric builds a constant metric for d from value and labels.
// When d.factor is non-zero the value is multiplied by it first; a zero
// factor leaves the value untouched. Construction is delegated to
// prometheus.MustNewConstMetric using d's descriptor and value type.
func (d *typedFactorDesc) mustNewConstMetric(value float64, labels ...string) prometheus.Metric {
	scaled := value
	if d.factor != 0 {
		scaled = value * d.factor
	}
	return prometheus.MustNewConstMetric(d.desc, d.valueType, scaled, labels...)
}
type diskstatsCollector struct {
ignoredDevicesPattern *regexp.Regexp
descs []typedDesc
descs []typedFactorDesc
}
func init() {
@ -54,10 +67,10 @@ func NewDiskstatsCollector() (Collector, error) {
return &diskstatsCollector{
ignoredDevicesPattern: regexp.MustCompile(*ignoredDevices),
// Docs from https://www.kernel.org/doc/Documentation/iostats.txt
descs: []typedDesc{
descs: []typedFactorDesc{
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "reads_completed"),
prometheus.BuildFQName(namespace, diskSubsystem, "reads_completed_total"),
"The total number of reads completed successfully.",
diskLabelNames,
nil,
@ -65,7 +78,7 @@ func NewDiskstatsCollector() (Collector, error) {
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "reads_merged"),
prometheus.BuildFQName(namespace, diskSubsystem, "reads_merged_total"),
"The total number of reads merged. See https://www.kernel.org/doc/Documentation/iostats.txt.",
diskLabelNames,
nil,
@ -73,23 +86,25 @@ func NewDiskstatsCollector() (Collector, error) {
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "sectors_read"),
"The total number of sectors read successfully.",
prometheus.BuildFQName(namespace, diskSubsystem, "read_bytes_total"),
"The total number of bytes read successfully.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: diskSectorSize,
},
{
// The raw diskstats field is a millisecond count (hence the old
// "read_time_ms" name); factor scales it so the exported value, and
// therefore the help text, is in seconds.
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "read_time_ms"),
prometheus.BuildFQName(namespace, diskSubsystem, "read_time_seconds_total"),
"The total number of seconds spent by all reads.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: .001,
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "writes_completed"),
prometheus.BuildFQName(namespace, diskSubsystem, "writes_completed_total"),
"The total number of writes completed successfully.",
diskLabelNames,
nil,
@ -97,7 +112,7 @@ func NewDiskstatsCollector() (Collector, error) {
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "writes_merged"),
prometheus.BuildFQName(namespace, diskSubsystem, "writes_merged_total"),
"The number of writes merged. See https://www.kernel.org/doc/Documentation/iostats.txt.",
diskLabelNames,
nil,
@ -105,19 +120,21 @@ func NewDiskstatsCollector() (Collector, error) {
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "sectors_written"),
"The total number of sectors written successfully.",
prometheus.BuildFQName(namespace, diskSubsystem, "written_bytes_total"),
"The total number of bytes written successfully.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: diskSectorSize,
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "write_time_ms"),
"This is the total number of milliseconds spent by all writes.",
prometheus.BuildFQName(namespace, diskSubsystem, "write_time_seconds_total"),
"This is the total number of seconds spent by all writes.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: .001,
},
{
desc: prometheus.NewDesc(
@ -129,35 +146,21 @@ func NewDiskstatsCollector() (Collector, error) {
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "io_time_ms"),
"Total Milliseconds spent doing I/Os.",
prometheus.BuildFQName(namespace, diskSubsystem, "io_time_seconds_total"),
"Total seconds spent doing I/Os.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: .001,
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "io_time_weighted"),
"The weighted # of milliseconds spent doing I/Os. See https://www.kernel.org/doc/Documentation/iostats.txt.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "bytes_read"),
"The total number of bytes read successfully.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
},
{
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, diskSubsystem, "bytes_written"),
"The total number of bytes written successfully.",
prometheus.BuildFQName(namespace, diskSubsystem, "io_time_weighted_seconds_total"),
"The weighted # of seconds spent doing I/Os. See https://www.kernel.org/doc/Documentation/iostats.txt.",
diskLabelNames,
nil,
), valueType: prometheus.CounterValue,
factor: .001,
},
},
}, nil
@ -201,15 +204,6 @@ func getDiskStats() (map[string]map[int]string, error) {
return parseDiskStats(file)
}
// convertDiskSectorsToBytes parses sectorCount as a base-10 unsigned 64-bit
// integer, multiplies it by diskSectorSize and returns the product formatted
// as a base-10 string. If sectorCount cannot be parsed, it returns the empty
// string together with the strconv error.
func convertDiskSectorsToBytes(sectorCount string) (string, error) {
	n, parseErr := strconv.ParseUint(sectorCount, 10, 64)
	if parseErr != nil {
		return "", parseErr
	}
	byteCount := n * diskSectorSize
	return strconv.FormatUint(byteCount, 10), nil
}
func parseDiskStats(r io.Reader) (map[string]map[int]string, error) {
var (
diskStats = map[string]map[int]string{}
@ -226,17 +220,6 @@ func parseDiskStats(r io.Reader) (map[string]map[int]string, error) {
for i, v := range parts[3:] {
diskStats[dev][i] = v
}
bytesRead, err := convertDiskSectorsToBytes(diskStats[dev][2])
if err != nil {
return nil, fmt.Errorf("invalid value for sectors read in %s: %s", procFilePath("diskstats"), scanner.Text())
}
diskStats[dev][11] = bytesRead
bytesWritten, err := convertDiskSectorsToBytes(diskStats[dev][6])
if err != nil {
return nil, fmt.Errorf("invalid value for sectors written in %s: %s", procFilePath("diskstats"), scanner.Text())
}
diskStats[dev][12] = bytesWritten
}
return diskStats, scanner.Err()

View File

@ -37,12 +37,4 @@ func TestDiskStats(t *testing.T) {
if want, got := "68", diskStats["mmcblk0p2"][10]; want != got {
t.Errorf("want diskstats mmcblk0p2 %s, got %s", want, got)
}
if want, got := "513713216512", diskStats["sda"][11]; want != got {
t.Errorf("want diskstats sda read bytes %s, got %s", want, got)
}
if want, got := "258916880384", diskStats["sda"][12]; want != got {
t.Errorf("want diskstats sda write bytes %s, got %s", want, got)
}
}

View File

@ -72,19 +72,19 @@ func NewFilesystemCollector() (Collector, error) {
filesystemsTypesPattern := regexp.MustCompile(*ignoredFSTypes)
sizeDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "size"),
prometheus.BuildFQName(namespace, subsystem, "size_bytes"),
"Filesystem size in bytes.",
filesystemLabelNames, nil,
)
freeDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "free"),
prometheus.BuildFQName(namespace, subsystem, "free_bytes"),
"Filesystem free space in bytes.",
filesystemLabelNames, nil,
)
availDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "avail"),
prometheus.BuildFQName(namespace, subsystem, "avail_bytes"),
"Filesystem space available to non-root users in bytes.",
filesystemLabelNames, nil,
)

File diff suppressed because it is too large Load Diff

View File

@ -57,8 +57,8 @@ func NewInfiniBandCollector() (Collector, error) {
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
"port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"},
"port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"},
"port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"},
"port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"},
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
}

View File

@ -30,7 +30,7 @@ func init() {
func NewInterruptsCollector() (Collector, error) {
return &interruptsCollector{
desc: typedDesc{prometheus.NewDesc(
namespace+"_interrupts",
namespace+"_interrupts_total",
"Interrupt details.",
interruptLabelNames, nil,
), prometheus.CounterValue},

View File

@ -49,16 +49,17 @@ func parseMemInfo(r io.Reader) (map[string]float64, error) {
if err != nil {
return nil, fmt.Errorf("invalid value in meminfo: %s", err)
}
key := parts[0][:len(parts[0])-1] // remove trailing : from key
// Active(anon) -> Active_anon
key = re.ReplaceAllString(key, "_${1}")
switch len(parts) {
case 2: // no unit
case 3: // has unit, we presume kB
fv *= 1024
key = key + "_bytes"
default:
return nil, fmt.Errorf("invalid line in meminfo: %s", line)
}
key := parts[0][:len(parts[0])-1] // remove trailing : from key
// Active(anon) -> Active_anon
key = re.ReplaceAllString(key, "_${1}")
memInfo[key] = fv
}

View File

@ -30,11 +30,11 @@ func TestMemInfo(t *testing.T) {
t.Fatal(err)
}
if want, got := 3831959552.0, memInfo["MemTotal"]; want != got {
if want, got := 3831959552.0, memInfo["MemTotal_bytes"]; want != got {
t.Errorf("want memory total %f, got %f", want, got)
}
if want, got := 3787456512.0, memInfo["DirectMap2M"]; want != got {
if want, got := 3787456512.0, memInfo["DirectMap2M_bytes"]; want != got {
t.Errorf("want memory directMap2M %f, got %f", want, got)
}
}

View File

@ -47,19 +47,19 @@ func (c *meminfoCollector) getMemInfo() (map[string]float64, error) {
return nil, fmt.Errorf("sysctl CTL_VM VM_UVMEXP failed: %v", err)
}
ps := float64(uvmexp.pagesize)
ps := float64(uvmexp.pagesize)
// see uvm(9)
return map[string]float64{
"active_bytes": ps * float64(uvmexp.active),
"cache_bytes": ps * float64(uvmexp.vnodepages),
"free_bytes": ps * float64(uvmexp.free),
"inactive_bytes": ps * float64(uvmexp.inactive),
"size_bytes": ps * float64(uvmexp.npages),
"swap_size_bytes": ps * float64(uvmexp.swpages),
"swap_used_bytes": ps * float64(uvmexp.swpgonly),
"swapped_in_pages_bytes_total": ps * float64(uvmexp.pgswapin),
"swapped_out_pages_bytes_total": ps * float64(uvmexp.pgswapout),
"wired_bytes": ps * float64(uvmexp.wired),
"active_bytes": ps * float64(uvmexp.active),
"cache_bytes": ps * float64(uvmexp.vnodepages),
"free_bytes": ps * float64(uvmexp.free),
"inactive_bytes": ps * float64(uvmexp.inactive),
"size_bytes": ps * float64(uvmexp.npages),
"swap_size_bytes": ps * float64(uvmexp.swpages),
"swap_used_bytes": ps * float64(uvmexp.swpgonly),
"swapped_in_pages_bytes_total": ps * float64(uvmexp.pgswapin),
"swapped_out_pages_bytes_total": ps * float64(uvmexp.pgswapout),
"wired_bytes": ps * float64(uvmexp.wired),
}, nil
}

View File

@ -24,7 +24,6 @@ import (
)
type statCollector struct {
cpu *prometheus.Desc
intr *prometheus.Desc
ctxt *prometheus.Desc
forks *prometheus.Desc
@ -40,28 +39,23 @@ func init() {
// NewStatCollector returns a new Collector exposing kernel/system statistics.
func NewStatCollector() (Collector, error) {
return &statCollector{
cpu: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "cpu"),
"Seconds the cpus spent in each mode.",
[]string{"cpu", "mode"}, nil,
),
intr: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "intr"),
prometheus.BuildFQName(namespace, "", "intr_total"),
"Total number of interrupts serviced.",
nil, nil,
),
ctxt: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "context_switches"),
prometheus.BuildFQName(namespace, "", "context_switches_total"),
"Total number of context switches.",
nil, nil,
),
forks: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "forks"),
prometheus.BuildFQName(namespace, "", "forks_total"),
"Total number of forks.",
nil, nil,
),
btime: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "boot_time"),
prometheus.BuildFQName(namespace, "", "boot_time_seconds"),
"Node boot time, in unixtime.",
nil, nil,
),

View File

@ -71,7 +71,7 @@ func NewTimexCollector() (Collector, error) {
nil, nil,
), prometheus.GaugeValue},
freq: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "frequency_adjustment"),
prometheus.BuildFQName(namespace, subsystem, "frequency_adjustment_ratio"),
"Local clock frequency adjustment.",
nil, nil,
), prometheus.GaugeValue},
@ -101,7 +101,7 @@ func NewTimexCollector() (Collector, error) {
nil, nil,
), prometheus.GaugeValue},
ppsfreq: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_frequency"),
prometheus.BuildFQName(namespace, subsystem, "pps_frequency_hertz"),
"Pulse per second frequency.",
nil, nil,
), prometheus.GaugeValue},
@ -116,32 +116,32 @@ func NewTimexCollector() (Collector, error) {
nil, nil,
), prometheus.GaugeValue},
stabil: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_stability"),
"Pulse per second stability.",
prometheus.BuildFQName(namespace, subsystem, "pps_stability_hertz"),
"Pulse per second stability, average of recent frequency changes.",
nil, nil,
), prometheus.CounterValue},
), prometheus.GaugeValue},
jitcnt: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_jitter_count"),
prometheus.BuildFQName(namespace, subsystem, "pps_jitter_total"),
"Pulse per second count of jitter limit exceeded events.",
nil, nil,
), prometheus.CounterValue},
calcnt: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_calibration_count"),
prometheus.BuildFQName(namespace, subsystem, "pps_calibration_total"),
"Pulse per second count of calibration intervals.",
nil, nil,
), prometheus.CounterValue},
errcnt: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_error_count"),
prometheus.BuildFQName(namespace, subsystem, "pps_error_total"),
"Pulse per second count of calibration errors.",
nil, nil,
), prometheus.CounterValue},
stbcnt: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "pps_stability_exceeded_count"),
prometheus.BuildFQName(namespace, subsystem, "pps_stability_exceeded_total"),
"Pulse per second count of stability limit exceeded events.",
nil, nil,
), prometheus.GaugeValue},
), prometheus.CounterValue},
tai: typedDesc{prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "tai_offset"),
prometheus.BuildFQName(namespace, subsystem, "tai_offset_seconds"),
"International Atomic Time (TAI) offset.",
nil, nil,
), prometheus.GaugeValue},
@ -173,18 +173,21 @@ func (c *timexCollector) Update(ch chan<- prometheus.Metric) error {
} else {
divisor = microSeconds
}
// See NOTES in adjtimex(2).
const ppm16frac = 1000000.0 * 65536.0
ch <- c.syncStatus.mustNewConstMetric(syncStatus)
ch <- c.offset.mustNewConstMetric(float64(timex.Offset) / divisor)
ch <- c.freq.mustNewConstMetric(float64(timex.Freq))
ch <- c.freq.mustNewConstMetric(1 + float64(timex.Freq)/ppm16frac)
ch <- c.maxerror.mustNewConstMetric(float64(timex.Maxerror) / microSeconds)
ch <- c.esterror.mustNewConstMetric(float64(timex.Esterror) / microSeconds)
ch <- c.status.mustNewConstMetric(float64(timex.Status))
ch <- c.constant.mustNewConstMetric(float64(timex.Constant))
ch <- c.tick.mustNewConstMetric(float64(timex.Tick) / microSeconds)
ch <- c.ppsfreq.mustNewConstMetric(float64(timex.Ppsfreq))
ch <- c.ppsfreq.mustNewConstMetric(float64(timex.Ppsfreq) / ppm16frac)
ch <- c.jitter.mustNewConstMetric(float64(timex.Jitter) / divisor)
ch <- c.shift.mustNewConstMetric(float64(timex.Shift))
ch <- c.stabil.mustNewConstMetric(float64(timex.Stabil))
ch <- c.stabil.mustNewConstMetric(float64(timex.Stabil) / ppm16frac)
ch <- c.jitcnt.mustNewConstMetric(float64(timex.Jitcnt))
ch <- c.calcnt.mustNewConstMetric(float64(timex.Calcnt))
ch <- c.errcnt.mustNewConstMetric(float64(timex.Errcnt))