First implementation of physical_disk collector (#803)

* Added physical_disk collector.

Signed-off-by: Rob Scheepens <rob.scheepens@nutanix.com>
Signed-off-by: Brantley West <brantley@nutanix.com>

* exporter.go: Added physical_disk to defaultCollectors

Signed-off-by: Rob Scheepens <rob.scheepens@nutanix.com>
Signed-off-by: Brantley West <brantley@nutanix.com>

* Fix test cases for physicaldisk metrics

Signed-off-by: Rob Scheepens <rob.scheepens@nutanix.com>
Signed-off-by: Brantley West <brantley@nutanix.com>

* physical_disk.go: cleanup gofmt error

Signed-off-by: Brantley West <brantley@nutanix.com>

* physical_disk.go: populate physical disk 'number' label

Signed-off-by: Brantley West <brantley@nutanix.com>

* Added docs/collector.physical_disk.md.

Signed-off-by: Rob Scheepens <rob.scheepens@nutanix.com>

* physical_disk.go: change 'number' label to 'disk' to match node_exporter label

Signed-off-by: Brantley West <brantley@nutanix.com>

* physical_disk.go: adopt github.com/go-kit/log

Signed-off-by: Brantley West <brantley@nutanix.com>

* physical_disk.go: adopt include/exclude disk list

Signed-off-by: Brantley West <brantley@nutanix.com>

* fix: Add init config for physical_disk collector

Signed-off-by: Ben Reedy <breed808@breed808.com>

* chore: gofmt physical_disk collector

Signed-off-by: Ben Reedy <breed808@breed808.com>

---------

Signed-off-by: Rob Scheepens <rob.scheepens@nutanix.com>
Signed-off-by: Brantley West <brantley@nutanix.com>
Signed-off-by: Ben Reedy <breed808@breed808.com>
Co-authored-by: Brantley West <brantley@nutanix.com>
Co-authored-by: Ben Reedy <breed808@breed808.com>
This commit is contained in:
rob-scheepens 2023-09-22 08:45:22 +02:00 committed by GitHub
parent 083537a96a
commit 07fff6afc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 415 additions and 3 deletions

View File

@ -20,7 +20,7 @@ test:
go test -v ./...
bench:
go test -v -bench='benchmark(cpu|logicaldisk|logon|memory|net|process|service|system|tcp|time)collector' ./...
go test -v -bench='benchmark(cpu|logicaldisk|physicaldisk|logon|memory|net|process|service|system|tcp|time)collector' ./...
lint:
golangci-lint -c .golangci.yaml run

View File

@ -288,6 +288,14 @@ var collectors = []collectorInit{
return []string{"Paging File"}
},
},
{
name: "physical_disk",
flags: newPhysicalDiskCollectorFlags,
builder: NewPhysicalDiskCollector,
perfCounterFunc: func(_ log.Logger) []string {
return []string{"PhysicalDisk"}
},
},
{
name: "process",
flags: newProcessCollectorFlags,

298
collector/physical_disk.go Normal file
View File

@ -0,0 +1,298 @@
//go:build windows
// +build windows
package collector
import (
"fmt"
"regexp"
"strings"
"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
)
const (
FlagPhysicalDiskExclude = "collector.physical_disk.disk-exclude"
FlagPhysicalDiskInclude = "collector.physical_disk.disk-include"
)
var (
diskInclude *string
diskExclude *string
diskIncludeSet bool
diskExcludeSet bool
)
// A PhysicalDiskCollector is a Prometheus collector for perflib PhysicalDisk metrics
type PhysicalDiskCollector struct {
logger log.Logger
RequestsQueued *prometheus.Desc
ReadBytesTotal *prometheus.Desc
ReadsTotal *prometheus.Desc
WriteBytesTotal *prometheus.Desc
WritesTotal *prometheus.Desc
ReadTime *prometheus.Desc
WriteTime *prometheus.Desc
IdleTime *prometheus.Desc
SplitIOs *prometheus.Desc
ReadLatency *prometheus.Desc
WriteLatency *prometheus.Desc
ReadWriteLatency *prometheus.Desc
diskIncludePattern *regexp.Regexp
diskExcludePattern *regexp.Regexp
}
// newPhysicalDiskCollectorFlags ...
func newPhysicalDiskCollectorFlags(app *kingpin.Application) {
diskInclude = app.Flag(
FlagPhysicalDiskInclude,
"Regexp of disks to include. Disk number must both match include and not match exclude to be included.",
).Default(".+").PreAction(func(c *kingpin.ParseContext) error {
diskIncludeSet = true
return nil
}).String()
diskExclude = app.Flag(
FlagPhysicalDiskExclude,
"Regexp of disks to exclude. Disk number must both match include and not match exclude to be included.",
).Default("").PreAction(func(c *kingpin.ParseContext) error {
diskExcludeSet = true
return nil
}).String()
}
// NewPhysicalDiskCollector ...
func NewPhysicalDiskCollector(logger log.Logger) (Collector, error) {
const subsystem = "physical_disk"
logger = log.With(logger, "collector", subsystem)
return &PhysicalDiskCollector{
logger: logger,
RequestsQueued: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "requests_queued"),
"The number of requests queued to the disk (PhysicalDisk.CurrentDiskQueueLength)",
[]string{"disk"},
nil,
),
ReadBytesTotal: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "read_bytes_total"),
"The number of bytes transferred from the disk during read operations (PhysicalDisk.DiskReadBytesPerSec)",
[]string{"disk"},
nil,
),
ReadsTotal: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "reads_total"),
"The number of read operations on the disk (PhysicalDisk.DiskReadsPerSec)",
[]string{"disk"},
nil,
),
WriteBytesTotal: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "write_bytes_total"),
"The number of bytes transferred to the disk during write operations (PhysicalDisk.DiskWriteBytesPerSec)",
[]string{"disk"},
nil,
),
WritesTotal: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "writes_total"),
"The number of write operations on the disk (PhysicalDisk.DiskWritesPerSec)",
[]string{"disk"},
nil,
),
ReadTime: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "read_seconds_total"),
"Seconds that the disk was busy servicing read requests (PhysicalDisk.PercentDiskReadTime)",
[]string{"disk"},
nil,
),
WriteTime: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "write_seconds_total"),
"Seconds that the disk was busy servicing write requests (PhysicalDisk.PercentDiskWriteTime)",
[]string{"disk"},
nil,
),
IdleTime: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "idle_seconds_total"),
"Seconds that the disk was idle (PhysicalDisk.PercentIdleTime)",
[]string{"disk"},
nil,
),
SplitIOs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "split_ios_total"),
"The number of I/Os to the disk were split into multiple I/Os (PhysicalDisk.SplitIOPerSec)",
[]string{"disk"},
nil,
),
ReadLatency: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "read_latency_seconds_total"),
"Shows the average time, in seconds, of a read operation from the disk (PhysicalDisk.AvgDiskSecPerRead)",
[]string{"disk"},
nil,
),
WriteLatency: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "write_latency_seconds_total"),
"Shows the average time, in seconds, of a write operation to the disk (PhysicalDisk.AvgDiskSecPerWrite)",
[]string{"disk"},
nil,
),
ReadWriteLatency: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "read_write_latency_seconds_total"),
"Shows the time, in seconds, of the average disk transfer (PhysicalDisk.AvgDiskSecPerTransfer)",
[]string{"disk"},
nil,
),
diskIncludePattern: regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *diskInclude)),
diskExcludePattern: regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *diskExclude)),
}, nil
}
// Collect sends the metric values for each metric
// to the provided prometheus Metric channel.
func (c *PhysicalDiskCollector) Collect(ctx *ScrapeContext, ch chan<- prometheus.Metric) error {
if desc, err := c.collect(ctx, ch); err != nil {
_ = level.Error(c.logger).Log("failed collecting physical_disk metrics", "desc", desc, "err", err)
return err
}
return nil
}
// Win32_PerfRawData_PerfDisk_PhysicalDisk docs:
// - https://docs.microsoft.com/en-us/previous-versions/aa394308(v=vs.85) - Win32_PerfRawData_PerfDisk_PhysicalDisk class
type PhysicalDisk struct {
Name string
CurrentDiskQueueLength float64 `perflib:"Current Disk Queue Length"`
DiskReadBytesPerSec float64 `perflib:"Disk Read Bytes/sec"`
DiskReadsPerSec float64 `perflib:"Disk Reads/sec"`
DiskWriteBytesPerSec float64 `perflib:"Disk Write Bytes/sec"`
DiskWritesPerSec float64 `perflib:"Disk Writes/sec"`
PercentDiskReadTime float64 `perflib:"% Disk Read Time"`
PercentDiskWriteTime float64 `perflib:"% Disk Write Time"`
PercentIdleTime float64 `perflib:"% Idle Time"`
SplitIOPerSec float64 `perflib:"Split IO/Sec"`
AvgDiskSecPerRead float64 `perflib:"Avg. Disk sec/Read"`
AvgDiskSecPerWrite float64 `perflib:"Avg. Disk sec/Write"`
AvgDiskSecPerTransfer float64 `perflib:"Avg. Disk sec/Transfer"`
}
func (c *PhysicalDiskCollector) collect(ctx *ScrapeContext, ch chan<- prometheus.Metric) (*prometheus.Desc, error) {
var dst []PhysicalDisk
if err := unmarshalObject(ctx.perfObjects["PhysicalDisk"], &dst, c.logger); err != nil {
return nil, err
}
for _, disk := range dst {
if disk.Name == "_Total" ||
c.diskExcludePattern.MatchString(disk.Name) ||
!c.diskIncludePattern.MatchString(disk.Name) {
continue
}
// Parse physical disk number from disk.Name. Mountpoint information is
// sometimes included, e.g. "1 C:".
disk_number, _, _ := strings.Cut(disk.Name, " ")
ch <- prometheus.MustNewConstMetric(
c.RequestsQueued,
prometheus.GaugeValue,
disk.CurrentDiskQueueLength,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.ReadBytesTotal,
prometheus.CounterValue,
disk.DiskReadBytesPerSec,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.ReadsTotal,
prometheus.CounterValue,
disk.DiskReadsPerSec,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.WriteBytesTotal,
prometheus.CounterValue,
disk.DiskWriteBytesPerSec,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.WritesTotal,
prometheus.CounterValue,
disk.DiskWritesPerSec,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.ReadTime,
prometheus.CounterValue,
disk.PercentDiskReadTime,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.WriteTime,
prometheus.CounterValue,
disk.PercentDiskWriteTime,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.IdleTime,
prometheus.CounterValue,
disk.PercentIdleTime,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.SplitIOs,
prometheus.CounterValue,
disk.SplitIOPerSec,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.ReadLatency,
prometheus.CounterValue,
disk.AvgDiskSecPerRead*ticksToSecondsScaleFactor,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.WriteLatency,
prometheus.CounterValue,
disk.AvgDiskSecPerWrite*ticksToSecondsScaleFactor,
disk_number,
)
ch <- prometheus.MustNewConstMetric(
c.ReadWriteLatency,
prometheus.CounterValue,
disk.AvgDiskSecPerTransfer*ticksToSecondsScaleFactor,
disk_number,
)
}
return nil, nil
}

View File

@ -0,0 +1,80 @@
# physical_disk collector
The physical_disk collector exposes metrics about physical disks
|||
-|-
Metric name prefix | `physical_disk`
Data source | Perflib
Counters | `physicalDisk` ([`Win32_PerfRawData_PerfDisk_physicalDisk`](https://msdn.microsoft.com/en-us/windows/hardware/aa394307(v=vs.71)))
Enabled by default? | Yes
## Flags
### `--collector.physical_disk.disk-include`
If given, a disk needs to match the include regexp in order for the corresponding disk metrics to be reported
### `--collector.physical_disk.disk-exclude`
If given, a disk needs to *not* match the exclude regexp in order for the corresponding disk metrics to be reported
## Metrics
Name | Description | Type | Labels
-----|-------------|------|-------
`requests_queued` | Number of requests outstanding on the disk at the time the performance data is collected | gauge | `disk`
`read_bytes_total` | Rate at which bytes are transferred from the disk during read operations | counter | `disk`
`reads_total` | Rate of read operations on the disk | counter | `disk`
`write_bytes_total` | Rate at which bytes are transferred to the disk during write operations | counter | `disk`
`writes_total` | Rate of write operations on the disk | counter | `disk`
`read_seconds_total` | Seconds the disk was busy servicing read requests | counter | `disk`
`write_seconds_total` | Seconds the disk was busy servicing write requests | counter | `disk`
`free_bytes` | Unused space of the disk in bytes (not real time, updates every 10-15 min) | gauge | `disk`
`size_bytes` | Total size of the disk in bytes (not real time, updates every 10-15 min) | gauge | `disk`
`idle_seconds_total` | Seconds the disk was idle (not servicing read/write requests) | counter | `disk`
`split_ios_total` | Number of I/Os to the disk split into multiple I/Os | counter | `disk`
### Warning about size metrics
The `free_bytes` and `size_bytes` metrics are not updated in real time and might have a delay of 10-15min.
This is the same behavior as the windows performance counters.
### Example metric
Query the rate of write operations to a disk
```
rate(windows_physical_disk_read_bytes_total{instance="localhost", disk=~"0"}[2m])
```
## Useful queries
Calculate rate of total IOPS for disk
```
rate(windows_physical_disk_reads_total{instance="localhost", disk=~"0"}[2m]) + rate(windows_physical_disk_writes_total{instance="localhost", disk=~"0"}[2m])
```
## Alerting examples
**prometheus.rules**
```yaml
groups:
- name: Windows Disk Alerts
rules:
# Sends an alert when disk space usage is above 95%
- alert: DiskSpaceUsage
expr: 100.0 - 100 * (windows_physical_disk_free_bytes / windows_physical_disk_size_bytes) > 95
for: 10m
labels:
severity: high
annotations:
summary: "Disk Space Usage (instance {{ $labels.instance }})"
description: "Disk Space on Drive is used more than 95%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Alerts on disks with over 85% space usage predicted to fill within the next four days
- alert: DiskFilling
expr: 100 * (windows_physical_disk_free_bytes / windows_physical_disk_size_bytes) < 15 and predict_linear(windows_physical_disk_free_bytes[6h], 4 * 24 * 3600) < 0
for: 10m
labels:
severity: warning
annotations:
summary: "Disk full in four days (instance {{ $labels.instance }})"
description: "{{ $labels.disk }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
```

View File

@ -49,7 +49,7 @@ type prometheusVersion struct {
}
const (
defaultCollectors = "cpu,cs,logical_disk,net,os,service,system,textfile"
defaultCollectors = "cpu,cs,logical_disk,physical_disk,net,os,service,system,textfile"
defaultCollectorsPlaceholder = "[defaults]"
)

View File

@ -40,6 +40,7 @@ test_alpha_total 42
windows_exporter_collector_success{collector="cpu"} 1
windows_exporter_collector_success{collector="cs"} 1
windows_exporter_collector_success{collector="logical_disk"} 1
windows_exporter_collector_success{collector="physical_disk"} 1
windows_exporter_collector_success{collector="net"} 1
windows_exporter_collector_success{collector="os"} 1
windows_exporter_collector_success{collector="service"} 1
@ -50,6 +51,7 @@ windows_exporter_collector_success{collector="textfile"} 1
windows_exporter_collector_timeout{collector="cpu"} 0
windows_exporter_collector_timeout{collector="cs"} 0
windows_exporter_collector_timeout{collector="logical_disk"} 0
windows_exporter_collector_timeout{collector="physical_disk"} 0
windows_exporter_collector_timeout{collector="net"} 0
windows_exporter_collector_timeout{collector="os"} 0
windows_exporter_collector_timeout{collector="service"} 0
@ -89,6 +91,30 @@ windows_exporter_collector_timeout{collector="textfile"} 0
# TYPE windows_logical_disk_write_seconds_total counter
# HELP windows_logical_disk_writes_total The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)
# TYPE windows_logical_disk_writes_total counter
# HELP windows_physical_disk_idle_seconds_total Seconds that the disk was idle (PhysicalDisk.PercentIdleTime)
# TYPE windows_physical_disk_idle_seconds_total counter
# HELP windows_physical_disk_read_bytes_total The number of bytes transferred from the disk during read operations (PhysicalDisk.DiskReadBytesPerSec)
# TYPE windows_physical_disk_read_bytes_total counter
# HELP windows_physical_disk_read_latency_seconds_total Shows the average time, in seconds, of a read operation from the disk (PhysicalDisk.AvgDiskSecPerRead)
# TYPE windows_physical_disk_read_latency_seconds_total counter
# HELP windows_physical_disk_read_seconds_total Seconds that the disk was busy servicing read requests (PhysicalDisk.PercentDiskReadTime)
# TYPE windows_physical_disk_read_seconds_total counter
# HELP windows_physical_disk_read_write_latency_seconds_total Shows the time, in seconds, of the average disk transfer (PhysicalDisk.AvgDiskSecPerTransfer)
# TYPE windows_physical_disk_read_write_latency_seconds_total counter
# HELP windows_physical_disk_reads_total The number of read operations on the disk (PhysicalDisk.DiskReadsPerSec)
# TYPE windows_physical_disk_reads_total counter
# HELP windows_physical_disk_requests_queued The number of requests queued to the disk (PhysicalDisk.CurrentDiskQueueLength)
# TYPE windows_physical_disk_requests_queued gauge
# HELP windows_physical_disk_split_ios_total The number of I/Os to the disk were split into multiple I/Os (PhysicalDisk.SplitIOPerSec)
# TYPE windows_physical_disk_split_ios_total counter
# HELP windows_physical_disk_write_bytes_total The number of bytes transferred to the disk during write operations (PhysicalDisk.DiskWriteBytesPerSec)
# TYPE windows_physical_disk_write_bytes_total counter
# HELP windows_physical_disk_write_latency_seconds_total Shows the average time, in seconds, of a write operation to the disk (PhysicalDisk.AvgDiskSecPerWrite)
# TYPE windows_physical_disk_write_latency_seconds_total counter
# HELP windows_physical_disk_write_seconds_total Seconds that the disk was busy servicing write requests (PhysicalDisk.PercentDiskWriteTime)
# TYPE windows_physical_disk_write_seconds_total counter
# HELP windows_physical_disk_writes_total The number of write operations on the disk (PhysicalDisk.DiskWritesPerSec)
# TYPE windows_physical_disk_writes_total counter
# HELP windows_net_bytes_received_total (Network.BytesReceivedPerSec)
# TYPE windows_net_bytes_received_total counter
# HELP windows_net_bytes_sent_total (Network.BytesSentPerSec)

View File

@ -18,7 +18,7 @@ mkdir $textfile_dir | Out-Null
Copy-Item 'e2e-textfile.prom' -Destination "$($textfile_dir)/e2e-textfile.prom"
# Omit dynamic collector information that will change after each run
$skip_re = "^(go_|windows_exporter_build_info|windows_exporter_collector_duration_seconds|windows_exporter_perflib_snapshot_duration_seconds|process_|windows_textfile_mtime_seconds|windows_cpu|windows_cs|windows_logical_disk|windows_net|windows_os|windows_service|windows_system|windows_textfile_mtime_seconds)"
$skip_re = "^(go_|windows_exporter_build_info|windows_exporter_collector_duration_seconds|windows_exporter_perflib_snapshot_duration_seconds|process_|windows_textfile_mtime_seconds|windows_cpu|windows_cs|windows_logical_disk|windows_physical_disk|windows_net|windows_os|windows_service|windows_system|windows_textfile_mtime_seconds)"
# Start process in background, awaiting HTTP requests.
# Use default collectors, port and address: http://localhost:9182/metrics