diff --git a/Makefile b/Makefile index 7bf36f90..c15045a4 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ test: go test -v ./... bench: - go test -v -bench='benchmark(cpu|logicaldisk|logon|memory|net|process|service|system|tcp|time)collector' ./... + go test -v -bench='benchmark(cpu|logicaldisk|physicaldisk|logon|memory|net|process|service|system|tcp|time)collector' ./... lint: golangci-lint -c .golangci.yaml run diff --git a/collector/init.go b/collector/init.go index a35f4cc1..7fb07df9 100644 --- a/collector/init.go +++ b/collector/init.go @@ -288,6 +288,14 @@ var collectors = []collectorInit{ return []string{"Paging File"} }, }, + { + name: "physical_disk", + flags: newPhysicalDiskCollectorFlags, + builder: NewPhysicalDiskCollector, + perfCounterFunc: func(_ log.Logger) []string { + return []string{"PhysicalDisk"} + }, + }, { name: "process", flags: newProcessCollectorFlags, diff --git a/collector/physical_disk.go b/collector/physical_disk.go new file mode 100644 index 00000000..1f64a44d --- /dev/null +++ b/collector/physical_disk.go @@ -0,0 +1,298 @@ +//go:build windows +// +build windows + +package collector + +import ( + "fmt" + "regexp" + "strings" + + "github.com/alecthomas/kingpin/v2" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + FlagPhysicalDiskExclude = "collector.physical_disk.disk-exclude" + FlagPhysicalDiskInclude = "collector.physical_disk.disk-include" +) + +var ( + diskInclude *string + diskExclude *string + + diskIncludeSet bool + diskExcludeSet bool +) + +// A PhysicalDiskCollector is a Prometheus collector for perflib PhysicalDisk metrics +type PhysicalDiskCollector struct { + logger log.Logger + + RequestsQueued *prometheus.Desc + ReadBytesTotal *prometheus.Desc + ReadsTotal *prometheus.Desc + WriteBytesTotal *prometheus.Desc + WritesTotal *prometheus.Desc + ReadTime *prometheus.Desc + WriteTime *prometheus.Desc + IdleTime *prometheus.Desc + SplitIOs 
*prometheus.Desc + ReadLatency *prometheus.Desc + WriteLatency *prometheus.Desc + ReadWriteLatency *prometheus.Desc + + diskIncludePattern *regexp.Regexp + diskExcludePattern *regexp.Regexp +} + +// newPhysicalDiskCollectorFlags registers the disk-include and disk-exclude flags on app, stores their values in diskInclude and diskExclude, and attaches PreAction hooks that set diskIncludeSet and diskExcludeSet to true. +func newPhysicalDiskCollectorFlags(app *kingpin.Application) { + diskInclude = app.Flag( + FlagPhysicalDiskInclude, + "Regexp of disks to include. Disk number must both match include and not match exclude to be included.", + ).Default(".+").PreAction(func(c *kingpin.ParseContext) error { + diskIncludeSet = true + return nil + }).String() + + diskExclude = app.Flag( + FlagPhysicalDiskExclude, + "Regexp of disks to exclude. Disk number must both match include and not match exclude to be included.", + ).Default("").PreAction(func(c *kingpin.ParseContext) error { + diskExcludeSet = true + return nil + }).String() +} + +// NewPhysicalDiskCollector returns a PhysicalDiskCollector with one "disk"-labelled descriptor per metric and the diskInclude/diskExclude flag values compiled as fully anchored regexps; regexp.MustCompile panics if either value is not a valid pattern. +func NewPhysicalDiskCollector(logger log.Logger) (Collector, error) { + const subsystem = "physical_disk" + logger = log.With(logger, "collector", subsystem) + + return &PhysicalDiskCollector{ + logger: logger, + + RequestsQueued: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "requests_queued"), + "The number of requests queued to the disk (PhysicalDisk.CurrentDiskQueueLength)", + []string{"disk"}, + nil, + ), + + ReadBytesTotal: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "read_bytes_total"), + "The number of bytes transferred from the disk during read operations (PhysicalDisk.DiskReadBytesPerSec)", + []string{"disk"}, + nil, + ), + + ReadsTotal: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "reads_total"), + "The number of read operations on the disk (PhysicalDisk.DiskReadsPerSec)", + []string{"disk"}, + nil, + ), + + WriteBytesTotal: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "write_bytes_total"), + "The number of bytes transferred to the disk during write operations
(PhysicalDisk.DiskWriteBytesPerSec)", + []string{"disk"}, + nil, + ), + + WritesTotal: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "writes_total"), + "The number of write operations on the disk (PhysicalDisk.DiskWritesPerSec)", + []string{"disk"}, + nil, + ), + + ReadTime: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "read_seconds_total"), + "Seconds that the disk was busy servicing read requests (PhysicalDisk.PercentDiskReadTime)", + []string{"disk"}, + nil, + ), + + WriteTime: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "write_seconds_total"), + "Seconds that the disk was busy servicing write requests (PhysicalDisk.PercentDiskWriteTime)", + []string{"disk"}, + nil, + ), + + IdleTime: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "idle_seconds_total"), + "Seconds that the disk was idle (PhysicalDisk.PercentIdleTime)", + []string{"disk"}, + nil, + ), + + SplitIOs: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "split_ios_total"), + "The number of I/Os to the disk were split into multiple I/Os (PhysicalDisk.SplitIOPerSec)", + []string{"disk"}, + nil, + ), + + ReadLatency: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "read_latency_seconds_total"), + "Shows the average time, in seconds, of a read operation from the disk (PhysicalDisk.AvgDiskSecPerRead)", + []string{"disk"}, + nil, + ), + + WriteLatency: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "write_latency_seconds_total"), + "Shows the average time, in seconds, of a write operation to the disk (PhysicalDisk.AvgDiskSecPerWrite)", + []string{"disk"}, + nil, + ), + + ReadWriteLatency: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "read_write_latency_seconds_total"), + "Shows the time, in seconds, of the average disk transfer (PhysicalDisk.AvgDiskSecPerTransfer)", + []string{"disk"}, + nil, + ), + + diskIncludePattern: 
regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *diskInclude)), + diskExcludePattern: regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *diskExclude)), + }, nil +} + +// Collect sends the metric values for each metric to the provided prometheus +// Metric channel; a collection error is logged under the "msg" key and returned. +func (c *PhysicalDiskCollector) Collect(ctx *ScrapeContext, ch chan<- prometheus.Metric) error { + if desc, err := c.collect(ctx, ch); err != nil { + _ = level.Error(c.logger).Log("msg", "failed collecting physical_disk metrics", "desc", desc, "err", err) + return err + } + return nil +} + +// PhysicalDisk holds the counters unmarshalled from one entry of the perflib "PhysicalDisk" object. Win32_PerfRawData_PerfDisk_PhysicalDisk docs: +// - https://docs.microsoft.com/en-us/previous-versions/aa394308(v=vs.85) - Win32_PerfRawData_PerfDisk_PhysicalDisk class +type PhysicalDisk struct { + Name string + CurrentDiskQueueLength float64 `perflib:"Current Disk Queue Length"` + DiskReadBytesPerSec float64 `perflib:"Disk Read Bytes/sec"` + DiskReadsPerSec float64 `perflib:"Disk Reads/sec"` + DiskWriteBytesPerSec float64 `perflib:"Disk Write Bytes/sec"` + DiskWritesPerSec float64 `perflib:"Disk Writes/sec"` + PercentDiskReadTime float64 `perflib:"% Disk Read Time"` + PercentDiskWriteTime float64 `perflib:"% Disk Write Time"` + PercentIdleTime float64 `perflib:"% Idle Time"` + SplitIOPerSec float64 `perflib:"Split IO/Sec"` + AvgDiskSecPerRead float64 `perflib:"Avg. Disk sec/Read"` + AvgDiskSecPerWrite float64 `perflib:"Avg. Disk sec/Write"` + AvgDiskSecPerTransfer float64 `perflib:"Avg. Disk sec/Transfer"` +} + +func (c *PhysicalDiskCollector) collect(ctx *ScrapeContext, ch chan<- prometheus.Metric) (*prometheus.Desc, error) { + var dst []PhysicalDisk + if err := unmarshalObject(ctx.perfObjects["PhysicalDisk"], &dst, c.logger); err != nil { + return nil, err + } + + for _, disk := range dst { + if disk.Name == "_Total" || + c.diskExcludePattern.MatchString(disk.Name) || + !c.diskIncludePattern.MatchString(disk.Name) { + continue + } + + // Parse physical disk number from disk.Name.
Mountpoint information is + // sometimes included, e.g. "1 C:". + diskNumber, _, _ := strings.Cut(disk.Name, " ") + + ch <- prometheus.MustNewConstMetric( + c.RequestsQueued, + prometheus.GaugeValue, + disk.CurrentDiskQueueLength, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.ReadBytesTotal, + prometheus.CounterValue, + disk.DiskReadBytesPerSec, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.ReadsTotal, + prometheus.CounterValue, + disk.DiskReadsPerSec, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.WriteBytesTotal, + prometheus.CounterValue, + disk.DiskWriteBytesPerSec, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.WritesTotal, + prometheus.CounterValue, + disk.DiskWritesPerSec, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.ReadTime, + prometheus.CounterValue, + disk.PercentDiskReadTime, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.WriteTime, + prometheus.CounterValue, + disk.PercentDiskWriteTime, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.IdleTime, + prometheus.CounterValue, + disk.PercentIdleTime, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.SplitIOs, + prometheus.CounterValue, + disk.SplitIOPerSec, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.ReadLatency, + prometheus.CounterValue, + disk.AvgDiskSecPerRead*ticksToSecondsScaleFactor, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.WriteLatency, + prometheus.CounterValue, + disk.AvgDiskSecPerWrite*ticksToSecondsScaleFactor, + diskNumber, + ) + + ch <- prometheus.MustNewConstMetric( + c.ReadWriteLatency, + prometheus.CounterValue, + disk.AvgDiskSecPerTransfer*ticksToSecondsScaleFactor, + diskNumber, + ) + } + + return nil, nil +} diff --git a/docs/collector.physical_disk.md b/docs/collector.physical_disk.md new file mode 100644 index 00000000..49d91ba2 --- /dev/null +++ b/docs/collector.physical_disk.md @@ -0,0 +1,80 @@ +#
physical_disk collector + +The physical_disk collector exposes metrics about physical disks. + +||| +-|- +Metric name prefix | `physical_disk` +Data source | Perflib +Counters | `PhysicalDisk` ([`Win32_PerfRawData_PerfDisk_PhysicalDisk`](https://docs.microsoft.com/en-us/previous-versions/aa394308(v=vs.85))) +Enabled by default? | Yes + +## Flags + +### `--collector.physical_disk.disk-include` + +If given, a disk needs to match the include regexp in order for the corresponding disk metrics to be reported. + +### `--collector.physical_disk.disk-exclude` + +If given, a disk needs to *not* match the exclude regexp in order for the corresponding disk metrics to be reported. + +## Metrics + +Name | Description | Type | Labels +-----|-------------|------|------- +`requests_queued` | Number of requests outstanding on the disk at the time the performance data is collected | gauge | `disk` +`read_bytes_total` | Number of bytes transferred from the disk during read operations | counter | `disk` +`reads_total` | Number of read operations on the disk | counter | `disk` +`write_bytes_total` | Number of bytes transferred to the disk during write operations | counter | `disk` +`writes_total` | Number of write operations on the disk | counter | `disk` +`read_seconds_total` | Seconds the disk was busy servicing read requests | counter | `disk` +`write_seconds_total` | Seconds the disk was busy servicing write requests | counter | `disk` +`free_bytes` | Unused space of the disk in bytes (not real time, updates every 10-15 min) | gauge | `disk` +`size_bytes` | Total size of the disk in bytes (not real time, updates every 10-15 min) | gauge | `disk` +`idle_seconds_total` | Seconds the disk was idle (not servicing read/write requests) | counter | `disk` +`split_ios_total` | Number of I/Os to the disk split into multiple I/Os | counter | `disk` + +### Warning about size metrics +The `free_bytes` and `size_bytes` metrics are not updated in real time and might have a delay of
10-15min. +This is the same behavior as the Windows performance counters. + +### Example metric +Query the rate at which bytes are read from a disk +``` +rate(windows_physical_disk_read_bytes_total{instance="localhost", disk=~"0"}[2m]) +``` + +## Useful queries +Calculate the total IOPS (reads plus writes) of a disk +``` +rate(windows_physical_disk_reads_total{instance="localhost", disk=~"0"}[2m]) + rate(windows_physical_disk_writes_total{instance="localhost", disk=~"0"}[2m]) +``` + +## Alerting examples +**prometheus.rules** +```yaml +groups: +- name: Windows Disk Alerts + rules: + + # Sends an alert when disk space usage is above 95% + - alert: DiskSpaceUsage + expr: 100.0 - 100 * (windows_physical_disk_free_bytes / windows_physical_disk_size_bytes) > 95 + for: 10m + labels: + severity: high + annotations: + summary: "Disk Space Usage (instance {{ $labels.instance }})" + description: "Disk Space on Drive is used more than 95%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + # Alerts on disks with over 85% space usage predicted to fill within the next four days + - alert: DiskFilling + expr: 100 * (windows_physical_disk_free_bytes / windows_physical_disk_size_bytes) < 15 and predict_linear(windows_physical_disk_free_bytes[6h], 4 * 24 * 3600) < 0 + for: 10m + labels: + severity: warning + annotations: + summary: "Disk full in four days (instance {{ $labels.instance }})" + description: "{{ $labels.disk }} is expected to fill up within four days.
Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" +``` diff --git a/exporter.go b/exporter.go index a14729c3..390824a1 100644 --- a/exporter.go +++ b/exporter.go @@ -49,7 +49,7 @@ type prometheusVersion struct { } const ( - defaultCollectors = "cpu,cs,logical_disk,net,os,service,system,textfile" + defaultCollectors = "cpu,cs,logical_disk,physical_disk,net,os,service,system,textfile" defaultCollectorsPlaceholder = "[defaults]" ) diff --git a/tools/e2e-output.txt b/tools/e2e-output.txt index 7bed991c..e0bc6668 100644 --- a/tools/e2e-output.txt +++ b/tools/e2e-output.txt @@ -40,6 +40,7 @@ test_alpha_total 42 windows_exporter_collector_success{collector="cpu"} 1 windows_exporter_collector_success{collector="cs"} 1 windows_exporter_collector_success{collector="logical_disk"} 1 +windows_exporter_collector_success{collector="physical_disk"} 1 windows_exporter_collector_success{collector="net"} 1 windows_exporter_collector_success{collector="os"} 1 windows_exporter_collector_success{collector="service"} 1 @@ -50,6 +51,7 @@ windows_exporter_collector_success{collector="textfile"} 1 windows_exporter_collector_timeout{collector="cpu"} 0 windows_exporter_collector_timeout{collector="cs"} 0 windows_exporter_collector_timeout{collector="logical_disk"} 0 +windows_exporter_collector_timeout{collector="physical_disk"} 0 windows_exporter_collector_timeout{collector="net"} 0 windows_exporter_collector_timeout{collector="os"} 0 windows_exporter_collector_timeout{collector="service"} 0 @@ -89,6 +91,30 @@ windows_exporter_collector_timeout{collector="textfile"} 0 # TYPE windows_logical_disk_write_seconds_total counter # HELP windows_logical_disk_writes_total The number of write operations on the disk (LogicalDisk.DiskWritesPerSec) # TYPE windows_logical_disk_writes_total counter +# HELP windows_physical_disk_idle_seconds_total Seconds that the disk was idle (PhysicalDisk.PercentIdleTime) +# TYPE windows_physical_disk_idle_seconds_total 
counter +# HELP windows_physical_disk_read_bytes_total The number of bytes transferred from the disk during read operations (PhysicalDisk.DiskReadBytesPerSec) +# TYPE windows_physical_disk_read_bytes_total counter +# HELP windows_physical_disk_read_latency_seconds_total Shows the average time, in seconds, of a read operation from the disk (PhysicalDisk.AvgDiskSecPerRead) +# TYPE windows_physical_disk_read_latency_seconds_total counter +# HELP windows_physical_disk_read_seconds_total Seconds that the disk was busy servicing read requests (PhysicalDisk.PercentDiskReadTime) +# TYPE windows_physical_disk_read_seconds_total counter +# HELP windows_physical_disk_read_write_latency_seconds_total Shows the time, in seconds, of the average disk transfer (PhysicalDisk.AvgDiskSecPerTransfer) +# TYPE windows_physical_disk_read_write_latency_seconds_total counter +# HELP windows_physical_disk_reads_total The number of read operations on the disk (PhysicalDisk.DiskReadsPerSec) +# TYPE windows_physical_disk_reads_total counter +# HELP windows_physical_disk_requests_queued The number of requests queued to the disk (PhysicalDisk.CurrentDiskQueueLength) +# TYPE windows_physical_disk_requests_queued gauge +# HELP windows_physical_disk_split_ios_total The number of I/Os to the disk were split into multiple I/Os (PhysicalDisk.SplitIOPerSec) +# TYPE windows_physical_disk_split_ios_total counter +# HELP windows_physical_disk_write_bytes_total The number of bytes transferred to the disk during write operations (PhysicalDisk.DiskWriteBytesPerSec) +# TYPE windows_physical_disk_write_bytes_total counter +# HELP windows_physical_disk_write_latency_seconds_total Shows the average time, in seconds, of a write operation to the disk (PhysicalDisk.AvgDiskSecPerWrite) +# TYPE windows_physical_disk_write_latency_seconds_total counter +# HELP windows_physical_disk_write_seconds_total Seconds that the disk was busy servicing write requests (PhysicalDisk.PercentDiskWriteTime) +# TYPE 
windows_physical_disk_write_seconds_total counter +# HELP windows_physical_disk_writes_total The number of write operations on the disk (PhysicalDisk.DiskWritesPerSec) +# TYPE windows_physical_disk_writes_total counter # HELP windows_net_bytes_received_total (Network.BytesReceivedPerSec) # TYPE windows_net_bytes_received_total counter # HELP windows_net_bytes_sent_total (Network.BytesSentPerSec) diff --git a/tools/end-to-end-test.ps1 b/tools/end-to-end-test.ps1 index 7aa7f76d..b2213926 100644 --- a/tools/end-to-end-test.ps1 +++ b/tools/end-to-end-test.ps1 @@ -18,7 +18,7 @@ mkdir $textfile_dir | Out-Null Copy-Item 'e2e-textfile.prom' -Destination "$($textfile_dir)/e2e-textfile.prom" # Omit dynamic collector information that will change after each run -$skip_re = "^(go_|windows_exporter_build_info|windows_exporter_collector_duration_seconds|windows_exporter_perflib_snapshot_duration_seconds|process_|windows_textfile_mtime_seconds|windows_cpu|windows_cs|windows_logical_disk|windows_net|windows_os|windows_service|windows_system|windows_textfile_mtime_seconds)" +$skip_re = "^(go_|windows_exporter_build_info|windows_exporter_collector_duration_seconds|windows_exporter_perflib_snapshot_duration_seconds|process_|windows_textfile_mtime_seconds|windows_cpu|windows_cs|windows_logical_disk|windows_physical_disk|windows_net|windows_os|windows_service|windows_system|windows_textfile_mtime_seconds)" # Start process in background, awaiting HTTP requests. # Use default collectors, port and address: http://localhost:9182/metrics