windows_exporter/collector/container.go
Tatenda Zifudzi 9adefdceeb fix: Resolve missing container network metrics by querying from HNS
This fix ensures that network stats for containerd on Windows are successfully collected. Before this change, other container stats such as CPU and memory are successfully collected, but network stats are failing for containerd.

The root cause is that the code for collecting network stats was originally written to work with docker which relies on v1 schema. After dockershim removal as Kubernetes's container runtime, containerd adoption has increased and this error is more frequently encountered when using containerd as  the runtime. containerd uses v2 schema whereby the network stats need to be queried from the HNS component.

Signed-off-by: Tatenda Zifudzi <tzifudzi@amazon.com>
2023-07-20 11:22:29 -07:00

373 lines
11 KiB
Go

//go:build windows
// +build windows
package collector
import (
"fmt"
"strings"
"github.com/Microsoft/hcsshim"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
)
// A ContainerMetricsCollector is a Prometheus collector for containers metrics
type ContainerMetricsCollector struct {
logger log.Logger
// Presence
ContainerAvailable *prometheus.Desc
// Number of containers
ContainersCount *prometheus.Desc
// memory
UsageCommitBytes *prometheus.Desc
UsageCommitPeakBytes *prometheus.Desc
UsagePrivateWorkingSetBytes *prometheus.Desc
// CPU
RuntimeTotal *prometheus.Desc
RuntimeUser *prometheus.Desc
RuntimeKernel *prometheus.Desc
// Network
BytesReceived *prometheus.Desc
BytesSent *prometheus.Desc
PacketsReceived *prometheus.Desc
PacketsSent *prometheus.Desc
DroppedPacketsIncoming *prometheus.Desc
DroppedPacketsOutgoing *prometheus.Desc
// Storage
ReadCountNormalized *prometheus.Desc
ReadSizeBytes *prometheus.Desc
WriteCountNormalized *prometheus.Desc
WriteSizeBytes *prometheus.Desc
}
// newContainerMetricsCollector constructs a new ContainerMetricsCollector
func newContainerMetricsCollector(logger log.Logger) (Collector, error) {
const subsystem = "container"
return &ContainerMetricsCollector{
logger: log.With(logger, "collector", subsystem),
ContainerAvailable: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "available"),
"Available",
[]string{"container_id"},
nil,
),
ContainersCount: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "count"),
"Number of containers",
nil,
nil,
),
UsageCommitBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "memory_usage_commit_bytes"),
"Memory Usage Commit Bytes",
[]string{"container_id"},
nil,
),
UsageCommitPeakBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "memory_usage_commit_peak_bytes"),
"Memory Usage Commit Peak Bytes",
[]string{"container_id"},
nil,
),
UsagePrivateWorkingSetBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "memory_usage_private_working_set_bytes"),
"Memory Usage Private Working Set Bytes",
[]string{"container_id"},
nil,
),
RuntimeTotal: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "cpu_usage_seconds_total"),
"Total Run time in Seconds",
[]string{"container_id"},
nil,
),
RuntimeUser: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "cpu_usage_seconds_usermode"),
"Run Time in User mode in Seconds",
[]string{"container_id"},
nil,
),
RuntimeKernel: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "cpu_usage_seconds_kernelmode"),
"Run time in Kernel mode in Seconds",
[]string{"container_id"},
nil,
),
BytesReceived: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_receive_bytes_total"),
"Bytes Received on Interface",
[]string{"container_id", "interface"},
nil,
),
BytesSent: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_transmit_bytes_total"),
"Bytes Sent on Interface",
[]string{"container_id", "interface"},
nil,
),
PacketsReceived: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_receive_packets_total"),
"Packets Received on Interface",
[]string{"container_id", "interface"},
nil,
),
PacketsSent: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_transmit_packets_total"),
"Packets Sent on Interface",
[]string{"container_id", "interface"},
nil,
),
DroppedPacketsIncoming: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_receive_packets_dropped_total"),
"Dropped Incoming Packets on Interface",
[]string{"container_id", "interface"},
nil,
),
DroppedPacketsOutgoing: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "network_transmit_packets_dropped_total"),
"Dropped Outgoing Packets on Interface",
[]string{"container_id", "interface"},
nil,
),
ReadCountNormalized: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "storage_read_count_normalized_total"),
"Read Count Normalized",
[]string{"container_id"},
nil,
),
ReadSizeBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "storage_read_size_bytes_total"),
"Read Size Bytes",
[]string{"container_id"},
nil,
),
WriteCountNormalized: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "storage_write_count_normalized_total"),
"Write Count Normalized",
[]string{"container_id"},
nil,
),
WriteSizeBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "storage_write_size_bytes_total"),
"Write Size Bytes",
[]string{"container_id"},
nil,
),
}, nil
}
// Collect sends the metric values for each metric
// to the provided prometheus Metric channel.
func (c *ContainerMetricsCollector) Collect(ctx *ScrapeContext, ch chan<- prometheus.Metric) error {
if desc, err := c.collect(ch); err != nil {
_ = level.Error(c.logger).Log("msg", "failed collecting ContainerMetricsCollector metrics", "desc", desc, "err", err)
return err
}
return nil
}
// containerClose closes the container resource
func (c *ContainerMetricsCollector) containerClose(container hcsshim.Container) {
err := container.Close()
if err != nil {
_ = level.Error(c.logger).Log("err", err)
}
}
func (c *ContainerMetricsCollector) collect(ch chan<- prometheus.Metric) (*prometheus.Desc, error) {
// Types Container is passed to get the containers compute systems only
containers, err := hcsshim.GetContainers(hcsshim.ComputeSystemQuery{Types: []string{"Container"}})
if err != nil {
_ = level.Error(c.logger).Log("msg", "Err in Getting containers", "err", err)
return nil, err
}
count := len(containers)
ch <- prometheus.MustNewConstMetric(
c.ContainersCount,
prometheus.GaugeValue,
float64(count),
)
if count == 0 {
return nil, nil
}
containerPrefixes := make(map[string]string)
for _, containerDetails := range containers {
container, err := hcsshim.OpenContainer(containerDetails.ID)
if container != nil {
defer c.containerClose(container)
}
if err != nil {
_ = level.Error(c.logger).Log("msg", "err in opening container", "containerId", containerDetails.ID, "err", err)
continue
}
cstats, err := container.Statistics()
if err != nil {
_ = level.Error(c.logger).Log("msg", "err in fetching container Statistics", "containerId", containerDetails.ID, "err", err)
continue
}
containerIdWithPrefix := getContainerIdWithPrefix(containerDetails)
containerPrefixes[containerDetails.ID] = containerIdWithPrefix
ch <- prometheus.MustNewConstMetric(
c.ContainerAvailable,
prometheus.CounterValue,
1,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.UsageCommitBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsageCommitBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.UsageCommitPeakBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsageCommitPeakBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.UsagePrivateWorkingSetBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsagePrivateWorkingSetBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.RuntimeTotal,
prometheus.CounterValue,
float64(cstats.Processor.TotalRuntime100ns)*ticksToSecondsScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.RuntimeUser,
prometheus.CounterValue,
float64(cstats.Processor.RuntimeUser100ns)*ticksToSecondsScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.RuntimeKernel,
prometheus.CounterValue,
float64(cstats.Processor.RuntimeKernel100ns)*ticksToSecondsScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.ReadCountNormalized,
prometheus.CounterValue,
float64(cstats.Storage.ReadCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.ReadSizeBytes,
prometheus.CounterValue,
float64(cstats.Storage.ReadSizeBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.WriteCountNormalized,
prometheus.CounterValue,
float64(cstats.Storage.WriteCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.WriteSizeBytes,
prometheus.CounterValue,
float64(cstats.Storage.WriteSizeBytes),
containerIdWithPrefix,
)
}
hnsEndpoints, err := hcsshim.HNSListEndpointRequest()
if err != nil {
_ = level.Warn(c.logger).Log("msg", "Failed to collect network stats for containers")
return nil, nil
}
if len(hnsEndpoints) == 0 {
_ = level.Info(c.logger).Log("msg", fmt.Sprintf("No network stats for containers to collect"))
return nil, nil
}
for _, endpoint := range hnsEndpoints {
endpointStats, err := hcsshim.GetHNSEndpointStats(endpoint.Id)
if err != nil {
_ = level.Warn(c.logger).Log("msg", fmt.Sprintf("Failed to collect network stats for interface %s", endpoint.Id), "err", err)
continue
}
for _, containerId := range endpoint.SharedContainers {
containerIdWithPrefix, ok := containerPrefixes[containerId]
endpointId := strings.ToUpper(endpoint.Id)
if !ok {
_ = level.Warn(c.logger).Log("msg", fmt.Sprintf("Failed to collect network stats for container %s", containerId))
continue
}
ch <- prometheus.MustNewConstMetric(
c.BytesReceived,
prometheus.CounterValue,
float64(endpointStats.BytesReceived),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.BytesSent,
prometheus.CounterValue,
float64(endpointStats.BytesSent),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsReceived,
prometheus.CounterValue,
float64(endpointStats.PacketsReceived),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsSent,
prometheus.CounterValue,
float64(endpointStats.PacketsSent),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsIncoming,
prometheus.CounterValue,
float64(endpointStats.DroppedPacketsIncoming),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsOutgoing,
prometheus.CounterValue,
float64(endpointStats.DroppedPacketsOutgoing),
containerIdWithPrefix, endpointId,
)
}
}
return nil, nil
}
func getContainerIdWithPrefix(containerDetails hcsshim.ContainerProperties) string {
switch containerDetails.Owner {
case "containerd-shim-runhcs-v1.exe":
return "containerd://" + containerDetails.ID
default:
// default to docker or if owner is not set
return "docker://" + containerDetails.ID
}
}