fix: Resolve missing container network metrics by querying from HNS

This fix ensures that network stats for containerd on Windows are successfully collected. Before this change, other container stats such as CPU and memory are successfully collected, but network stats are failing for containerd.

The root cause is that the code for collecting network stats was originally written to work with docker which relies on v1 schema. After dockershim removal as Kubernetes's container runtime, containerd adoption has increased and this error is more frequently encountered when using containerd as  the runtime. containerd uses v2 schema whereby the network stats need to be queried from the HNS component.

Signed-off-by: Tatenda Zifudzi <tzifudzi@amazon.com>
This commit is contained in:
Tatenda Zifudzi 2023-06-03 16:22:17 -07:00
parent 89cb5439b8
commit 9adefdceeb
1 changed files with 74 additions and 48 deletions

View File

@ -4,6 +4,9 @@
package collector
import (
"fmt"
"strings"
"github.com/Microsoft/hcsshim"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
@ -199,6 +202,8 @@ func (c *ContainerMetricsCollector) collect(ch chan<- prometheus.Metric) (*prome
return nil, nil
}
containerPrefixes := make(map[string]string)
for _, containerDetails := range containers {
container, err := hcsshim.OpenContainer(containerDetails.ID)
if container != nil {
@ -214,7 +219,9 @@ func (c *ContainerMetricsCollector) collect(ch chan<- prometheus.Metric) (*prome
_ = level.Error(c.logger).Log("msg", "err in fetching container Statistics", "containerId", containerDetails.ID, "err", err)
continue
}
containerIdWithPrefix := getContainerIdWithPrefix(containerDetails)
containerPrefixes[containerDetails.ID] = containerIdWithPrefix
ch <- prometheus.MustNewConstMetric(
c.ContainerAvailable,
@ -258,54 +265,6 @@ func (c *ContainerMetricsCollector) collect(ch chan<- prometheus.Metric) (*prome
float64(cstats.Processor.RuntimeKernel100ns)*ticksToSecondsScaleFactor,
containerIdWithPrefix,
)
if len(cstats.Network) == 0 {
_ = level.Info(c.logger).Log("msg", "No Network Stats for container", "containerId", containerDetails.ID)
continue
}
networkStats := cstats.Network
for _, networkInterface := range networkStats {
ch <- prometheus.MustNewConstMetric(
c.BytesReceived,
prometheus.CounterValue,
float64(networkInterface.BytesReceived),
containerIdWithPrefix, networkInterface.EndpointId,
)
ch <- prometheus.MustNewConstMetric(
c.BytesSent,
prometheus.CounterValue,
float64(networkInterface.BytesSent),
containerIdWithPrefix, networkInterface.EndpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsReceived,
prometheus.CounterValue,
float64(networkInterface.PacketsReceived),
containerIdWithPrefix, networkInterface.EndpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsSent,
prometheus.CounterValue,
float64(networkInterface.PacketsSent),
containerIdWithPrefix, networkInterface.EndpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsIncoming,
prometheus.CounterValue,
float64(networkInterface.DroppedPacketsIncoming),
containerIdWithPrefix, networkInterface.EndpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsOutgoing,
prometheus.CounterValue,
float64(networkInterface.DroppedPacketsOutgoing),
containerIdWithPrefix, networkInterface.EndpointId,
)
break
}
ch <- prometheus.MustNewConstMetric(
c.ReadCountNormalized,
prometheus.CounterValue,
@ -332,6 +291,73 @@ func (c *ContainerMetricsCollector) collect(ch chan<- prometheus.Metric) (*prome
)
}
hnsEndpoints, err := hcsshim.HNSListEndpointRequest()
if err != nil {
_ = level.Warn(c.logger).Log("msg", "Failed to collect network stats for containers")
return nil, nil
}
if len(hnsEndpoints) == 0 {
_ = level.Info(c.logger).Log("msg", fmt.Sprintf("No network stats for containers to collect"))
return nil, nil
}
for _, endpoint := range hnsEndpoints {
endpointStats, err := hcsshim.GetHNSEndpointStats(endpoint.Id)
if err != nil {
_ = level.Warn(c.logger).Log("msg", fmt.Sprintf("Failed to collect network stats for interface %s", endpoint.Id), "err", err)
continue
}
for _, containerId := range endpoint.SharedContainers {
containerIdWithPrefix, ok := containerPrefixes[containerId]
endpointId := strings.ToUpper(endpoint.Id)
if !ok {
_ = level.Warn(c.logger).Log("msg", fmt.Sprintf("Failed to collect network stats for container %s", containerId))
continue
}
ch <- prometheus.MustNewConstMetric(
c.BytesReceived,
prometheus.CounterValue,
float64(endpointStats.BytesReceived),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.BytesSent,
prometheus.CounterValue,
float64(endpointStats.BytesSent),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsReceived,
prometheus.CounterValue,
float64(endpointStats.PacketsReceived),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.PacketsSent,
prometheus.CounterValue,
float64(endpointStats.PacketsSent),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsIncoming,
prometheus.CounterValue,
float64(endpointStats.DroppedPacketsIncoming),
containerIdWithPrefix, endpointId,
)
ch <- prometheus.MustNewConstMetric(
c.DroppedPacketsOutgoing,
prometheus.CounterValue,
float64(endpointStats.DroppedPacketsOutgoing),
containerIdWithPrefix, endpointId,
)
}
}
return nil, nil
}