From 73c9a10d371ed306b22f365dc9b86a689212f7f4 Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Sun, 4 Jul 2021 10:47:04 +0200 Subject: [PATCH] Handle small backwards jumps in CPU idle The Linux CPU idle stat can also jump backwards slightly in some cases. Allow the jump back up to 3 seconds before we attempt to reset the CPU counter cache. Fixes: https://github.com/prometheus/node_exporter/issues/1903 Signed-off-by: Ben Kochie --- collector/cpu_linux.go | 24 ++++++--- collector/cpu_linux_test.go | 105 ++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 collector/cpu_linux_test.go diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index 5c1293d0..d1f93540 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -46,10 +46,14 @@ type cpuCollector struct { cpuBugsIncludeRegexp *regexp.Regexp } +// Idle jump back limit in seconds. +const jumpBackSeconds = 3.0 + var ( - enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool() - flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String() - bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String() + enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool() + flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String() + bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String() + jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds) ) func init() { @@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { // updateCPUStats updates the internal cache of CPU stats. func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) { + // Acquire a lock to update the stats. c.cpuStatsMutex.Lock() defer c.cpuStatsMutex.Unlock() @@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) { } for i, n := range newStats { - // If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU. - if n.Idle < c.cpuStats[i].Idle { - level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) + // If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU. + if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds { + level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) c.cpuStats[i] = procfs.CPUStat{} } - c.cpuStats[i].Idle = n.Idle + + if n.Idle >= c.cpuStats[i].Idle { + c.cpuStats[i].Idle = n.Idle + } else { + level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) + } if n.User >= c.cpuStats[i].User { c.cpuStats[i].User = n.User diff --git a/collector/cpu_linux_test.go b/collector/cpu_linux_test.go new file mode 100644 index 00000000..f7b48313 --- /dev/null +++ b/collector/cpu_linux_test.go @@ -0,0 +1,105 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !nocpu + +package collector + +import ( + "reflect" + "testing" + + "github.com/go-kit/log" + "github.com/prometheus/procfs" +) + +func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector { + dup := make([]procfs.CPUStat, len(s)) + copy(dup, s) + return &cpuCollector{ + logger: log.NewNopLogger(), + cpuStats: dup, + } +} + +func TestCPU(t *testing.T) { + firstCPUStat := []procfs.CPUStat{{ + User: 100.0, + Nice: 100.0, + System: 100.0, + Idle: 100.0, + Iowait: 100.0, + IRQ: 100.0, + SoftIRQ: 100.0, + Steal: 100.0, + Guest: 100.0, + GuestNice: 100.0, + }} + + c := makeTestCPUCollector(firstCPUStat) + want := []procfs.CPUStat{{ + User: 101.0, + Nice: 101.0, + System: 101.0, + Idle: 101.0, + Iowait: 101.0, + IRQ: 101.0, + SoftIRQ: 101.0, + Steal: 101.0, + Guest: 101.0, + GuestNice: 101.0, + }} + c.updateCPUStats(want) + got := c.cpuStats + if !reflect.DeepEqual(want, got) { + t.Fatalf("should have %v CPU Stat: got %v", want, got) + } + + c = makeTestCPUCollector(firstCPUStat) + jumpBack := []procfs.CPUStat{{ + User: 99.9, + Nice: 99.9, + System: 99.9, + Idle: 99.9, + Iowait: 99.9, + IRQ: 99.9, + SoftIRQ: 99.9, + Steal: 99.9, + Guest: 99.9, + GuestNice: 99.9, + }} + c.updateCPUStats(jumpBack) + got = c.cpuStats + if reflect.DeepEqual(jumpBack, got) { + t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got) + } + + c = makeTestCPUCollector(firstCPUStat) + resetIdle := []procfs.CPUStat{{ + User: 102.0, + Nice: 102.0, + System: 102.0, + Idle: 1.0, + Iowait: 102.0, + IRQ: 102.0, + SoftIRQ: 102.0, + Steal: 102.0, + Guest: 102.0, + GuestNice: 102.0, + }} + c.updateCPUStats(resetIdle) + got = c.cpuStats + if !reflect.DeepEqual(resetIdle, got) { + t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got) + } +}