Handle small backwards jumps in CPU idle
The Linux CPU idle stat can also jump backwards slightly in some cases. Tolerate backwards jumps of less than 3 seconds, and only reset the CPU counter cache when idle jumps back by 3 seconds or more. Fixes: https://github.com/prometheus/node_exporter/issues/1903 Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
parent
13be860e25
commit
73c9a10d37
|
@ -46,10 +46,14 @@ type cpuCollector struct {
|
||||||
cpuBugsIncludeRegexp *regexp.Regexp
|
cpuBugsIncludeRegexp *regexp.Regexp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Idle jump back limit in seconds.
|
||||||
|
const jumpBackSeconds = 3.0
|
||||||
|
|
||||||
var (
|
var (
|
||||||
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
||||||
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
|
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
|
||||||
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
|
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
|
||||||
|
jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
|
||||||
|
|
||||||
// updateCPUStats updates the internal cache of CPU stats.
|
// updateCPUStats updates the internal cache of CPU stats.
|
||||||
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
|
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
|
||||||
|
|
||||||
// Acquire a lock to update the stats.
|
// Acquire a lock to update the stats.
|
||||||
c.cpuStatsMutex.Lock()
|
c.cpuStatsMutex.Lock()
|
||||||
defer c.cpuStatsMutex.Unlock()
|
defer c.cpuStatsMutex.Unlock()
|
||||||
|
@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, n := range newStats {
|
for i, n := range newStats {
|
||||||
// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
|
// If idle jumps backwards by jumpBackSeconds or more, assume we had a hotplug event and reset the stats for this CPU.
|
||||||
if n.Idle < c.cpuStats[i].Idle {
|
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
|
||||||
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
||||||
c.cpuStats[i] = procfs.CPUStat{}
|
c.cpuStats[i] = procfs.CPUStat{}
|
||||||
}
|
}
|
||||||
c.cpuStats[i].Idle = n.Idle
|
|
||||||
|
if n.Idle >= c.cpuStats[i].Idle {
|
||||||
|
c.cpuStats[i].Idle = n.Idle
|
||||||
|
} else {
|
||||||
|
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
||||||
|
}
|
||||||
|
|
||||||
if n.User >= c.cpuStats[i].User {
|
if n.User >= c.cpuStats[i].User {
|
||||||
c.cpuStats[i].User = n.User
|
c.cpuStats[i].User = n.User
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
// Copyright 2021 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// +build !nocpu
|
||||||
|
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/go-kit/log"
|
||||||
|
"github.com/prometheus/procfs"
|
||||||
|
)
|
||||||
|
|
||||||
|
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
|
||||||
|
dup := make([]procfs.CPUStat, len(s))
|
||||||
|
copy(dup, s)
|
||||||
|
return &cpuCollector{
|
||||||
|
logger: log.NewNopLogger(),
|
||||||
|
cpuStats: dup,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCPU(t *testing.T) {
|
||||||
|
firstCPUStat := []procfs.CPUStat{{
|
||||||
|
User: 100.0,
|
||||||
|
Nice: 100.0,
|
||||||
|
System: 100.0,
|
||||||
|
Idle: 100.0,
|
||||||
|
Iowait: 100.0,
|
||||||
|
IRQ: 100.0,
|
||||||
|
SoftIRQ: 100.0,
|
||||||
|
Steal: 100.0,
|
||||||
|
Guest: 100.0,
|
||||||
|
GuestNice: 100.0,
|
||||||
|
}}
|
||||||
|
|
||||||
|
c := makeTestCPUCollector(firstCPUStat)
|
||||||
|
want := []procfs.CPUStat{{
|
||||||
|
User: 101.0,
|
||||||
|
Nice: 101.0,
|
||||||
|
System: 101.0,
|
||||||
|
Idle: 101.0,
|
||||||
|
Iowait: 101.0,
|
||||||
|
IRQ: 101.0,
|
||||||
|
SoftIRQ: 101.0,
|
||||||
|
Steal: 101.0,
|
||||||
|
Guest: 101.0,
|
||||||
|
GuestNice: 101.0,
|
||||||
|
}}
|
||||||
|
c.updateCPUStats(want)
|
||||||
|
got := c.cpuStats
|
||||||
|
if !reflect.DeepEqual(want, got) {
|
||||||
|
t.Fatalf("should have %v CPU Stat: got %v", want, got)
|
||||||
|
}
|
||||||
|
|
||||||
|
c = makeTestCPUCollector(firstCPUStat)
|
||||||
|
jumpBack := []procfs.CPUStat{{
|
||||||
|
User: 99.9,
|
||||||
|
Nice: 99.9,
|
||||||
|
System: 99.9,
|
||||||
|
Idle: 99.9,
|
||||||
|
Iowait: 99.9,
|
||||||
|
IRQ: 99.9,
|
||||||
|
SoftIRQ: 99.9,
|
||||||
|
Steal: 99.9,
|
||||||
|
Guest: 99.9,
|
||||||
|
GuestNice: 99.9,
|
||||||
|
}}
|
||||||
|
c.updateCPUStats(jumpBack)
|
||||||
|
got = c.cpuStats
|
||||||
|
if reflect.DeepEqual(jumpBack, got) {
|
||||||
|
t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
|
||||||
|
}
|
||||||
|
|
||||||
|
c = makeTestCPUCollector(firstCPUStat)
|
||||||
|
resetIdle := []procfs.CPUStat{{
|
||||||
|
User: 102.0,
|
||||||
|
Nice: 102.0,
|
||||||
|
System: 102.0,
|
||||||
|
Idle: 1.0,
|
||||||
|
Iowait: 102.0,
|
||||||
|
IRQ: 102.0,
|
||||||
|
SoftIRQ: 102.0,
|
||||||
|
Steal: 102.0,
|
||||||
|
Guest: 102.0,
|
||||||
|
GuestNice: 102.0,
|
||||||
|
}}
|
||||||
|
c.updateCPUStats(resetIdle)
|
||||||
|
got = c.cpuStats
|
||||||
|
if !reflect.DeepEqual(resetIdle, got) {
|
||||||
|
t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue