diff --git a/.circleci/config.yml b/.circleci/config.yml index 19c0f1ba..018b8b18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,7 +28,7 @@ jobs: steps: - checkout - run: sudo pip install codespell - - run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem" -L uint,packages\',uptodate + - run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem,./collector/fixtures" -L uint,packages\',uptodate build: machine: diff --git a/CHANGELOG.md b/CHANGELOG.md index a5d884fa..9e05d0f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ * [CHANGE] * [FEATURE] * [ENHANCEMENT] -* [BUGFIX] +* [BUGFIX] Linux CPU: Cache CPU metrics to make them monotonically increasing #1711 ## 1.0.0-rc.1 / 2020-05-14 diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index ae8ee53e..dfa4d4af 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -19,6 +19,7 @@ import ( "fmt" "path/filepath" "strconv" + "sync" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" @@ -35,6 +36,8 @@ type cpuCollector struct { cpuCoreThrottle *prometheus.Desc cpuPackageThrottle *prometheus.Desc logger log.Logger + cpuStats []procfs.CPUStat + cpuStatsMutex sync.Mutex } var ( @@ -203,7 +206,12 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { return err } - for cpuID, cpuStat := range stats.CPU { + c.updateCPUStats(stats.CPU) + + // Acquire a lock to read the stats. + c.cpuStatsMutex.Lock() + defer c.cpuStatsMutex.Unlock() + for cpuID, cpuStat := range c.cpuStats { cpuNum := strconv.Itoa(cpuID) ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user") ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice") @@ -221,3 +229,78 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { return nil } + +// updateCPUStats updates the internal cache of CPU stats. +func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) { + // Acquire a lock to update the stats. + c.cpuStatsMutex.Lock() + defer c.cpuStatsMutex.Unlock() + + // Reset the cache if the list of CPUs has changed. + if len(c.cpuStats) != len(newStats) { + c.cpuStats = make([]procfs.CPUStat, len(newStats)) + } + + for i, n := range newStats { + // If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU. + if n.Idle < c.cpuStats[i].Idle { + level.Warn(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) + c.cpuStats[i] = procfs.CPUStat{} + } + c.cpuStats[i].Idle = n.Idle + + if n.User >= c.cpuStats[i].User { + c.cpuStats[i].User = n.User + } else { + level.Warn(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].User, "new_value", n.User) + } + + if n.Nice >= c.cpuStats[i].Nice { + c.cpuStats[i].Nice = n.Nice + } else { + level.Warn(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Nice, "new_value", n.Nice) + } + + if n.System >= c.cpuStats[i].System { + c.cpuStats[i].System = n.System + } else { + level.Warn(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].System, "new_value", n.System) + } + + if n.Iowait >= c.cpuStats[i].Iowait { + c.cpuStats[i].Iowait = n.Iowait + } else { + level.Warn(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Iowait, "new_value", n.Iowait) + } + + if n.IRQ >= c.cpuStats[i].IRQ { + c.cpuStats[i].IRQ = n.IRQ + } else { + level.Warn(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].IRQ, "new_value", n.IRQ) + } + + if n.SoftIRQ >= c.cpuStats[i].SoftIRQ { + c.cpuStats[i].SoftIRQ = n.SoftIRQ + } else { + level.Warn(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].SoftIRQ, "new_value", n.SoftIRQ) + } + + if n.Steal >= c.cpuStats[i].Steal { + c.cpuStats[i].Steal = n.Steal + } else { + level.Warn(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Steal, "new_value", n.Steal) + } + + if n.Guest >= c.cpuStats[i].Guest { + c.cpuStats[i].Guest = n.Guest + } else { + level.Warn(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Guest, "new_value", n.Guest) + } + + if n.GuestNice >= c.cpuStats[i].GuestNice { + c.cpuStats[i].GuestNice = n.GuestNice + } else { + level.Warn(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].GuestNice, "new_value", n.GuestNice) + } + } +}