From 39b4556b5b8d8d05d7e46e4b9b5de76cdc01997e Mon Sep 17 00:00:00 2001 From: Jia Xin Date: Tue, 15 Mar 2022 12:14:27 +0800 Subject: [PATCH] fix cpustat when some cpus are offline Signed-off-by: Jia Xin --- collector/cpu_linux.go | 81 +++++++++++++------------- collector/cpu_linux_test.go | 113 ++++++++++++++++++++---------------- go.mod | 4 +- go.sum | 8 +-- 4 files changed, 109 insertions(+), 97 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index 933774bc..e16764d4 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -43,7 +43,7 @@ type cpuCollector struct { cpuPackageThrottle *prometheus.Desc cpuIsolated *prometheus.Desc logger log.Logger - cpuStats []procfs.CPUStat + cpuStats map[int64]procfs.CPUStat cpuStatsMutex sync.Mutex isolatedCpus []uint16 @@ -126,6 +126,7 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { ), logger: logger, isolatedCpus: isolcpus, + cpuStats: make(map[int64]procfs.CPUStat), } err = c.compileIncludeFlags(flagsInclude, bugsInclude) if err != nil { @@ -324,7 +325,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { c.cpuStatsMutex.Lock() defer c.cpuStatsMutex.Unlock() for cpuID, cpuStat := range c.cpuStats { - cpuNum := strconv.Itoa(cpuID) + cpuNum := strconv.Itoa(int(cpuID)) ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user") ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice") ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.System, cpuNum, "system") @@ -345,82 +346,82 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { } // updateCPUStats updates the internal cache of CPU stats. -func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) { +func (c *cpuCollector) updateCPUStats(newStats map[int64]procfs.CPUStat) { // Acquire a lock to update the stats. c.cpuStatsMutex.Lock() defer c.cpuStatsMutex.Unlock() // Reset the cache if the list of CPUs has changed. - if len(c.cpuStats) != len(newStats) { - c.cpuStats = make([]procfs.CPUStat, len(newStats)) - } - for i, n := range newStats { + cpuStats := c.cpuStats[i] + // If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU. - if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds { - level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) - c.cpuStats[i] = procfs.CPUStat{} + if (cpuStats.Idle - n.Idle) >= jumpBackSeconds { + level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", cpuStats.Idle, "new_value", n.Idle) + cpuStats = procfs.CPUStat{} } - if n.Idle >= c.cpuStats[i].Idle { - c.cpuStats[i].Idle = n.Idle + if n.Idle >= cpuStats.Idle { + cpuStats.Idle = n.Idle } else { - level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) + level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", cpuStats.Idle, "new_value", n.Idle) } - if n.User >= c.cpuStats[i].User { - c.cpuStats[i].User = n.User + if n.User >= cpuStats.User { + cpuStats.User = n.User } else { - level.Debug(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].User, "new_value", n.User) + level.Debug(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", cpuStats.User, "new_value", n.User) } - if n.Nice >= c.cpuStats[i].Nice { - c.cpuStats[i].Nice = n.Nice + if n.Nice >= cpuStats.Nice { + cpuStats.Nice = n.Nice } else { - level.Debug(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Nice, "new_value", n.Nice) + level.Debug(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", cpuStats.Nice, "new_value", n.Nice) } - if n.System >= c.cpuStats[i].System { - c.cpuStats[i].System = n.System + if n.System >= cpuStats.System { + cpuStats.System = n.System } else { - level.Debug(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].System, "new_value", n.System) + level.Debug(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", cpuStats.System, "new_value", n.System) } - if n.Iowait >= c.cpuStats[i].Iowait { - c.cpuStats[i].Iowait = n.Iowait + if n.Iowait >= cpuStats.Iowait { + cpuStats.Iowait = n.Iowait } else { - level.Debug(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Iowait, "new_value", n.Iowait) + level.Debug(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", cpuStats.Iowait, "new_value", n.Iowait) } - if n.IRQ >= c.cpuStats[i].IRQ { - c.cpuStats[i].IRQ = n.IRQ + if n.IRQ >= cpuStats.IRQ { + cpuStats.IRQ = n.IRQ } else { - level.Debug(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].IRQ, "new_value", n.IRQ) + level.Debug(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", cpuStats.IRQ, "new_value", n.IRQ) } - if n.SoftIRQ >= c.cpuStats[i].SoftIRQ { - c.cpuStats[i].SoftIRQ = n.SoftIRQ + if n.SoftIRQ >= cpuStats.SoftIRQ { + cpuStats.SoftIRQ = n.SoftIRQ } else { - level.Debug(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].SoftIRQ, "new_value", n.SoftIRQ) + level.Debug(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", cpuStats.SoftIRQ, "new_value", n.SoftIRQ) } - if n.Steal >= c.cpuStats[i].Steal { - c.cpuStats[i].Steal = n.Steal + if n.Steal >= cpuStats.Steal { + cpuStats.Steal = n.Steal } else { - level.Debug(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Steal, "new_value", n.Steal) + level.Debug(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", cpuStats.Steal, "new_value", n.Steal) } - if n.Guest >= c.cpuStats[i].Guest { - c.cpuStats[i].Guest = n.Guest + if n.Guest >= cpuStats.Guest { + cpuStats.Guest = n.Guest } else { - level.Debug(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Guest, "new_value", n.Guest) + level.Debug(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", cpuStats.Guest, "new_value", n.Guest) } - if n.GuestNice >= c.cpuStats[i].GuestNice { - c.cpuStats[i].GuestNice = n.GuestNice + if n.GuestNice >= cpuStats.GuestNice { + cpuStats.GuestNice = n.GuestNice } else { - level.Debug(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].GuestNice, "new_value", n.GuestNice) + level.Debug(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", cpuStats.GuestNice, "new_value", n.GuestNice) } + + c.cpuStats[i] = cpuStats } } diff --git a/collector/cpu_linux_test.go b/collector/cpu_linux_test.go index 93b493b2..586087f3 100644 --- a/collector/cpu_linux_test.go +++ b/collector/cpu_linux_test.go @@ -24,9 +24,16 @@ import ( "github.com/prometheus/procfs" ) -func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector { - dup := make([]procfs.CPUStat, len(s)) - copy(dup, s) +func copyStats(d, s map[int64]procfs.CPUStat) { + for k := range s { + v := s[k] + d[k] = v + } +} + +func makeTestCPUCollector(s map[int64]procfs.CPUStat) *cpuCollector { + dup := make(map[int64]procfs.CPUStat, len(s)) + copyStats(dup, s) return &cpuCollector{ logger: log.NewNopLogger(), cpuStats: dup, @@ -34,32 +41,34 @@ func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector { } func TestCPU(t *testing.T) { - firstCPUStat := []procfs.CPUStat{{ - User: 100.0, - Nice: 100.0, - System: 100.0, - Idle: 100.0, - Iowait: 100.0, - IRQ: 100.0, - SoftIRQ: 100.0, - Steal: 100.0, - Guest: 100.0, - GuestNice: 100.0, - }} + firstCPUStat := map[int64]procfs.CPUStat{ + 0: { + User: 100.0, + Nice: 100.0, + System: 100.0, + Idle: 100.0, + Iowait: 100.0, + IRQ: 100.0, + SoftIRQ: 100.0, + Steal: 100.0, + Guest: 100.0, + GuestNice: 100.0, + }} c := makeTestCPUCollector(firstCPUStat) - want := []procfs.CPUStat{{ - User: 101.0, - Nice: 101.0, - System: 101.0, - Idle: 101.0, - Iowait: 101.0, - IRQ: 101.0, - SoftIRQ: 101.0, - Steal: 101.0, - Guest: 101.0, - GuestNice: 101.0, - }} + want := map[int64]procfs.CPUStat{ + 0: { + User: 101.0, + Nice: 101.0, + System: 101.0, + Idle: 101.0, + Iowait: 101.0, + IRQ: 101.0, + SoftIRQ: 101.0, + Steal: 101.0, + Guest: 101.0, + GuestNice: 101.0, + }} c.updateCPUStats(want) got := c.cpuStats if !reflect.DeepEqual(want, got) { @@ -67,18 +76,19 @@ func TestCPU(t *testing.T) { } c = makeTestCPUCollector(firstCPUStat) - jumpBack := []procfs.CPUStat{{ - User: 99.9, - Nice: 99.9, - System: 99.9, - Idle: 99.9, - Iowait: 99.9, - IRQ: 99.9, - SoftIRQ: 99.9, - Steal: 99.9, - Guest: 99.9, - GuestNice: 99.9, - }} + jumpBack := map[int64]procfs.CPUStat{ + 0: { + User: 99.9, + Nice: 99.9, + System: 99.9, + Idle: 99.9, + Iowait: 99.9, + IRQ: 99.9, + SoftIRQ: 99.9, + Steal: 99.9, + Guest: 99.9, + GuestNice: 99.9, + }} c.updateCPUStats(jumpBack) got = c.cpuStats if reflect.DeepEqual(jumpBack, got) { @@ -86,18 +96,19 @@ func TestCPU(t *testing.T) { } c = makeTestCPUCollector(firstCPUStat) - resetIdle := []procfs.CPUStat{{ - User: 102.0, - Nice: 102.0, - System: 102.0, - Idle: 1.0, - Iowait: 102.0, - IRQ: 102.0, - SoftIRQ: 102.0, - Steal: 102.0, - Guest: 102.0, - GuestNice: 102.0, - }} + resetIdle := map[int64]procfs.CPUStat{ + 0: { + User: 102.0, + Nice: 102.0, + System: 102.0, + Idle: 1.0, + Iowait: 102.0, + IRQ: 102.0, + SoftIRQ: 102.0, + Steal: 102.0, + Guest: 102.0, + GuestNice: 102.0, + }} c.updateCPUStats(resetIdle) got = c.cpuStats if !reflect.DeepEqual(resetIdle, got) { diff --git a/go.mod b/go.mod index 435e95b0..a6633f3c 100644 --- a/go.mod +++ b/go.mod @@ -24,10 +24,10 @@ require ( github.com/prometheus/client_model v0.3.0 github.com/prometheus/common v0.37.0 github.com/prometheus/exporter-toolkit v0.8.2 - github.com/prometheus/procfs v0.8.0 + github.com/prometheus/procfs v0.9.0 github.com/safchain/ethtool v0.2.0 github.com/soundcloud/go-runit v0.0.0-20150630195641-06ad41a06c4a - golang.org/x/sys v0.2.0 + golang.org/x/sys v0.4.0 gopkg.in/alecthomas/kingpin.v2 v2.2.6 ) diff --git a/go.sum b/go.sum index 21079bfc..04c7614a 100644 --- a/go.sum +++ b/go.sum @@ -246,8 +246,8 @@ github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsT github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= -github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= @@ -418,8 +418,8 @@ golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A= -golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=