Fix cpustat when some CPUs are offline

Signed-off-by: Jia Xin <alexjx@gmail.com>
This commit is contained in:
Jia Xin 2022-03-15 12:14:27 +08:00
parent a3bd2e1305
commit 39b4556b5b
4 changed files with 109 additions and 97 deletions

View File

@ -43,7 +43,7 @@ type cpuCollector struct {
cpuPackageThrottle *prometheus.Desc
cpuIsolated *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStats map[int64]procfs.CPUStat
cpuStatsMutex sync.Mutex
isolatedCpus []uint16
@ -126,6 +126,7 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
),
logger: logger,
isolatedCpus: isolcpus,
cpuStats: make(map[int64]procfs.CPUStat),
}
err = c.compileIncludeFlags(flagsInclude, bugsInclude)
if err != nil {
@ -324,7 +325,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
for cpuID, cpuStat := range c.cpuStats {
cpuNum := strconv.Itoa(cpuID)
cpuNum := strconv.Itoa(int(cpuID))
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.System, cpuNum, "system")
@ -345,82 +346,82 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
}
// updateCPUStats updates the internal cache of CPU stats.
//
// The cache is modified while holding cpuStatsMutex. For every entry in
// newStats each counter field is merged individually: the cached value is
// replaced only when the new value is greater than or equal to it; otherwise
// the cached value is kept and a debug message is logged.
//
// NOTE(review): this span comes from a rendered diff without +/- markers, so
// superseded and replacement lines appear back to back (for example the two
// signatures directly below). Confirm against the repository before treating
// it as compilable code.
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
func (c *cpuCollector) updateCPUStats(newStats map[int64]procfs.CPUStat) {
// Acquire a lock to update the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
// Reset the cache if the list of CPUs has changed.
if len(c.cpuStats) != len(newStats) {
c.cpuStats = make([]procfs.CPUStat, len(newStats))
}
for i, n := range newStats {
cpuStats := c.cpuStats[i]
// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
// After the reset all cached fields are zero, so the checks below accept any non-negative new value.
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
c.cpuStats[i] = procfs.CPUStat{}
if (cpuStats.Idle - n.Idle) >= jumpBackSeconds {
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", cpuStats.Idle, "new_value", n.Idle)
cpuStats = procfs.CPUStat{}
}
if n.Idle >= c.cpuStats[i].Idle {
c.cpuStats[i].Idle = n.Idle
if n.Idle >= cpuStats.Idle {
cpuStats.Idle = n.Idle
} else {
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", cpuStats.Idle, "new_value", n.Idle)
}
if n.User >= c.cpuStats[i].User {
c.cpuStats[i].User = n.User
if n.User >= cpuStats.User {
cpuStats.User = n.User
} else {
level.Debug(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].User, "new_value", n.User)
level.Debug(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", cpuStats.User, "new_value", n.User)
}
if n.Nice >= c.cpuStats[i].Nice {
c.cpuStats[i].Nice = n.Nice
if n.Nice >= cpuStats.Nice {
cpuStats.Nice = n.Nice
} else {
level.Debug(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Nice, "new_value", n.Nice)
level.Debug(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", cpuStats.Nice, "new_value", n.Nice)
}
if n.System >= c.cpuStats[i].System {
c.cpuStats[i].System = n.System
if n.System >= cpuStats.System {
cpuStats.System = n.System
} else {
level.Debug(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].System, "new_value", n.System)
level.Debug(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", cpuStats.System, "new_value", n.System)
}
if n.Iowait >= c.cpuStats[i].Iowait {
c.cpuStats[i].Iowait = n.Iowait
if n.Iowait >= cpuStats.Iowait {
cpuStats.Iowait = n.Iowait
} else {
level.Debug(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Iowait, "new_value", n.Iowait)
level.Debug(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", cpuStats.Iowait, "new_value", n.Iowait)
}
if n.IRQ >= c.cpuStats[i].IRQ {
c.cpuStats[i].IRQ = n.IRQ
if n.IRQ >= cpuStats.IRQ {
cpuStats.IRQ = n.IRQ
} else {
level.Debug(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].IRQ, "new_value", n.IRQ)
level.Debug(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", cpuStats.IRQ, "new_value", n.IRQ)
}
if n.SoftIRQ >= c.cpuStats[i].SoftIRQ {
c.cpuStats[i].SoftIRQ = n.SoftIRQ
if n.SoftIRQ >= cpuStats.SoftIRQ {
cpuStats.SoftIRQ = n.SoftIRQ
} else {
level.Debug(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].SoftIRQ, "new_value", n.SoftIRQ)
level.Debug(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", cpuStats.SoftIRQ, "new_value", n.SoftIRQ)
}
if n.Steal >= c.cpuStats[i].Steal {
c.cpuStats[i].Steal = n.Steal
if n.Steal >= cpuStats.Steal {
cpuStats.Steal = n.Steal
} else {
level.Debug(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Steal, "new_value", n.Steal)
level.Debug(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", cpuStats.Steal, "new_value", n.Steal)
}
if n.Guest >= c.cpuStats[i].Guest {
c.cpuStats[i].Guest = n.Guest
if n.Guest >= cpuStats.Guest {
cpuStats.Guest = n.Guest
} else {
level.Debug(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Guest, "new_value", n.Guest)
level.Debug(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", cpuStats.Guest, "new_value", n.Guest)
}
if n.GuestNice >= c.cpuStats[i].GuestNice {
c.cpuStats[i].GuestNice = n.GuestNice
if n.GuestNice >= cpuStats.GuestNice {
cpuStats.GuestNice = n.GuestNice
} else {
level.Debug(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].GuestNice, "new_value", n.GuestNice)
level.Debug(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", cpuStats.GuestNice, "new_value", n.GuestNice)
}
// cpuStats is a copy of the cached entry, so write the merged result back.
c.cpuStats[i] = cpuStats
}
}

View File

@ -24,9 +24,16 @@ import (
"github.com/prometheus/procfs"
)
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
dup := make([]procfs.CPUStat, len(s))
copy(dup, s)
// copyStats copies every entry of s into d, overwriting any key that d
// already holds. Keys present only in d are left untouched. d must be a
// non-nil map, since writing to a nil map panics.
func copyStats(d, s map[int64]procfs.CPUStat) {
	for cpuID, stat := range s {
		d[cpuID] = stat
	}
}
func makeTestCPUCollector(s map[int64]procfs.CPUStat) *cpuCollector {
dup := make(map[int64]procfs.CPUStat, len(s))
copyStats(dup, s)
return &cpuCollector{
logger: log.NewNopLogger(),
cpuStats: dup,
@ -34,32 +41,34 @@ func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
}
func TestCPU(t *testing.T) {
firstCPUStat := []procfs.CPUStat{{
User: 100.0,
Nice: 100.0,
System: 100.0,
Idle: 100.0,
Iowait: 100.0,
IRQ: 100.0,
SoftIRQ: 100.0,
Steal: 100.0,
Guest: 100.0,
GuestNice: 100.0,
}}
firstCPUStat := map[int64]procfs.CPUStat{
0: {
User: 100.0,
Nice: 100.0,
System: 100.0,
Idle: 100.0,
Iowait: 100.0,
IRQ: 100.0,
SoftIRQ: 100.0,
Steal: 100.0,
Guest: 100.0,
GuestNice: 100.0,
}}
c := makeTestCPUCollector(firstCPUStat)
want := []procfs.CPUStat{{
User: 101.0,
Nice: 101.0,
System: 101.0,
Idle: 101.0,
Iowait: 101.0,
IRQ: 101.0,
SoftIRQ: 101.0,
Steal: 101.0,
Guest: 101.0,
GuestNice: 101.0,
}}
want := map[int64]procfs.CPUStat{
0: {
User: 101.0,
Nice: 101.0,
System: 101.0,
Idle: 101.0,
Iowait: 101.0,
IRQ: 101.0,
SoftIRQ: 101.0,
Steal: 101.0,
Guest: 101.0,
GuestNice: 101.0,
}}
c.updateCPUStats(want)
got := c.cpuStats
if !reflect.DeepEqual(want, got) {
@ -67,18 +76,19 @@ func TestCPU(t *testing.T) {
}
c = makeTestCPUCollector(firstCPUStat)
jumpBack := []procfs.CPUStat{{
User: 99.9,
Nice: 99.9,
System: 99.9,
Idle: 99.9,
Iowait: 99.9,
IRQ: 99.9,
SoftIRQ: 99.9,
Steal: 99.9,
Guest: 99.9,
GuestNice: 99.9,
}}
jumpBack := map[int64]procfs.CPUStat{
0: {
User: 99.9,
Nice: 99.9,
System: 99.9,
Idle: 99.9,
Iowait: 99.9,
IRQ: 99.9,
SoftIRQ: 99.9,
Steal: 99.9,
Guest: 99.9,
GuestNice: 99.9,
}}
c.updateCPUStats(jumpBack)
got = c.cpuStats
if reflect.DeepEqual(jumpBack, got) {
@ -86,18 +96,19 @@ func TestCPU(t *testing.T) {
}
c = makeTestCPUCollector(firstCPUStat)
resetIdle := []procfs.CPUStat{{
User: 102.0,
Nice: 102.0,
System: 102.0,
Idle: 1.0,
Iowait: 102.0,
IRQ: 102.0,
SoftIRQ: 102.0,
Steal: 102.0,
Guest: 102.0,
GuestNice: 102.0,
}}
resetIdle := map[int64]procfs.CPUStat{
0: {
User: 102.0,
Nice: 102.0,
System: 102.0,
Idle: 1.0,
Iowait: 102.0,
IRQ: 102.0,
SoftIRQ: 102.0,
Steal: 102.0,
Guest: 102.0,
GuestNice: 102.0,
}}
c.updateCPUStats(resetIdle)
got = c.cpuStats
if !reflect.DeepEqual(resetIdle, got) {

4
go.mod
View File

@ -24,10 +24,10 @@ require (
github.com/prometheus/client_model v0.3.0
github.com/prometheus/common v0.37.0
github.com/prometheus/exporter-toolkit v0.8.2
github.com/prometheus/procfs v0.8.0
github.com/prometheus/procfs v0.9.0
github.com/safchain/ethtool v0.2.0
github.com/soundcloud/go-runit v0.0.0-20150630195641-06ad41a06c4a
golang.org/x/sys v0.2.0
golang.org/x/sys v0.4.0
gopkg.in/alecthomas/kingpin.v2 v2.2.6
)

8
go.sum
View File

@ -246,8 +246,8 @@ github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsT
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo=
github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4=
github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI=
github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
@ -418,8 +418,8 @@ golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=