From 65d19f433e958826d7e5fda619b302b7c4996170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Fri, 27 Sep 2024 11:09:17 +0200 Subject: [PATCH] collector: fix flapping metrics if process is enabled. (#1643) --- go.mod | 8 ++--- go.sum | 16 +++++----- pkg/collector/cs/cs.go | 1 + pkg/collector/prometheus.go | 64 +++++++++++++++++++++++++++++-------- 4 files changed, 64 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index af950a93..0907eea4 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/prometheus-community/windows_exporter go 1.23 require ( - github.com/Microsoft/hcsshim v0.12.6 + github.com/Microsoft/hcsshim v0.12.7 github.com/alecthomas/kingpin/v2 v2.4.0 github.com/bmatcuk/doublestar/v4 v4.6.1 github.com/dimchansky/utfbom v1.1.1 @@ -30,7 +30,7 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/jpillora/backoff v1.0.0 // indirect - github.com/klauspost/compress v1.17.9 // indirect + github.com/klauspost/compress v1.17.10 // indirect github.com/mdlayher/socket v0.5.1 // indirect github.com/mdlayher/vsock v1.2.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -46,8 +46,8 @@ require ( golang.org/x/oauth2 v0.23.0 // indirect golang.org/x/sync v0.8.0 // indirect golang.org/x/text v0.18.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect - google.golang.org/grpc v1.66.2 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240924160255-9d4c2d233b61 // indirect + google.golang.org/grpc v1.67.0 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) diff --git a/go.sum b/go.sum index 669f3ebf..a19723d3 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/Microsoft/hcsshim v0.12.6 h1:qEnZjoHXv+4/s0LmKZWE0/AiZmMWEIkFfWBSf1a0wlU= -github.com/Microsoft/hcsshim v0.12.6/go.mod h1:ZABCLVcvLMjIkzr9rUGcQ1QA0p0P3Ps+d3N1g2DsFfk= +github.com/Microsoft/hcsshim v0.12.7 h1:MP6R1spmjxTE4EU4J3YsrTxn8CjvN9qwjTKJXldFaRg= +github.com/Microsoft/hcsshim v0.12.7/go.mod h1:HPbAuJ9BvQYYZbB4yEQcyGIsTP5L4yHKeO9XO149AEM= github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/units v0.0.0-20240626203959-61d1e3462e30 h1:t3eaIm0rUkzbrIewtiFmMK5RXHej2XnoXNhxVsAYUfg= @@ -63,8 +63,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= -github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= +github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -163,15 +163,15 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240924160255-9d4c2d233b61 h1:N9BgCIAUvn/M+p4NJccWPWb3BWh88+zyL0ll9HgbEeM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240924160255-9d4c2d233b61/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.66.2 h1:3QdXkuq3Bkh7w+ywLdLvM56cmGvQHUMZpiCzt6Rqaoo= -google.golang.org/grpc v1.66.2/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/grpc v1.67.0 h1:IdH9y6PF5MPSdAntIcpjQ+tXO41pcQsfZV2RxtQgVcw= +google.golang.org/grpc v1.67.0/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/pkg/collector/cs/cs.go b/pkg/collector/cs/cs.go index 21d6fe55..59d24a60 100644 --- a/pkg/collector/cs/cs.go +++ b/pkg/collector/cs/cs.go @@ -97,6 +97,7 @@ func (c *Collector) Build(logger *slog.Logger, _ *wmi.Client) error { // to the provided prometheus Metric channel. func (c *Collector) Collect(_ *types.ScrapeContext, logger *slog.Logger, ch chan<- prometheus.Metric) error { logger = logger.With(slog.String("collector", Name)) + if err := c.collect(ch); err != nil { logger.Error("failed collecting cs metrics", slog.Any("err", err), diff --git a/pkg/collector/prometheus.go b/pkg/collector/prometheus.go index 29696e97..ee7883de 100644 --- a/pkg/collector/prometheus.go +++ b/pkg/collector/prometheus.go @@ -3,6 +3,7 @@ package collector import ( + "context" "fmt" "log/slog" "sync" @@ -164,37 +165,65 @@ func (p *Prometheus) Collect(ch chan<- prometheus.Metric) { ) } -func (p *Prometheus) execute(name string, c Collector, ctx *types.ScrapeContext, ch chan<- prometheus.Metric) collectorStatusCode { +func (p *Prometheus) execute(name string, c Collector, scrapeCtx *types.ScrapeContext, ch chan<- prometheus.Metric) collectorStatusCode { var ( - err error - duration time.Duration - timeout atomic.Bool + err error + numMetrics int + duration time.Duration + timeout atomic.Bool ) // bufCh is a buffer channel to store the metrics // This is needed because once timeout is reached, the prometheus registry channel is closed. - bufCh := make(chan prometheus.Metric, 10) + bufCh := make(chan prometheus.Metric, 1000) errCh := make(chan error, 1) + ctx, cancel := context.WithTimeout(context.Background(), p.maxScrapeDuration) + defer cancel() + // Execute the collector go func() { - errCh <- c.Collect(ctx, p.logger, bufCh) + defer func() { + if r := recover(); r != nil { + p.logger.Error("panic in collector "+name, + slog.Any("panic", r), + ) + } + }() + + errCh <- c.Collect(scrapeCtx, p.logger, bufCh) close(bufCh) }() + wg := sync.WaitGroup{} + wg.Add(1) + go func() { defer func() { // This prevents a panic from race-condition when closing the ch channel too early. _ = recover() + + wg.Done() }() // Pass metrics to the prometheus registry // If timeout is reached, the channel is closed. // This will cause a panic if we try to write to it. - for m := range bufCh { - if !timeout.Load() { - ch <- m + for { + select { + case <-ctx.Done(): + return + case m, ok := <-bufCh: + if !ok { + return + } + + if !timeout.Load() { + ch <- m + + numMetrics++ + } } } }() @@ -204,6 +233,8 @@ func (p *Prometheus) execute(name string, c Collector, ctx *types.ScrapeContext, // Wait for the collector to finish or timeout select { case err = <-errCh: + wg.Wait() // Wait for the buffer channel to be closed and empty + duration = time.Since(t) ch <- prometheus.MustNewConstMetric( p.collectorScrapeDurationDesc, @@ -211,7 +242,7 @@ func (p *Prometheus) execute(name string, c Collector, ctx *types.ScrapeContext, duration.Seconds(), name, ) - case <-time.After(p.maxScrapeDuration): + case <-ctx.Done(): timeout.Store(true) duration = time.Since(t) @@ -222,20 +253,27 @@ func (p *Prometheus) execute(name string, c Collector, ctx *types.ScrapeContext, name, ) - p.logger.Warn(fmt.Sprintf("collector %s timeouted after %s", name, p.maxScrapeDuration)) + p.logger.Warn(fmt.Sprintf("collector %s timeouted after %s, resulting in %d metrics", name, p.maxScrapeDuration, numMetrics)) + + go func() { + // Drain channel in case of premature return to not leak a goroutine. + //nolint:revive + for range bufCh { + } + }() return pending } if err != nil { - p.logger.Error(fmt.Sprintf("collector %s failed after %s", name, duration), + p.logger.Error(fmt.Sprintf("collector %s failed after %s, resulting in %d metrics", name, duration, numMetrics), slog.Any("err", err), ) return failed } - p.logger.Info(fmt.Sprintf("collector %s succeeded after %s", name, duration)) + p.logger.Info(fmt.Sprintf("collector %s succeeded after %s, resulting in %d metrics", name, duration, numMetrics)) return success }