Stop rule manager before TSDB is stopped (#10680)
During shutdown, TSDB is stopped before the rule manager is stopped. Since TSDB shutdown can take a long time (minutes or tens of minutes), this keeps the rule manager running while parts of Prometheus are already stopped (most notably the scrape manager). This can cause false positive alerts to fire, mostly those that rely on absent() calls, since new sample appends will stop while alert queries are still being evaluated. Stop rules before stopping TSDB and the scrape manager to avoid this problem. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
This commit is contained in:
parent
42f574d5ac
commit
d3c9c4f574
|
@ -838,6 +838,19 @@ func main() {
|
|||
},
|
||||
)
|
||||
}
|
||||
if !agentMode {
|
||||
// Rule manager.
|
||||
g.Add(
|
||||
func() error {
|
||||
<-reloadReady.C
|
||||
ruleManager.Run()
|
||||
return nil
|
||||
},
|
||||
func(err error) {
|
||||
ruleManager.Stop()
|
||||
},
|
||||
)
|
||||
}
|
||||
{
|
||||
// Scrape manager.
|
||||
g.Add(
|
||||
|
@ -855,6 +868,8 @@ func main() {
|
|||
func(err error) {
|
||||
// Scrape manager needs to be stopped before closing the local TSDB
|
||||
// so that it doesn't try to write samples to a closed storage.
|
||||
// We should also wait for rule manager to be fully stopped to ensure
|
||||
// we don't trigger any false positive alerts for rules using absent().
|
||||
level.Info(logger).Log("msg", "Stopping scrape manager...")
|
||||
scrapeManager.Stop()
|
||||
},
|
||||
|
@ -940,18 +955,6 @@ func main() {
|
|||
)
|
||||
}
|
||||
if !agentMode {
|
||||
// Rule manager.
|
||||
g.Add(
|
||||
func() error {
|
||||
<-reloadReady.C
|
||||
ruleManager.Run()
|
||||
return nil
|
||||
},
|
||||
func(err error) {
|
||||
ruleManager.Stop()
|
||||
},
|
||||
)
|
||||
|
||||
// TSDB.
|
||||
opts := cfg.tsdb.ToTSDBOptions()
|
||||
cancel := make(chan struct{})
|
||||
|
|
|
@ -934,6 +934,7 @@ func NewManager(o *ManagerOptions) *Manager {
|
|||
|
||||
// Run starts processing of the rule manager. It is blocking.
//
// It logs an info-level "Starting rule manager..." message, calls m.start(),
// and then waits on m.done, so it does not return until that receive
// completes.
|
||||
func (m *Manager) Run() {
|
||||
level.Info(m.logger).Log("msg", "Starting rule manager...")
|
||||
m.start()
|
||||
// Block until m.done yields a value or is closed.
// NOTE(review): presumably m.done is signalled by Stop — confirm.
<-m.done
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue