rules/manager.go: Fix race between reload and stop
On one relatively large Prometheus instance (1.7M series), I noticed that upgrades were frequently resulting in Prometheus undergoing crash recovery on start-up. On closer examination, I found that Prometheus was panicking on shutdown. It seems that our configuration management (or misconfiguration thereof) is reloading Prometheus and then immediately restarting it, which I suspect is causing this race:

Sep 21 15:12:42 host systemd[1]: Reloading prometheus monitoring system.
Sep 21 15:12:42 host prometheus[18734]: time="2016-09-21T15:12:42Z" level=info msg="Loading configuration file /etc/prometheus/config.yaml" source="main.go:221"
Sep 21 15:12:42 host systemd[1]: Reloaded prometheus monitoring system.
Sep 21 15:12:44 host systemd[1]: Stopping prometheus monitoring system...
Sep 21 15:12:44 host prometheus[18734]: time="2016-09-21T15:12:44Z" level=warning msg="Received SIGTERM, exiting gracefully..." source="main.go:203"
Sep 21 15:12:44 host prometheus[18734]: time="2016-09-21T15:12:44Z" level=info msg="See you next time!" source="main.go:210"
Sep 21 15:12:44 host prometheus[18734]: time="2016-09-21T15:12:44Z" level=info msg="Stopping target manager..." source="targetmanager.go:90"
Sep 21 15:12:52 host prometheus[18734]: time="2016-09-21T15:12:52Z" level=info msg="Checkpointing in-memory metrics and chunks..." source="persistence.go:548"
Sep 21 15:12:56 host prometheus[18734]: time="2016-09-21T15:12:56Z" level=warning msg="Error on ingesting out-of-order samples" numDropped=1 source="scrape.go:467"
Sep 21 15:12:56 host prometheus[18734]: time="2016-09-21T15:12:56Z" level=error msg="Error adding file watch for \"/etc/prometheus/targets\": no such file or directory" source="file.go:84"
Sep 21 15:12:56 host prometheus[18734]: time="2016-09-21T15:12:56Z" level=error msg="Error adding file watch for \"/etc/prometheus/targets\": no such file or directory" source="file.go:84"
Sep 21 15:13:01 host prometheus[18734]: time="2016-09-21T15:13:01Z" level=info msg="Stopping rule manager..." source="manager.go:366"
Sep 21 15:13:01 host prometheus[18734]: time="2016-09-21T15:13:01Z" level=info msg="Rule manager stopped." source="manager.go:372"
Sep 21 15:13:01 host prometheus[18734]: time="2016-09-21T15:13:01Z" level=info msg="Stopping notification handler..." source="notifier.go:325"
Sep 21 15:13:01 host prometheus[18734]: time="2016-09-21T15:13:01Z" level=info msg="Stopping local storage..." source="storage.go:381"
Sep 21 15:13:01 host prometheus[18734]: time="2016-09-21T15:13:01Z" level=info msg="Stopping maintenance loop..." source="storage.go:383"
Sep 21 15:13:01 host prometheus[18734]: panic: close of closed channel
Sep 21 15:13:01 host prometheus[18734]: goroutine 7686074 [running]:
Sep 21 15:13:01 host prometheus[18734]: panic(0xba57a0, 0xc60c42b500)
Sep 21 15:13:01 host prometheus[18734]: /usr/local/go/src/runtime/panic.go:500 +0x1a1
Sep 21 15:13:01 host prometheus[18734]: github.com/prometheus/prometheus/rules.(*Manager).ApplyConfig.func1(0xc6645a9901, 0xc420271ef0, 0xc420338ed0, 0xc60c42b4f0, 0xc6645a9900)
Sep 21 15:13:01 host prometheus[18734]: /home/build/packages/prometheus/tmp/build/gopath/src/github.com/prometheus/prometheus/rules/manager.go:412 +0x3c
Sep 21 15:13:01 host prometheus[18734]: created by github.com/prometheus/prometheus/rules.(*Manager).ApplyConfig
Sep 21 15:13:01 host prometheus[18734]: /home/build/packages/prometheus/tmp/build/gopath/src/github.com/prometheus/prometheus/rules/manager.go:423 +0x56b
Sep 21 15:13:03 host systemd[1]: prometheus.service: main process exited, code=exited, status=2/INVALIDARGUMENT

The fix makes Manager.Stop() hold the manager's mutex (m.mtx) for its whole duration, so that stopping is serialized with any other operation that takes the same lock.
This commit is contained in:
parent
4520e12440
commit
926a5ab3dd
|
@ -365,6 +365,9 @@ func (m *Manager) Run() {
|
||||||
|
|
||||||
// Stop the rule manager's rule evaluation cycles.
|
// Stop the rule manager's rule evaluation cycles.
|
||||||
func (m *Manager) Stop() {
|
func (m *Manager) Stop() {
|
||||||
|
m.mtx.Lock()
|
||||||
|
defer m.mtx.Unlock()
|
||||||
|
|
||||||
log.Info("Stopping rule manager...")
|
log.Info("Stopping rule manager...")
|
||||||
|
|
||||||
for _, eg := range m.groups {
|
for _, eg := range m.groups {
|
||||||
|
|
Loading…
Reference in New Issue