From 070e409dbaf2ca1950e49efb4a16fecfa4dbe04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Mierzwa?= Date: Mon, 23 May 2022 10:42:01 +0100 Subject: [PATCH] Add prometheus_ready metric (#10682) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Prometheus starts it can take a long time before the WAL is replayed and it can do anything useful. While it's starting it exposes metrics and other Prometheus servers can scrape it. We do have alerts that fire if any Prometheus server is not ingesting samples and so far we've been dealing with instances that take a long time to start by adding a check on Prometheus process uptime. Relying on uptime isn't ideal because the time needed to start depends on the number of metrics scraped, and so on the amount of data in the WAL. To help write better alerts it would be great if Prometheus exposed a metric that tells us it's fully started; that way any alert that is supposed to notify us about any runtime issue can filter out starting instances. 
Signed-off-by: Łukasz Mierzwa --- cmd/prometheus/main.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 549a5b3fd..ccc41a566 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -86,6 +86,10 @@ var ( Name: "prometheus_config_last_reload_success_timestamp_seconds", Help: "Timestamp of the last successful configuration reload.", }) + readyStatus = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "prometheus_ready", + Help: "Whether Prometheus startup was fully completed and the server is ready for normal operation.", + }) defaultRetentionString = "15d" defaultRetentionDuration model.Duration @@ -752,6 +756,7 @@ func main() { prometheus.MustRegister(configSuccess) prometheus.MustRegister(configSuccessTime) + prometheus.MustRegister(readyStatus) // Start all components while we wait for TSDB to open but only load // initial config and mark ourselves as ready after it completed. @@ -946,6 +951,7 @@ func main() { webHandler.Ready() level.Info(logger).Log("msg", "Server is ready to receive web requests.") + readyStatus.Set(1) <-cancel return nil },