Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group

When a rule group changes or prometheus is restarted we need to ensure we restore the active alerts that were firing for a corresponding rule, for that Prometheus uses the `ALERTS_FOR_STATE` series to query the previous state and restore it. If a given rule has high cardinality (think 100s of 1000s for series) this proccess can take a bit of time - this is the first of a series of PRs to improve this problem and I'd like to start with exposing the time it takes to restore a rule group as a gauge.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
gotjosh 2024-04-23 09:54:21 +01:00
parent 76b0318ed5
commit e7219e3d36
No known key found for this signature in database
GPG Key ID: A6E1DDE38FF3C74E
1 changed files with 24 additions and 12 deletions

View File

@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) {
g.evalIterationFunc(ctx, g, evalTimestamp)
}
g.RestoreForState(time.Now())
now := time.Now()
g.RestoreForState(now)
g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds())
g.shouldRestore = false
}
@ -779,17 +781,18 @@ const namespace = "prometheus"
// Metrics for rule evaluation.
type Metrics struct {
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
}
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
},
[]string{"rule_group"},
),
GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_restore_duration_seconds",
Help: "The duration of the last rule group restoration.",
},
[]string{"rule_group"},
),
GroupRules: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval,
m.GroupLastEvalTime,
m.GroupLastDuration,
m.GroupLastRestoreDuration,
m.GroupRules,
m.GroupSamples,
)