Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group
When a rule group changes or Prometheus is restarted, we need to ensure we restore the active alerts that were firing for a corresponding rule. To do that, Prometheus uses the `ALERTS_FOR_STATE` series to query the previous state and restore it. If a given rule has high cardinality (think 100s of 1000s of series), this process can take a bit of time. This is the first of a series of PRs to improve this problem, and I'd like to start by exposing the time it takes to restore a rule group as a gauge. Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent
76b0318ed5
commit
e7219e3d36
|
@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) {
|
|||
g.evalIterationFunc(ctx, g, evalTimestamp)
|
||||
}
|
||||
|
||||
g.RestoreForState(time.Now())
|
||||
now := time.Now()
|
||||
g.RestoreForState(now)
|
||||
g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds())
|
||||
g.shouldRestore = false
|
||||
}
|
||||
|
||||
|
@ -779,17 +781,18 @@ const namespace = "prometheus"
|
|||
|
||||
// Metrics for rule evaluation.
|
||||
type Metrics struct {
|
||||
EvalDuration prometheus.Summary
|
||||
IterationDuration prometheus.Summary
|
||||
IterationsMissed *prometheus.CounterVec
|
||||
IterationsScheduled *prometheus.CounterVec
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
GroupInterval *prometheus.GaugeVec
|
||||
GroupLastEvalTime *prometheus.GaugeVec
|
||||
GroupLastDuration *prometheus.GaugeVec
|
||||
GroupRules *prometheus.GaugeVec
|
||||
GroupSamples *prometheus.GaugeVec
|
||||
EvalDuration prometheus.Summary
|
||||
IterationDuration prometheus.Summary
|
||||
IterationsMissed *prometheus.CounterVec
|
||||
IterationsScheduled *prometheus.CounterVec
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
GroupInterval *prometheus.GaugeVec
|
||||
GroupLastEvalTime *prometheus.GaugeVec
|
||||
GroupLastDuration *prometheus.GaugeVec
|
||||
GroupLastRestoreDuration *prometheus.GaugeVec
|
||||
GroupRules *prometheus.GaugeVec
|
||||
GroupSamples *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
|
||||
|
@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
GroupLastRestoreDuration: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_group_last_restore_duration_seconds",
|
||||
Help: "The duration of the last rule group restoration.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
GroupRules: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
|
@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
m.GroupInterval,
|
||||
m.GroupLastEvalTime,
|
||||
m.GroupLastDuration,
|
||||
m.GroupLastRestoreDuration,
|
||||
m.GroupRules,
|
||||
m.GroupSamples,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue