Merge pull request #15669 from rajagopalanand/restore-for-state

rules: Add an option to restore the `for` state of new rule groups added to an existing rule manager
Bartlomiej Plotka 2025-01-17 15:37:07 +01:00 committed by GitHub
commit 39d7242cfe
4 changed files with 164 additions and 13 deletions
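To make the new option concrete, here is a minimal sketch of how a caller might enable it, assuming only the rules package API visible in this diff (the file names, the logger variable, and the elided option wiring are illustrative):

    opts := &rules.ManagerOptions{
        Logger:               logger, // hypothetical *slog.Logger
        RestoreNewRuleGroups: true,   // also restore `for` state for groups added on reload
        // Appendable, Queryable, NotifyFunc, etc. as in the tests below.
    }
    m := rules.NewManager(opts)
    go m.Run()
    defer m.Stop()

    // First load: the manager is new, so every group gets a restore pass.
    _ = m.Update(time.Minute, []string{"rules.yaml"}, labels.EmptyLabels(), "", nil)

    // Later reload with an extra file: without RestoreNewRuleGroups the new
    // groups would skip restoration; with it set, they are restored too.
    _ = m.Update(time.Minute, []string{"rules.yaml", "more_rules.yaml"}, labels.EmptyLabels(), "", nil)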

fixtures/alert_rule.yaml

@@ -0,0 +1,6 @@
groups:
  - name: test
    interval: 1s
    rules:
      - alert: rule1
        expr: 1 < bool 2

fixtures/alert_rule1.yaml

@@ -0,0 +1,6 @@
groups:
  - name: test2
    interval: 1s
    rules:
      - alert: rule2
        expr: 1 < bool 2

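Both fixtures define a single alerting rule whose expression `1 < bool 2` always evaluates to 1 (the bool modifier turns the comparison into a 0/1 result), so the alert fires on every evaluation, giving the tests below deterministic state to check.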
rules/manager.go

@@ -90,12 +90,13 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.Time) {
// The Manager manages recording and alerting rules.
type Manager struct {
-    opts     *ManagerOptions
-    groups   map[string]*Group
-    mtx      sync.RWMutex
-    block    chan struct{}
-    done     chan struct{}
-    restored bool
+    opts                 *ManagerOptions
+    groups               map[string]*Group
+    mtx                  sync.RWMutex
+    block                chan struct{}
+    done                 chan struct{}
+    restored             bool
+    restoreNewRuleGroups bool

    logger *slog.Logger
}
@@ -122,6 +123,10 @@ type ManagerOptions struct {
    ConcurrentEvalsEnabled    bool
    RuleConcurrencyController RuleConcurrencyController
    RuleDependencyController  RuleDependencyController
+    // At present, the manager only restores `for` state when it is newly created,
+    // which happens during restarts. This flag provides an option to also restore
+    // the `for` state when new rule groups are added to an existing manager.
+    RestoreNewRuleGroups bool

    Metrics *Metrics
}
@@ -154,11 +159,12 @@ func NewManager(o *ManagerOptions) *Manager {
    }

    m := &Manager{
-        groups: map[string]*Group{},
-        opts:   o,
-        block:  make(chan struct{}),
-        done:   make(chan struct{}),
-        logger: o.Logger,
+        groups:               map[string]*Group{},
+        opts:                 o,
+        block:                make(chan struct{}),
+        done:                 make(chan struct{}),
+        logger:               o.Logger,
+        restoreNewRuleGroups: o.RestoreNewRuleGroups,
    }

    return m
@@ -296,7 +302,7 @@ func (m *Manager) LoadGroups(
) (map[string]*Group, []error) {
    groups := make(map[string]*Group)

-    shouldRestore := !m.restored
+    shouldRestore := !m.restored || m.restoreNewRuleGroups

    for _, fn := range filenames {
        rgs, errs := m.opts.GroupLoader.Load(fn, ignoreUnknownFields)
@@ -329,7 +335,7 @@
                labels.FromMap(r.Annotations),
                externalLabels,
                externalURL,
-                m.restored,
+                !shouldRestore,
                m.logger.With("alert", r.Alert),
            ))
            continue

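Restated outside the diff, the two changed lines in LoadGroups amount to the following decision (a paraphrase for clarity, not additional code from this PR):

    // A restore pass runs when the manager has never restored before (a
    // fresh start) or when the RestoreNewRuleGroups option is set.
    shouldRestore := !m.restored || m.restoreNewRuleGroups

    // Each alerting rule is constructed with restored = !shouldRestore:
    // when no restore pass will run, rules start out marked restored;
    // otherwise they start unrestored and are marked restored after the
    // group's restore pass runs.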
rules/manager_test.go

@@ -2264,6 +2264,139 @@ func TestAsyncRuleEvaluation(t *testing.T) {
})
}

func TestNewRuleGroupRestoration(t *testing.T) {
    store := teststorage.New(t)
    t.Cleanup(func() { store.Close() })

    var (
        inflightQueries atomic.Int32
        maxInflight     atomic.Int32
        maxConcurrency  int64
        interval        = 60 * time.Second
    )

    waitForEvaluations := func(t *testing.T, ch <-chan int32, targetCount int32) {
        for {
            select {
            case cnt := <-ch:
                if cnt == targetCount {
                    return
                }
            case <-time.After(5 * time.Second):
                return
            }
        }
    }

    files := []string{"fixtures/alert_rule.yaml"}

    option := optsFactory(store, &maxInflight, &inflightQueries, maxConcurrency)
    option.Queryable = store
    option.Appendable = store
    option.NotifyFunc = func(ctx context.Context, expr string, alerts ...*Alert) {}

    var evalCount atomic.Int32
    ch := make(chan int32)
    noopEvalIterFunc := func(ctx context.Context, g *Group, evalTimestamp time.Time) {
        evalCount.Inc()
        ch <- evalCount.Load()
    }

    ruleManager := NewManager(option)
    go ruleManager.Run()

    err := ruleManager.Update(interval, files, labels.EmptyLabels(), "", noopEvalIterFunc)
    require.NoError(t, err)

    waitForEvaluations(t, ch, 3)
    require.Equal(t, int32(3), evalCount.Load())

    ruleGroups := make(map[string]struct{})
    for _, group := range ruleManager.groups {
        ruleGroups[group.Name()] = struct{}{}
        require.False(t, group.shouldRestore)
        for _, rule := range group.rules {
            require.True(t, rule.(*AlertingRule).restored.Load())
        }
    }

    files = append(files, "fixtures/alert_rule1.yaml")
    err = ruleManager.Update(interval, files, labels.EmptyLabels(), "", nil)
    require.NoError(t, err)

    ruleManager.Stop()

    for _, group := range ruleManager.groups {
        // New rule groups added to an existing manager are not restored.
        require.False(t, group.shouldRestore)
    }
}
func TestNewRuleGroupRestorationWithRestoreNewGroupOption(t *testing.T) {
    store := teststorage.New(t)
    t.Cleanup(func() { store.Close() })

    var (
        inflightQueries atomic.Int32
        maxInflight     atomic.Int32
        maxConcurrency  int64
        interval        = 60 * time.Second
    )

    waitForEvaluations := func(t *testing.T, ch <-chan int32, targetCount int32) {
        for {
            select {
            case cnt := <-ch:
                if cnt == targetCount {
                    return
                }
            case <-time.After(5 * time.Second):
                return
            }
        }
    }

    files := []string{"fixtures/alert_rule.yaml"}

    option := optsFactory(store, &maxInflight, &inflightQueries, maxConcurrency)
    option.Queryable = store
    option.Appendable = store
    option.RestoreNewRuleGroups = true
    option.NotifyFunc = func(ctx context.Context, expr string, alerts ...*Alert) {}

    var evalCount atomic.Int32
    ch := make(chan int32)
    noopEvalIterFunc := func(ctx context.Context, g *Group, evalTimestamp time.Time) {
        evalCount.Inc()
        ch <- evalCount.Load()
    }

    ruleManager := NewManager(option)
    go ruleManager.Run()

    err := ruleManager.Update(interval, files, labels.EmptyLabels(), "", noopEvalIterFunc)
    require.NoError(t, err)

    waitForEvaluations(t, ch, 3)
    require.Equal(t, int32(3), evalCount.Load())

    ruleGroups := make(map[string]struct{})
    for _, group := range ruleManager.groups {
        ruleGroups[group.Name()] = struct{}{}
        require.False(t, group.shouldRestore)
        for _, rule := range group.rules {
            require.True(t, rule.(*AlertingRule).restored.Load())
        }
    }

    files = append(files, "fixtures/alert_rule1.yaml")
    err = ruleManager.Update(interval, files, labels.EmptyLabels(), "", nil)
    require.NoError(t, err)

    // Stop evaluation before inspecting group state.
    ruleManager.Stop()

    for _, group := range ruleManager.groups {
        if _, ok := ruleGroups[group.Name()]; ok {
            // Groups present before the reload were already restored.
            require.False(t, group.shouldRestore)
            continue
        }
        // New rule groups added to the existing manager are restored.
        require.True(t, group.shouldRestore)
    }
}

func TestBoundedRuleEvalConcurrency(t *testing.T) {
storage := teststorage.New(t)
t.Cleanup(func() { storage.Close() })