Limit number of alerts or series produced by a rule (#9260)
* Add limit to rules Signed-off-by: Levi Harrison <git@leviharrison.dev>
This commit is contained in:
parent
1ea774f184
commit
dc2f1993d8
|
@ -78,6 +78,10 @@ name: <string>
|
|||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# Limit the number of alerts and series individual rules can produce.
|
||||
# 0 is no limit.
|
||||
[ limit: <int> | default = 0 ]
|
||||
|
||||
rules:
|
||||
[ - <rule> ... ]
|
||||
```
|
||||
|
|
|
@ -107,6 +107,7 @@ func (g *RuleGroups) Validate(node ruleGroups) (errs []error) {
|
|||
type RuleGroup struct {
|
||||
Name string `yaml:"name"`
|
||||
Interval model.Duration `yaml:"interval,omitempty"`
|
||||
Limit int `yaml:"limit,omitempty"`
|
||||
Rules []RuleNode `yaml:"rules"`
|
||||
}
|
||||
|
||||
|
|
|
@ -297,7 +297,7 @@ const resolvedRetention = 15 * time.Minute
|
|||
|
||||
// Eval evaluates the rule expression and then creates pending alerts and fires
|
||||
// or removes previously pending alerts accordingly.
|
||||
func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, externalURL *url.URL) (promql.Vector, error) {
|
||||
func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, externalURL *url.URL, limit int) (promql.Vector, error) {
|
||||
res, err := query(ctx, r.vector.String(), ts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -415,6 +415,12 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
|||
}
|
||||
}
|
||||
|
||||
numActive := len(r.active)
|
||||
if limit != 0 && numActive > limit {
|
||||
r.active = map[uint64]*Alert{}
|
||||
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive)
|
||||
}
|
||||
|
||||
return vec, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -170,7 +170,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
|
|||
t.Logf("case %d", i)
|
||||
evalTime := baseTime.Add(time.Duration(i) * time.Minute)
|
||||
result[0].Point.T = timestamp.FromTime(evalTime)
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil)
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
|
||||
|
@ -252,7 +252,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
|
|||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
|
||||
res, err := ruleWithoutExternalLabels.Eval(
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil,
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
for _, smpl := range res {
|
||||
|
@ -266,7 +266,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
|
|||
}
|
||||
|
||||
res, err = ruleWithExternalLabels.Eval(
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil,
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
for _, smpl := range res {
|
||||
|
@ -346,7 +346,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
|
|||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
|
||||
res, err := ruleWithoutExternalURL.Eval(
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil,
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
for _, smpl := range res {
|
||||
|
@ -360,7 +360,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
|
|||
}
|
||||
|
||||
res, err = ruleWithExternalURL.Eval(
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil,
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
for _, smpl := range res {
|
||||
|
@ -417,7 +417,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
|
|||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
|
||||
res, err := rule.Eval(
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil,
|
||||
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
for _, smpl := range res {
|
||||
|
@ -460,7 +460,61 @@ func TestAlertingRuleDuplicate(t *testing.T) {
|
|||
"",
|
||||
true, log.NewNopLogger(),
|
||||
)
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil)
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
|
||||
require.Error(t, err)
|
||||
require.EqualError(t, err, "vector contains metrics with the same labelset after applying alert labels")
|
||||
}
|
||||
|
||||
func TestAlertingRuleLimit(t *testing.T) {
|
||||
storage := teststorage.New(t)
|
||||
defer storage.Close()
|
||||
|
||||
opts := promql.EngineOpts{
|
||||
Logger: nil,
|
||||
Reg: nil,
|
||||
MaxSamples: 10,
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
engine := promql.NewEngine(opts)
|
||||
ctx, cancelCtx := context.WithCancel(context.Background())
|
||||
defer cancelCtx()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
suite := []struct {
|
||||
limit int
|
||||
err string
|
||||
}{
|
||||
{
|
||||
limit: 0,
|
||||
},
|
||||
{
|
||||
limit: 1,
|
||||
},
|
||||
{
|
||||
limit: -1,
|
||||
err: "exceeded limit of -1 with 1 alerts",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range suite {
|
||||
expr, _ := parser.ParseExpr(`1`)
|
||||
rule := NewAlertingRule(
|
||||
"foo",
|
||||
expr,
|
||||
time.Minute,
|
||||
labels.FromStrings("test", "test"),
|
||||
nil,
|
||||
nil,
|
||||
"",
|
||||
true, log.NewNopLogger(),
|
||||
)
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
||||
if test.err == "" {
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
require.Equal(t, test.err, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -213,7 +213,7 @@ type Rule interface {
|
|||
// Labels of the rule.
|
||||
Labels() labels.Labels
|
||||
// eval evaluates the rule, including any associated recording or alerting actions.
|
||||
Eval(context.Context, time.Time, QueryFunc, *url.URL) (promql.Vector, error)
|
||||
Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error)
|
||||
// String returns a human-readable string representation of the rule.
|
||||
String() string
|
||||
// Query returns the rule query expression.
|
||||
|
@ -244,6 +244,7 @@ type Group struct {
|
|||
name string
|
||||
file string
|
||||
interval time.Duration
|
||||
limit int
|
||||
rules []Rule
|
||||
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
|
||||
staleSeries []labels.Labels
|
||||
|
@ -267,6 +268,7 @@ type Group struct {
|
|||
type GroupOptions struct {
|
||||
Name, File string
|
||||
Interval time.Duration
|
||||
Limit int
|
||||
Rules []Rule
|
||||
ShouldRestore bool
|
||||
Opts *ManagerOptions
|
||||
|
@ -295,6 +297,7 @@ func NewGroup(o GroupOptions) *Group {
|
|||
name: o.Name,
|
||||
file: o.File,
|
||||
interval: o.Interval,
|
||||
limit: o.Limit,
|
||||
rules: o.Rules,
|
||||
shouldRestore: o.ShouldRestore,
|
||||
opts: o.Opts,
|
||||
|
@ -319,6 +322,9 @@ func (g *Group) Rules() []Rule { return g.rules }
|
|||
// Interval returns the group's interval.
|
||||
func (g *Group) Interval() time.Duration { return g.interval }
|
||||
|
||||
// Limit returns the group's limit.
|
||||
func (g *Group) Limit() int { return g.limit }
|
||||
|
||||
func (g *Group) run(ctx context.Context) {
|
||||
defer close(g.terminated)
|
||||
|
||||
|
@ -591,7 +597,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
|||
|
||||
g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
|
||||
|
||||
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
|
||||
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit())
|
||||
if err != nil {
|
||||
rule.SetHealth(HealthBad)
|
||||
rule.SetLastError(err)
|
||||
|
@ -850,6 +856,10 @@ func (g *Group) Equals(ng *Group) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
if g.limit != ng.limit {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(g.rules) != len(ng.rules) {
|
||||
return false
|
||||
}
|
||||
|
@ -1086,6 +1096,7 @@ func (m *Manager) LoadGroups(
|
|||
Name: rg.Name,
|
||||
File: fn,
|
||||
Interval: itv,
|
||||
Limit: rg.Limit,
|
||||
Rules: rules,
|
||||
ShouldRestore: shouldRestore,
|
||||
Opts: m.opts,
|
||||
|
|
|
@ -156,7 +156,7 @@ func TestAlertingRule(t *testing.T) {
|
|||
|
||||
evalTime := baseTime.Add(test.time)
|
||||
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil)
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
|
||||
|
@ -305,7 +305,7 @@ func TestForStateAddSamples(t *testing.T) {
|
|||
forState = float64(value.StaleNaN)
|
||||
}
|
||||
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil)
|
||||
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
var filteredRes promql.Vector // After removing 'ALERTS' samples.
|
||||
|
@ -773,6 +773,12 @@ func TestUpdate(t *testing.T) {
|
|||
}
|
||||
reloadAndValidate(rgs, t, tmpFile, ruleManager, expected, ogs)
|
||||
|
||||
// Update limit and reload.
|
||||
for i := range rgs.Groups {
|
||||
rgs.Groups[i].Limit = 1
|
||||
}
|
||||
reloadAndValidate(rgs, t, tmpFile, ruleManager, expected, ogs)
|
||||
|
||||
// Change group rules and reload.
|
||||
for i, g := range rgs.Groups {
|
||||
for j, r := range g.Rules {
|
||||
|
@ -791,6 +797,7 @@ type ruleGroupsTest struct {
|
|||
type ruleGroupTest struct {
|
||||
Name string `yaml:"name"`
|
||||
Interval model.Duration `yaml:"interval,omitempty"`
|
||||
Limit int `yaml:"limit,omitempty"`
|
||||
Rules []rulefmt.Rule `yaml:"rules"`
|
||||
}
|
||||
|
||||
|
@ -812,6 +819,7 @@ func formatRules(r *rulefmt.RuleGroups) ruleGroupsTest {
|
|||
tmp = append(tmp, ruleGroupTest{
|
||||
Name: g.Name,
|
||||
Interval: g.Interval,
|
||||
Limit: g.Limit,
|
||||
Rules: rtmp,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -73,7 +73,7 @@ func (rule *RecordingRule) Labels() labels.Labels {
|
|||
}
|
||||
|
||||
// Eval evaluates the rule and then overrides the metric names and labels accordingly.
|
||||
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL) (promql.Vector, error) {
|
||||
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL, limit int) (promql.Vector, error) {
|
||||
vector, err := query(ctx, rule.vector.String(), ts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -99,6 +99,13 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu
|
|||
return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
|
||||
}
|
||||
|
||||
numSamples := len(vector)
|
||||
if limit != 0 && numSamples > limit {
|
||||
return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples)
|
||||
}
|
||||
|
||||
rule.SetHealth(HealthGood)
|
||||
rule.SetLastError(err)
|
||||
return vector, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -49,7 +49,9 @@ func TestRuleEval(t *testing.T) {
|
|||
name string
|
||||
expr parser.Expr
|
||||
labels labels.Labels
|
||||
limit int
|
||||
result promql.Vector
|
||||
err string
|
||||
}{
|
||||
{
|
||||
name: "nolabels",
|
||||
|
@ -69,12 +71,43 @@ func TestRuleEval(t *testing.T) {
|
|||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "underlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: 2,
|
||||
result: promql.Vector{promql.Sample{
|
||||
Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"),
|
||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "atlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: 1,
|
||||
result: promql.Vector{promql.Sample{
|
||||
Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"),
|
||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "overlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: -1,
|
||||
err: "exceeded limit -1 with 1 samples",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range suite {
|
||||
rule := NewRecordingRule(test.name, test.expr, test.labels)
|
||||
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil)
|
||||
require.NoError(t, err)
|
||||
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
||||
if test.err == "" {
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
require.Equal(t, test.err, err.Error())
|
||||
}
|
||||
require.Equal(t, test.result, result)
|
||||
}
|
||||
}
|
||||
|
@ -114,7 +147,7 @@ func TestRuleEvalDuplicate(t *testing.T) {
|
|||
|
||||
expr, _ := parser.ParseExpr(`vector(0) or label_replace(vector(0),"test","x","","")`)
|
||||
rule := NewRecordingRule("foo", expr, labels.FromStrings("test", "test"))
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil)
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
|
||||
require.Error(t, err)
|
||||
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue