Rule alerts/series limit updates (#9541)
* Add docs and do not limit inactive alerts. Signed-off-by: Levi Harrison <git@leviharrison.dev>
This commit is contained in:
parent
8c3eca84db
commit
d81bbe154d
|
@ -78,8 +78,8 @@ name: <string>
|
|||
# How often rules in the group are evaluated.
|
||||
[ interval: <duration> | default = global.evaluation_interval ]
|
||||
|
||||
# Limit the number of alerts and series individual rules can produce.
|
||||
# 0 is no limit.
|
||||
# Limit the number of alerts an alerting rule and series a recording
|
||||
# rule can produce. 0 is no limit.
|
||||
[ limit: <int> | default = 0 ]
|
||||
|
||||
rules:
|
||||
|
@ -128,3 +128,11 @@ annotations:
|
|||
[ <labelname>: <tmpl_string> ]
|
||||
```
|
||||
|
||||
# Limiting alerts and series
|
||||
|
||||
A limit for alerts produced by alerting rules and series produced recording rules
|
||||
can be configured per-group. When the limit is exceeded, _all_ series produced
|
||||
by the rule are discarded, and if it's an alerting rule, _all_ alerts for
|
||||
the rule, active, pending, or inactive, are cleared as well. The event will be
|
||||
recorded as an error in the evaluation, and as such no stale markers are
|
||||
written.
|
||||
|
|
|
@ -389,6 +389,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
|||
r.active[h] = a
|
||||
}
|
||||
|
||||
var numActivePending int
|
||||
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
|
||||
for fp, a := range r.active {
|
||||
if _, ok := resultFPs[fp]; !ok {
|
||||
|
@ -403,6 +404,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
|||
}
|
||||
continue
|
||||
}
|
||||
numActivePending++
|
||||
|
||||
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
|
||||
a.State = StateFiring
|
||||
|
@ -415,10 +417,9 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
|||
}
|
||||
}
|
||||
|
||||
numActive := len(r.active)
|
||||
if limit != 0 && numActive > limit {
|
||||
if limit > 0 && numActivePending > limit {
|
||||
r.active = map[uint64]*Alert{}
|
||||
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive)
|
||||
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
}
|
||||
|
||||
return vec, nil
|
||||
|
|
|
@ -466,23 +466,17 @@ func TestAlertingRuleDuplicate(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestAlertingRuleLimit(t *testing.T) {
|
||||
storage := teststorage.New(t)
|
||||
defer storage.Close()
|
||||
suite, err := promql.NewTest(t, `
|
||||
load 1m
|
||||
metric{label="1"} 1
|
||||
metric{label="2"} 1
|
||||
`)
|
||||
require.NoError(t, err)
|
||||
defer suite.Close()
|
||||
|
||||
opts := promql.EngineOpts{
|
||||
Logger: nil,
|
||||
Reg: nil,
|
||||
MaxSamples: 10,
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
require.NoError(t, suite.Run())
|
||||
|
||||
engine := promql.NewEngine(opts)
|
||||
ctx, cancelCtx := context.WithCancel(context.Background())
|
||||
defer cancelCtx()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
suite := []struct {
|
||||
tests := []struct {
|
||||
limit int
|
||||
err string
|
||||
}{
|
||||
|
@ -490,31 +484,37 @@ func TestAlertingRuleLimit(t *testing.T) {
|
|||
limit: 0,
|
||||
},
|
||||
{
|
||||
limit: 1,
|
||||
limit: -1,
|
||||
},
|
||||
{
|
||||
limit: -1,
|
||||
err: "exceeded limit of -1 with 1 alerts",
|
||||
limit: 2,
|
||||
},
|
||||
{
|
||||
limit: 1,
|
||||
err: "exceeded limit of 1 with 2 alerts",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range suite {
|
||||
expr, _ := parser.ParseExpr(`1`)
|
||||
rule := NewAlertingRule(
|
||||
"foo",
|
||||
expr,
|
||||
time.Minute,
|
||||
labels.FromStrings("test", "test"),
|
||||
nil,
|
||||
nil,
|
||||
"",
|
||||
true, log.NewNopLogger(),
|
||||
)
|
||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
||||
if test.err == "" {
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
require.Equal(t, test.err, err.Error())
|
||||
expr, _ := parser.ParseExpr(`metric > 0`)
|
||||
rule := NewAlertingRule(
|
||||
"foo",
|
||||
expr,
|
||||
time.Minute,
|
||||
labels.FromStrings("test", "test"),
|
||||
nil,
|
||||
nil,
|
||||
"",
|
||||
true, log.NewNopLogger(),
|
||||
)
|
||||
|
||||
evalTime := time.Unix(0, 0)
|
||||
|
||||
for _, test := range tests {
|
||||
_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
|
||||
if err != nil {
|
||||
require.EqualError(t, err, test.err)
|
||||
} else if test.err != "" {
|
||||
t.Errorf("Expected errror %s, got none", test.err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -99,9 +99,9 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu
|
|||
return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
|
||||
}
|
||||
|
||||
numSamples := len(vector)
|
||||
if limit != 0 && numSamples > limit {
|
||||
return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples)
|
||||
numSeries := len(vector)
|
||||
if limit > 0 && numSeries > limit {
|
||||
return nil, fmt.Errorf("exceeded limit of %d with %d series", limit, numSeries)
|
||||
}
|
||||
|
||||
rule.SetHealth(HealthGood)
|
||||
|
|
|
@ -49,7 +49,6 @@ func TestRuleEval(t *testing.T) {
|
|||
name string
|
||||
expr parser.Expr
|
||||
labels labels.Labels
|
||||
limit int
|
||||
result promql.Vector
|
||||
err string
|
||||
}{
|
||||
|
@ -71,38 +70,11 @@ func TestRuleEval(t *testing.T) {
|
|||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "underlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: 2,
|
||||
result: promql.Vector{promql.Sample{
|
||||
Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"),
|
||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "atlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: 1,
|
||||
result: promql.Vector{promql.Sample{
|
||||
Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"),
|
||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "overlimit",
|
||||
expr: &parser.NumberLiteral{Val: 1},
|
||||
labels: labels.FromStrings("foo", "bar"),
|
||||
limit: -1,
|
||||
err: "exceeded limit -1 with 1 samples",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range suite {
|
||||
rule := NewRecordingRule(test.name, test.expr, test.labels)
|
||||
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
||||
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
|
||||
if test.err == "" {
|
||||
require.NoError(t, err)
|
||||
} else {
|
||||
|
@ -151,3 +123,52 @@ func TestRuleEvalDuplicate(t *testing.T) {
|
|||
require.Error(t, err)
|
||||
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
|
||||
}
|
||||
|
||||
func TestRecordingRuleLimit(t *testing.T) {
|
||||
suite, err := promql.NewTest(t, `
|
||||
load 1m
|
||||
metric{label="1"} 1
|
||||
metric{label="2"} 1
|
||||
`)
|
||||
require.NoError(t, err)
|
||||
defer suite.Close()
|
||||
|
||||
require.NoError(t, suite.Run())
|
||||
|
||||
tests := []struct {
|
||||
limit int
|
||||
err string
|
||||
}{
|
||||
{
|
||||
limit: 0,
|
||||
},
|
||||
{
|
||||
limit: -1,
|
||||
},
|
||||
{
|
||||
limit: 2,
|
||||
},
|
||||
{
|
||||
limit: 1,
|
||||
err: "exceeded limit of 1 with 2 series",
|
||||
},
|
||||
}
|
||||
|
||||
expr, _ := parser.ParseExpr(`metric > 0`)
|
||||
rule := NewRecordingRule(
|
||||
"foo",
|
||||
expr,
|
||||
labels.FromStrings("test", "test"),
|
||||
)
|
||||
|
||||
evalTime := time.Unix(0, 0)
|
||||
|
||||
for _, test := range tests {
|
||||
_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
|
||||
if err != nil {
|
||||
require.EqualError(t, err, test.err)
|
||||
} else if test.err != "" {
|
||||
t.Errorf("Expected error %s, got none", test.err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue