Add context reasons to notifications failed counter (#3631)

---------

Signed-off-by: Walther Lee <walther.lee@reddit.com>
Co-authored-by: Walther Lee <walther.lee@reddit.com>
Co-authored-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
Walther Lee 2023-12-08 06:30:43 -08:00 committed by GitHub
parent dc1466487a
commit 3416d5a4f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 58 additions and 8 deletions

View File

@ -790,6 +790,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
case <-ctx.Done():
if iErr == nil {
iErr = ctx.Err()
if errors.Is(iErr, context.Canceled) {
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
} else if errors.Is(iErr, context.DeadlineExceeded) {
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
}
}
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
@ -808,14 +813,15 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
if !retry {
return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
}
if ctx.Err() == nil && (iErr == nil || err.Error() != iErr.Error()) {
// Log the error if the context isn't done and the error isn't the same as before.
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
if ctx.Err() == nil {
if iErr == nil || err.Error() != iErr.Error() {
// Log the error if the context isn't done and the error isn't the same as before.
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
}
// Save this error to be able to return the last seen error by an
// integration upon context timeout.
iErr = err
}
// Save this error to be able to return the last seen error by an
// integration upon context timeout.
iErr = err
} else {
lvl := level.Info(l)
if i <= 1 {
@ -828,6 +834,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
case <-ctx.Done():
if iErr == nil {
iErr = ctx.Err()
if errors.Is(iErr, context.Canceled) {
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
} else if errors.Is(iErr, context.DeadlineExceeded) {
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
}
}
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)

View File

@ -469,6 +469,39 @@ func TestRetryStageWithErrorCode(t *testing.T) {
}
}
func TestRetryStageWithContextCanceled(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
cancel()
return true, errors.New("request failed: context canceled")
}),
rs: sendResolved(false),
}
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))
alerts := []*types.Alert{
{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
},
}
ctx = WithFiringAlerts(ctx, []uint64{0})
// Notify with a non-recoverable error.
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
counter := r.metrics.numTotalFailedNotifications
require.Equal(t, 1, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), ContextCanceledReason.String()))))
require.NotNil(t, err)
require.NotNil(t, resctx)
}
func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{

View File

@ -270,6 +270,8 @@ const (
DefaultReason Reason = iota
ClientErrorReason
ServerErrorReason
ContextCanceledReason
ContextDeadlineExceededReason
)
func (s Reason) String() string {
@ -280,13 +282,17 @@ func (s Reason) String() string {
return "clientError"
case ServerErrorReason:
return "serverError"
case ContextCanceledReason:
return "contextCanceled"
case ContextDeadlineExceededReason:
return "contextDeadlineExceeded"
default:
panic(fmt.Sprintf("unknown Reason: %d", s))
}
}
// possibleFailureReasonCategory is a list of possible failure reason.
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String(), ContextCanceledReason.String(), ContextDeadlineExceededReason.String()}
// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
func GetFailureReasonFromStatusCode(statusCode int) Reason {