Add context reasons to notifications failed counter (#3631)
--------- Signed-off-by: Walther Lee <walther.lee@reddit.com> Co-authored-by: Walther Lee <walther.lee@reddit.com> Co-authored-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
parent
dc1466487a
commit
3416d5a4f5
|
@ -790,6 +790,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
|
|||
case <-ctx.Done():
|
||||
if iErr == nil {
|
||||
iErr = ctx.Err()
|
||||
if errors.Is(iErr, context.Canceled) {
|
||||
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
|
||||
} else if errors.Is(iErr, context.DeadlineExceeded) {
|
||||
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
|
||||
}
|
||||
}
|
||||
|
||||
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
|
||||
|
@ -808,14 +813,15 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
|
|||
if !retry {
|
||||
return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
|
||||
}
|
||||
if ctx.Err() == nil && (iErr == nil || err.Error() != iErr.Error()) {
|
||||
// Log the error if the context isn't done and the error isn't the same as before.
|
||||
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
|
||||
if ctx.Err() == nil {
|
||||
if iErr == nil || err.Error() != iErr.Error() {
|
||||
// Log the error if the context isn't done and the error isn't the same as before.
|
||||
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
|
||||
}
|
||||
// Save this error to be able to return the last seen error by an
|
||||
// integration upon context timeout.
|
||||
iErr = err
|
||||
}
|
||||
|
||||
// Save this error to be able to return the last seen error by an
|
||||
// integration upon context timeout.
|
||||
iErr = err
|
||||
} else {
|
||||
lvl := level.Info(l)
|
||||
if i <= 1 {
|
||||
|
@ -828,6 +834,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
|
|||
case <-ctx.Done():
|
||||
if iErr == nil {
|
||||
iErr = ctx.Err()
|
||||
if errors.Is(iErr, context.Canceled) {
|
||||
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
|
||||
} else if errors.Is(iErr, context.DeadlineExceeded) {
|
||||
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
|
||||
}
|
||||
}
|
||||
|
||||
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
|
||||
|
|
|
@ -469,6 +469,39 @@ func TestRetryStageWithErrorCode(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestRetryStageWithContextCanceled runs a RetryStage whose notifier cancels
// the surrounding context and then returns an error, and checks that the
// failure is counted under the ContextCanceledReason label of the
// numTotalFailedNotifications counter.
func TestRetryStageWithContextCanceled(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
i := Integration{
|
||||
name: "test",
|
||||
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
|
||||
// Cancel the context from inside the notifier so it is already done
// when this call returns.
cancel()
|
||||
// NOTE(review): the leading `true` is presumably the "retry" flag, i.e.
// the error is reported as retryable — confirm against the notifier contract.
return true, errors.New("request failed: context canceled")
|
||||
}),
|
||||
rs: sendResolved(false),
|
||||
}
|
||||
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))
|
||||
|
||||
alerts := []*types.Alert{
|
||||
{
|
||||
Alert: model.Alert{
|
||||
EndsAt: time.Now().Add(time.Hour),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ctx = WithFiringAlerts(ctx, []uint64{0})
|
||||
|
||||
// Notify with a notifier that cancels the context and returns an error.
|
||||
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
|
||||
counter := r.metrics.numTotalFailedNotifications
|
||||
|
||||
// Exactly one failure must be recorded for this integration with the
// context-canceled reason label.
require.Equal(t, 1, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), ContextCanceledReason.String()))))
|
||||
|
||||
require.NotNil(t, err)
|
||||
require.NotNil(t, resctx)
|
||||
}
|
||||
|
||||
func TestRetryStageNoResolved(t *testing.T) {
|
||||
sent := []*types.Alert{}
|
||||
i := Integration{
|
||||
|
|
|
@ -270,6 +270,8 @@ const (
|
|||
DefaultReason Reason = iota
|
||||
ClientErrorReason
|
||||
ServerErrorReason
|
||||
ContextCanceledReason
|
||||
ContextDeadlineExceededReason
|
||||
)
|
||||
|
||||
func (s Reason) String() string {
|
||||
|
@ -280,13 +282,17 @@ func (s Reason) String() string {
|
|||
return "clientError"
|
||||
case ServerErrorReason:
|
||||
return "serverError"
|
||||
case ContextCanceledReason:
|
||||
return "contextCanceled"
|
||||
case ContextDeadlineExceededReason:
|
||||
return "contextDeadlineExceeded"
|
||||
default:
|
||||
panic(fmt.Sprintf("unknown Reason: %d", s))
|
||||
}
|
||||
}
|
||||
|
||||
// possibleFailureReasonCategory is a list of possible failure reason.
|
||||
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}
|
||||
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String(), ContextCanceledReason.String(), ContextDeadlineExceededReason.String()}
|
||||
|
||||
// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
|
||||
func GetFailureReasonFromStatusCode(statusCode int) Reason {
|
||||
|
|
Loading…
Reference in New Issue