notify: improve logs on notification errors (#2273)

* notify: improve logs on notification errors

Alertmanager can experience occasional failures when sending
notifications to an external service. If the operation succeeds after
some retry, the 'alertmanager_notifications_failed_total' metric
increases but nothing is logged (unless running with log.level=debug).
Hence an operator might receive an alert about notification failures but
wouldn't know which integration was failing.

With this change, notification failures are logged at the warning level.
To avoid log flooding, similar failures on retries aren't logged.
Additional information on the failing integration has also been added.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Log notify success at info level if it's a retry

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
Simon Pasquier 2020-06-04 10:38:48 +02:00 committed by GitHub
parent 70b5e00ffc
commit f7c595c168
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 32 additions and 25 deletions

View File

@ -24,6 +24,7 @@ import (
"github.com/cespare/xxhash"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
@ -90,6 +91,11 @@ func (i *Integration) Index() int {
return i.idx
}
// String implements the Stringer interface.
func (i *Integration) String() string {
return fmt.Sprintf("%s[%d]", i.name, i.idx)
}
// notifyKey defines a custom type with which a context is populated to
// avoid accidental collisions.
type notifyKey int
@ -316,12 +322,12 @@ type RoutingStage map[string]Stage
func (rs RoutingStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
receiver, ok := ReceiverName(ctx)
if !ok {
return ctx, nil, fmt.Errorf("receiver missing")
return ctx, nil, errors.New("receiver missing")
}
s, ok := rs[receiver]
if !ok {
return ctx, nil, fmt.Errorf("stage for receiver missing")
return ctx, nil, errors.New("stage for receiver missing")
}
return s.Exec(ctx, l, alerts...)
@ -362,14 +368,6 @@ func (fs FanoutStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.A
go func(s Stage) {
if _, _, err := s.Exec(ctx, l, alerts...); err != nil {
me.Add(err)
lvl := level.Error(l)
if ctx.Err() == context.Canceled {
// It is expected for the context to be canceled on
// configuration reload or shutdown. In this case, the
// message should only be logged at the debug level.
lvl = level.Debug(l)
}
lvl.Log("msg", "Error on notify", "err", err, "context_err", ctx.Err())
}
wg.Done()
}(s)
@ -547,12 +545,12 @@ func (n *DedupStage) needsUpdate(entry *nflogpb.Entry, firing, resolved map[uint
func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
gkey, ok := GroupKey(ctx)
if !ok {
return ctx, nil, fmt.Errorf("group key missing")
return ctx, nil, errors.New("group key missing")
}
repeatInterval, ok := RepeatInterval(ctx)
if !ok {
return ctx, nil, fmt.Errorf("repeat interval missing")
return ctx, nil, errors.New("repeat interval missing")
}
firingSet := map[uint64]struct{}{}
@ -586,7 +584,7 @@ func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Al
case 1:
entry = entries[0]
default:
return ctx, nil, fmt.Errorf("unexpected entry result size %d", len(entries))
return ctx, nil, errors.Errorf("unexpected entry result size %d", len(entries))
}
if n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
@ -622,7 +620,7 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
if !r.integration.SendResolved() {
firing, ok := FiringAlerts(ctx)
if !ok {
return ctx, nil, fmt.Errorf("firing alerts missing")
return ctx, nil, errors.New("firing alerts missing")
}
if len(firing) == 0 {
return ctx, alerts, nil
@ -643,17 +641,18 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
iErr error
)
defer tick.Stop()
l = log.With(l, "receiver", r.groupName, "integration", r.integration.String())
for {
i++
// Always check the context first to not notify again.
select {
case <-ctx.Done():
if iErr != nil {
return ctx, nil, iErr
if iErr == nil {
iErr = ctx.Err()
}
return ctx, nil, ctx.Err()
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
default:
}
@ -665,23 +664,31 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
if err != nil {
r.metrics.numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.Name(), "receiver", r.groupName, "retry", retry, "err", err)
if !retry {
return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.Name(), err)
return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
}
if ctx.Err() == nil && (iErr == nil || err.Error() != iErr.Error()) {
// Log the error if the context isn't done and the error isn't the same as before.
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
}
// Save this error to be able to return the last seen error by an
// integration upon context timeout.
iErr = err
} else {
lvl := level.Debug(l)
if i > 1 {
lvl = level.Info(l)
}
lvl.Log("msg", "Notify success", "attempts", i)
return ctx, alerts, nil
}
case <-ctx.Done():
if iErr != nil {
return ctx, nil, iErr
if iErr == nil {
iErr = ctx.Err()
}
return ctx, nil, ctx.Err()
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
}
}
}
@ -705,17 +712,17 @@ func NewSetNotifiesStage(l NotificationLog, recv *nflogpb.Receiver) *SetNotifies
func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
gkey, ok := GroupKey(ctx)
if !ok {
return ctx, nil, fmt.Errorf("group key missing")
return ctx, nil, errors.New("group key missing")
}
firing, ok := FiringAlerts(ctx)
if !ok {
return ctx, nil, fmt.Errorf("firing alerts missing")
return ctx, nil, errors.New("firing alerts missing")
}
resolved, ok := ResolvedAlerts(ctx)
if !ok {
return ctx, nil, fmt.Errorf("resolved alerts missing")
return ctx, nil, errors.New("resolved alerts missing")
}
return ctx, alerts, n.nflog.Log(n.recv, gkey, firing, resolved)