From 3735df3ac7ffc290f134b974d622a9cfe97374c4 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Wed, 11 Jul 2018 17:19:33 +0200 Subject: [PATCH] cluster: Do not exit when failing to join cluster (#1465) Alertmanager exits with a non-zero exit code if the initial cluster join fails. This behavior may be undesirable because: - As Alertmanager is a critical component with an at-least-once guarantee, failing when it cannot join the cluster is unnecessary, as Alertmanager still functions by itself. - In an environment like Kubernetes, where peers are discovered via DNS, peers might roll out one-by-one, leaving the DNS entries unpopulated for the first peer of a set. Failing on initial join prevents a roll-out. Instead of failing on the initial join, this patch only logs the failure. The cluster can be joined later via `handleReconnect`. This is a regression introduced in PR #1456 [1]. [1] https://github.com/prometheus/alertmanager/pull/1456 Signed-off-by: Max Leonard Inden --- cluster/cluster.go | 3 +++ cmd/alertmanager/main.go | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cluster/cluster.go b/cluster/cluster.go index f44091cb..5881a7e7 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -217,6 +217,9 @@ func (p *Peer) Join( n, err := p.mlist.Join(p.resolvedPeers) if err != nil { level.Warn(p.logger).Log("msg", "failed to join cluster", "err", err) + if reconnectInterval != 0 { + level.Info(p.logger).Log("msg", fmt.Sprintf("will retry joining cluster every %v", reconnectInterval.String())) + } } else { level.Debug(p.logger).Log("msg", "joined cluster", "peers", n) } diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index e331b566..4e0aa6d8 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -196,7 +196,7 @@ func main() { *probeInterval, ) if err != nil { - level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err) + level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err) 
os.Exit(1) } } @@ -262,8 +262,7 @@ func main() { *peerReconnectTimeout, ) if err != nil { - level.Error(logger).Log("msg", "Unable to join gossip mesh", "err", err) - os.Exit(1) + level.Warn(logger).Log("msg", "unable to join gossip mesh", "err", err) } ctx, cancel := context.WithTimeout(context.Background(), *settleTimeout) defer func() {