cluster: Do not exit when failing to join cluster (#1465)

Alertmanager exits with a non-zero exit code if the initial cluster
join fails. This behavior can be undesirable because:

- Alertmanager is a critical component with an at-least-once
guarantee, so exiting when the cluster join fails is unnecessary:
Alertmanager still functions by itself.

- In an environment like Kubernetes, where peers are discovered via DNS,
peers might roll out one by one, leaving the DNS entries unpopulated for
the first peer of a set. Failing on the initial join prevents a roll-out.

Instead of failing on the initial join, this patch only logs the failure.
The cluster can be joined later via `handleReconnect`.

This is a regression introduced in PR #1456 [1].

[1] https://github.com/prometheus/alertmanager/pull/1456

Signed-off-by: Max Leonard Inden <IndenML@gmail.com>
This commit is contained in:
Max Inden 2018-07-11 17:19:33 +02:00 committed by GitHub
parent f3bc41d256
commit 3735df3ac7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 3 deletions

View File

@ -217,6 +217,9 @@ func (p *Peer) Join(
n, err := p.mlist.Join(p.resolvedPeers) n, err := p.mlist.Join(p.resolvedPeers)
if err != nil { if err != nil {
level.Warn(p.logger).Log("msg", "failed to join cluster", "err", err) level.Warn(p.logger).Log("msg", "failed to join cluster", "err", err)
if reconnectInterval != 0 {
level.Info(p.logger).Log("msg", fmt.Sprintf("will retry joining cluster every %v", reconnectInterval.String()))
}
} else { } else {
level.Debug(p.logger).Log("msg", "joined cluster", "peers", n) level.Debug(p.logger).Log("msg", "joined cluster", "peers", n)
} }

View File

@ -196,7 +196,7 @@ func main() {
*probeInterval, *probeInterval,
) )
if err != nil { if err != nil {
level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err) level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
os.Exit(1) os.Exit(1)
} }
} }
@ -262,8 +262,7 @@ func main() {
*peerReconnectTimeout, *peerReconnectTimeout,
) )
if err != nil { if err != nil {
level.Error(logger).Log("msg", "Unable to join gossip mesh", "err", err) level.Warn(logger).Log("msg", "unable to join gossip mesh", "err", err)
os.Exit(1)
} }
ctx, cancel := context.WithTimeout(context.Background(), *settleTimeout) ctx, cancel := context.WithTimeout(context.Background(), *settleTimeout)
defer func() { defer func() {