mirror of
https://github.com/prometheus/alertmanager
synced 2024-12-25 15:42:18 +00:00
cluster: Do not exit when failing to join cluster (#1465)
Alertmanager exits with a non-zero exit code if the initial cluster join fails. This behavior may be undesirable because: - As Alertmanager is a critical component with an at-least-once guarantee, failing when it cannot join the cluster is unnecessary, since Alertmanager still functions by itself. - In an environment like Kubernetes, where peers are discovered via DNS, peers might roll out one-by-one, leaving the DNS entries unpopulated for the first peer of a set. Failing on the initial join prevents a roll-out. Instead of failing on the initial join, this patch only logs the failure. The cluster can be joined later via `handleReconnect`. This is a regression introduced in PR #1456 [1]. [1] https://github.com/prometheus/alertmanager/pull/1456 Signed-off-by: Max Leonard Inden <IndenML@gmail.com>
This commit is contained in:
parent
f3bc41d256
commit
3735df3ac7
@ -217,6 +217,9 @@ func (p *Peer) Join(
|
||||
n, err := p.mlist.Join(p.resolvedPeers)
|
||||
if err != nil {
|
||||
level.Warn(p.logger).Log("msg", "failed to join cluster", "err", err)
|
||||
if reconnectInterval != 0 {
|
||||
level.Info(p.logger).Log("msg", fmt.Sprintf("will retry joining cluster every %v", reconnectInterval.String()))
|
||||
}
|
||||
} else {
|
||||
level.Debug(p.logger).Log("msg", "joined cluster", "peers", n)
|
||||
}
|
||||
|
@ -196,7 +196,7 @@ func main() {
|
||||
*probeInterval,
|
||||
)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err)
|
||||
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
@ -262,8 +262,7 @@ func main() {
|
||||
*peerReconnectTimeout,
|
||||
)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to join gossip mesh", "err", err)
|
||||
os.Exit(1)
|
||||
level.Warn(logger).Log("msg", "unable to join gossip mesh", "err", err)
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), *settleTimeout)
|
||||
defer func() {
|
||||
|
Loading…
Reference in New Issue
Block a user