From e4416bd6122c8d69311916a6fb7f266e1f3f4b8e Mon Sep 17 00:00:00 2001 From: rhysm Date: Mon, 14 May 2018 19:22:04 +1200 Subject: [PATCH] Add additional cluster configuration flags (#1379) The cluster configuration uses DefaultLANConfig which seems to be quite sensitive to WAN conditions. Allowing the tuning of these 3 parameters (TCP Timeout, Probe Interval and Probe Timeout) makes clustering more robust across WAN connections. Signed-off-by: Rhys Meaclem --- README.md | 4 ++++ cluster/cluster.go | 9 +++++++++ cluster/cluster_test.go | 3 +++ cmd/alertmanager/main.go | 6 ++++++ 4 files changed, 22 insertions(+) diff --git a/README.md b/README.md index 3b6265d5..ecc070cd 100644 --- a/README.md +++ b/README.md @@ -301,6 +301,10 @@ be configured to communicate with each other. This is configured using the convergence speeds at expense of bandwidth (default "1m0s") - `--cluster.settle-timeout` value: maximum time to wait for cluster connections to settle before evaluating notifications. +- `--cluster.tcp-timeout` value: timeout value for TCP connections, reads and writes (default "10s") +- `--cluster.probe-timeout` value: time to wait for an ack before marking a node unhealthy + (default "500ms") +- `--cluster.probe-interval` value: interval between random node probes (default "1s") The chosen port in the `cluster.listen-address` flag is the port that needs to be specified in the `cluster.peer` flag of the other peers. 
diff --git a/cluster/cluster.go b/cluster/cluster.go index 341642f1..ad9d73c2 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -38,6 +38,9 @@ type Peer struct { const ( DefaultPushPullInterval = 60 * time.Second DefaultGossipInterval = 200 * time.Millisecond + DefaultTcpTimeout = 10 * time.Second + DefaultProbeTimeout = 500 * time.Millisecond + DefaultProbeInterval = 1 * time.Second ) func Join( @@ -49,6 +52,9 @@ func Join( waitIfEmpty bool, pushPullInterval time.Duration, gossipInterval time.Duration, + tcpTimeout time.Duration, + probeTimeout time.Duration, + probeInterval time.Duration, ) (*Peer, error) { bindHost, bindPortStr, err := net.SplitHostPort(bindAddr) if err != nil { @@ -111,6 +117,9 @@ func Join( cfg.Events = p.delegate cfg.GossipInterval = gossipInterval cfg.PushPullInterval = pushPullInterval + cfg.TCPTimeout = tcpTimeout + cfg.ProbeTimeout = probeTimeout + cfg.ProbeInterval = probeInterval cfg.LogOutput = ioutil.Discard if advertiseAddr != "" { diff --git a/cluster/cluster_test.go b/cluster/cluster_test.go index 88fb7421..968627ff 100644 --- a/cluster/cluster_test.go +++ b/cluster/cluster_test.go @@ -34,6 +34,9 @@ func TestJoin(t *testing.T) { true, 0*time.Second, 0*time.Second, + 0*time.Second, + 0*time.Second, + 0*time.Second, ) require.NoError(t, err) require.False(t, p == nil) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index fe72c561..5f0ac0cc 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -156,6 +156,9 @@ func main() { peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration() gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. 
By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration() pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration() + tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration() + probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration() + probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration() settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration() ) @@ -184,6 +187,9 @@ func main() { true, *pushPullInterval, *gossipInterval, + *tcpTimeout, + *probeTimeout, + *probeInterval, ) if err != nil { level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err)