Add additional cluster configuration flags (#1379)

The cluster configuration uses DefaultLANConfig, which seems
to be quite sensitive to WAN conditions. Allowing these three
parameters (TCP Timeout, Probe Interval and Probe Timeout) to be tuned makes
clustering more robust across WAN connections.

Signed-off-by: Rhys Meaclem <rhysmeaclem@gmail.com>
This commit is contained in:
rhysm 2018-05-14 19:22:04 +12:00 committed by stuart nelson
parent 942be9d993
commit e4416bd612
4 changed files with 22 additions and 0 deletions

View File

@ -301,6 +301,10 @@ be configured to communicate with each other. This is configured using the
convergence speeds at expense of bandwidth (default "1m0s")
- `--cluster.settle-timeout` value: maximum time to wait for cluster
connections to settle before evaluating notifications.
- `--cluster.tcp-timeout` value: timeout for TCP connections, reads and writes (default "10s")
- `--cluster.probe-timeout` value: time to wait for an ack before marking a node unhealthy
(default "500ms")
- `--cluster.probe-interval` value: interval between random node probes (default "1s")
The chosen port in the `cluster.listen-address` flag is the port that needs to be
specified in the `cluster.peer` flag of the other peers.

View File

@ -38,6 +38,9 @@ type Peer struct {
// Default values for the cluster tuning flags.
const (
	// DefaultPushPullInterval is the default interval for gossip state syncs.
	DefaultPushPullInterval = time.Minute
	// DefaultGossipInterval is the default interval between sending gossip
	// messages.
	DefaultGossipInterval = 200 * time.Millisecond
	// DefaultTcpTimeout is the default timeout for establishing a stream
	// connection with a remote node and for stream read and write operations.
	DefaultTcpTimeout = 10 * time.Second
	// DefaultProbeTimeout is the default time to wait for an ack from a probed
	// node before assuming it is unhealthy.
	DefaultProbeTimeout = 500 * time.Millisecond
	// DefaultProbeInterval is the default interval between random node probes.
	DefaultProbeInterval = time.Second
)
func Join(
@ -49,6 +52,9 @@ func Join(
waitIfEmpty bool,
pushPullInterval time.Duration,
gossipInterval time.Duration,
tcpTimeout time.Duration,
probeTimeout time.Duration,
probeInterval time.Duration,
) (*Peer, error) {
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
if err != nil {
@ -111,6 +117,9 @@ func Join(
cfg.Events = p.delegate
cfg.GossipInterval = gossipInterval
cfg.PushPullInterval = pushPullInterval
cfg.TCPTimeout = tcpTimeout
cfg.ProbeTimeout = probeTimeout
cfg.ProbeInterval = probeInterval
cfg.LogOutput = ioutil.Discard
if advertiseAddr != "" {

View File

@ -34,6 +34,9 @@ func TestJoin(t *testing.T) {
true,
0*time.Second,
0*time.Second,
0*time.Second,
0*time.Second,
0*time.Second,
)
require.NoError(t, err)
require.False(t, p == nil)

View File

@ -156,6 +156,9 @@ func main() {
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
)
@ -184,6 +187,9 @@ func main() {
true,
*pushPullInterval,
*gossipInterval,
*tcpTimeout,
*probeTimeout,
*probeInterval,
)
if err != nil {
level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err)