Add additional cluster configuration flags (#1379)
The cluster configuration uses DefaultLANConfig which seems to be quite sensitive to WAN conditions. Allowing the tuning of these 3 parameters (TCP Timeout, Probe Interval and Probe Timeout) makes clustering more robust across WAN connections. Signed-off-by: Rhys Meaclem <rhysmeaclem@gmail.com>
This commit is contained in:
parent
942be9d993
commit
e4416bd612
|
@ -301,6 +301,10 @@ be configured to communicate with each other. This is configured using the
|
|||
convergence speeds at the expense of bandwidth (default "1m0s")
|
||||
- `--cluster.settle-timeout` value: maximum time to wait for cluster
|
||||
connections to settle before evaluating notifications (default "1m0s").
|
||||
- `--cluster.tcp-timeout` value: timeout for TCP connections, reads and writes (default "10s")
|
||||
- `--cluster.probe-timeout` value: time to wait for ack before marking node unhealthy
|
||||
(default "500ms")
|
||||
- `--cluster.probe-interval` value: interval between random node probes (default "1s")
|
||||
|
||||
The chosen port in the `cluster.listen-address` flag is the port that needs to be
|
||||
specified in the `cluster.peer` flag of the other peers.
|
||||
|
|
|
@ -38,6 +38,9 @@ type Peer struct {
|
|||
// Default values for the cluster gossip tuning parameters. Each one is used
// as the default of a --cluster.* command-line flag, and the resulting flag
// values are passed to Join, which copies them onto the cluster config.
const (
|
||||
// DefaultPushPullInterval is the default for --cluster.pushpull-interval
// (interval for gossip state syncs). The same constant is also used as the
// default for --cluster.settle-timeout.
DefaultPushPullInterval = 60 * time.Second
|
||||
// DefaultGossipInterval is the default for --cluster.gossip-interval
// (interval between sending gossip messages).
DefaultGossipInterval = 200 * time.Millisecond
|
||||
// DefaultTcpTimeout is the default for --cluster.tcp-timeout; Join assigns
// the flag value to cfg.TCPTimeout.
// NOTE(review): Go naming convention would spell this DefaultTCPTimeout,
// but the name is exported and referenced by callers, so it is kept as-is.
DefaultTcpTimeout = 10 * time.Second
|
||||
// DefaultProbeTimeout is the default for --cluster.probe-timeout (time to
// wait for an ack from a probed node); Join assigns the flag value to
// cfg.ProbeTimeout.
DefaultProbeTimeout = 500 * time.Millisecond
|
||||
// DefaultProbeInterval is the default for --cluster.probe-interval
// (interval between random node probes); Join assigns the flag value to
// cfg.ProbeInterval.
DefaultProbeInterval = 1 * time.Second
|
||||
)
|
||||
|
||||
func Join(
|
||||
|
@ -49,6 +52,9 @@ func Join(
|
|||
waitIfEmpty bool,
|
||||
pushPullInterval time.Duration,
|
||||
gossipInterval time.Duration,
|
||||
tcpTimeout time.Duration,
|
||||
probeTimeout time.Duration,
|
||||
probeInterval time.Duration,
|
||||
) (*Peer, error) {
|
||||
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
|
||||
if err != nil {
|
||||
|
@ -111,6 +117,9 @@ func Join(
|
|||
cfg.Events = p.delegate
|
||||
cfg.GossipInterval = gossipInterval
|
||||
cfg.PushPullInterval = pushPullInterval
|
||||
cfg.TCPTimeout = tcpTimeout
|
||||
cfg.ProbeTimeout = probeTimeout
|
||||
cfg.ProbeInterval = probeInterval
|
||||
cfg.LogOutput = ioutil.Discard
|
||||
|
||||
if advertiseAddr != "" {
|
||||
|
|
|
@ -34,6 +34,9 @@ func TestJoin(t *testing.T) {
|
|||
true,
|
||||
0*time.Second,
|
||||
0*time.Second,
|
||||
0*time.Second,
|
||||
0*time.Second,
|
||||
0*time.Second,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.False(t, p == nil)
|
||||
|
|
|
@ -156,6 +156,9 @@ func main() {
|
|||
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
||||
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
||||
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
|
||||
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
|
||||
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
|
||||
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
)
|
||||
|
||||
|
@ -184,6 +187,9 @@ func main() {
|
|||
true,
|
||||
*pushPullInterval,
|
||||
*gossipInterval,
|
||||
*tcpTimeout,
|
||||
*probeTimeout,
|
||||
*probeInterval,
|
||||
)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err)
|
||||
|
|
Loading…
Reference in New Issue