Test Flake: Fix Cluster join/leave flake

Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
gotjosh 2023-03-08 15:10:28 +00:00
parent 3ee2cd0f12
commit 9a89828c1f
No known key found for this signature in database
GPG Key ID: A6E1DDE38FF3C74E
2 changed files with 38 additions and 7 deletions

View File

@ -24,6 +24,7 @@ import (
"sync" "sync"
"time" "time"
"github.com/benbjohnson/clock"
"github.com/go-kit/log" "github.com/go-kit/log"
"github.com/go-kit/log/level" "github.com/go-kit/log/level"
"github.com/hashicorp/memberlist" "github.com/hashicorp/memberlist"
@ -57,6 +58,7 @@ type ClusterChannel interface {
// Peer is a single peer in a gossip cluster. // Peer is a single peer in a gossip cluster.
type Peer struct { type Peer struct {
clock clock.Clock
mlist *memberlist.Memberlist mlist *memberlist.Memberlist
delegate *delegate delegate *delegate
@ -194,6 +196,7 @@ func Create(
} }
p := &Peer{ p := &Peer{
clock: clock.New(),
states: map[string]State{}, states: map[string]State{},
stopc: make(chan struct{}), stopc: make(chan struct{}),
readyc: make(chan struct{}), readyc: make(chan struct{}),
@ -394,7 +397,7 @@ func (p *Peer) register(reg prometheus.Registerer, name string) {
} }
func (p *Peer) runPeriodicTask(d time.Duration, f func()) { func (p *Peer) runPeriodicTask(d time.Duration, f func()) {
tick := time.NewTicker(d) tick := p.clock.Ticker(d)
defer tick.Stop() defer tick.Stop()
for { for {
@ -679,20 +682,20 @@ func (p *Peer) Position() int {
func (p *Peer) Settle(ctx context.Context, interval time.Duration) { func (p *Peer) Settle(ctx context.Context, interval time.Duration) {
const NumOkayRequired = 3 const NumOkayRequired = 3
level.Info(p.logger).Log("msg", "Waiting for gossip to settle...", "interval", interval) level.Info(p.logger).Log("msg", "Waiting for gossip to settle...", "interval", interval)
start := time.Now() start := p.clock.Now()
nPeers := 0 nPeers := 0
nOkay := 0 nOkay := 0
totalPolls := 0 totalPolls := 0
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
elapsed := time.Since(start) elapsed := p.clock.Since(start)
level.Info(p.logger).Log("msg", "gossip not settled but continuing anyway", "polls", totalPolls, "elapsed", elapsed) level.Info(p.logger).Log("msg", "gossip not settled but continuing anyway", "polls", totalPolls, "elapsed", elapsed)
close(p.readyc) close(p.readyc)
return return
case <-time.After(interval): case <-p.clock.After(interval):
} }
elapsed := time.Since(start) elapsed := p.clock.Since(start)
n := len(p.Peers()) n := len(p.Peers())
if nOkay >= NumOkayRequired { if nOkay >= NumOkayRequired {
level.Info(p.logger).Log("msg", "gossip settled; proceeding", "elapsed", elapsed) level.Info(p.logger).Log("msg", "gossip settled; proceeding", "elapsed", elapsed)

View File

@ -15,9 +15,11 @@ package cluster
import ( import (
"context" "context"
"sync"
"testing" "testing"
"time" "time"
"github.com/benbjohnson/clock"
"github.com/go-kit/log" "github.com/go-kit/log"
"github.com/hashicorp/go-sockaddr" "github.com/hashicorp/go-sockaddr"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@ -57,10 +59,13 @@ func testJoinLeave(t *testing.T) {
) )
require.NoError(t, err) require.NoError(t, err)
require.NotNil(t, p) require.NotNil(t, p)
c1 := clock.NewMock()
p.clock = c1
err = p.Join( err = p.Join(
DefaultReconnectInterval, DefaultReconnectInterval,
DefaultReconnectTimeout, DefaultReconnectTimeout,
) )
c1.Add(1 * time.Minute)
require.NoError(t, err) require.NoError(t, err)
require.False(t, p.Ready()) require.False(t, p.Ready())
{ {
@ -69,7 +74,17 @@ func testJoinLeave(t *testing.T) {
require.Equal(t, context.Canceled, p.WaitReady(ctx)) require.Equal(t, context.Canceled, p.WaitReady(ctx))
} }
require.Equal(t, p.Status(), "settling") require.Equal(t, p.Status(), "settling")
go p.Settle(context.Background(), 0*time.Second)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
p.Settle(context.Background(), 1*time.Second)
}()
time.Sleep(1 * time.Millisecond)
c1.Add(10 * time.Second)
wg.Wait()
require.NoError(t, p.WaitReady(context.Background())) require.NoError(t, p.WaitReady(context.Background()))
require.Equal(t, p.Status(), "ready") require.Equal(t, p.Status(), "ready")
@ -89,14 +104,27 @@ func testJoinLeave(t *testing.T) {
nil, nil,
false, false,
) )
c2 := clock.NewMock()
p2.clock = c2
require.NoError(t, err) require.NoError(t, err)
require.NotNil(t, p2) require.NotNil(t, p2)
err = p2.Join( err = p2.Join(
DefaultReconnectInterval, DefaultReconnectInterval,
DefaultReconnectTimeout, DefaultReconnectTimeout,
) )
c2.Add(1 * time.Minute)
require.NoError(t, err) require.NoError(t, err)
go p2.Settle(context.Background(), 0*time.Second)
var wg2 sync.WaitGroup
wg2.Add(1)
go func() {
defer wg2.Done()
p2.Settle(context.Background(), 1*time.Second)
}()
time.Sleep(1 * time.Millisecond)
c2.Add(10 * time.Second)
wg2.Wait()
require.NoError(t, p2.WaitReady(context.Background())) require.NoError(t, p2.WaitReady(context.Background()))
require.Equal(t, 2, p.ClusterSize()) require.Equal(t, 2, p.ClusterSize())