fix set initial failed peers (#1407)

* Correctly add Node to initially failed peer

Reconnect attempts to failed peers were panicking
because peer.Address() would attempt to access the
nil Node struct member.

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>

* Correctly remove old peers

Again, since we aren't assigning a name (this is
generated) we rely on the node's Address for
removing the initially joining (and potentially
later re-joining) peers

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>

* Test the peerJoin removes initial peers

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>

* Don't add self to failing peers list

The initially failing peers list shouldn't include
the bindAddr for the alertmanager itself, as this
connection is never made, and consequently only
removed from the failedPeers list after the failed
peer timeout.

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>

* Filter initialFailed with advertise addr

This may differ from bindAddr, and is the value we
want to not attempt to connect to.

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
This commit is contained in:
stuart nelson 2018-06-08 12:34:52 +02:00 committed by GitHub
parent 36588c3865
commit 6305229fcc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 10 deletions

View File

@ -205,7 +205,7 @@ func Join(
}
p.mlist = ml
p.setInitialFailed(resolvedPeers)
p.setInitialFailed(resolvedPeers, fmt.Sprintf("%s:%d", advertiseHost, advertisePort))
n, err := ml.Join(resolvedPeers)
if err != nil {
@ -226,16 +226,34 @@ func Join(
// All peers are initially added to the failed list. They will be removed from
// this list in peerJoin when making their initial connection.
func (p *Peer) setInitialFailed(peers []string) {
func (p *Peer) setInitialFailed(peers []string, myAddr string) {
if len(peers) == 0 {
return
}
now := time.Now()
for _, peerAddr := range peers {
if peerAddr == myAddr {
// Don't add ourselves to the initially failing list,
// we don't connect to ourselves.
continue
}
ip, port, err := net.SplitHostPort(peerAddr)
if err != nil {
continue
}
portUint, err := strconv.ParseUint(port, 10, 16)
if err != nil {
continue
}
pr := peer{
status: StatusNone,
status: StatusFailed,
leaveTime: now,
Node: &memberlist.Node{
Addr: net.ParseIP(ip),
Port: uint16(portUint),
},
}
p.failedPeers = append(p.failedPeers, pr)
p.peers[peerAddr] = pr
@ -378,7 +396,7 @@ func (p *Peer) peerJoin(n *memberlist.Node) {
if oldStatus == StatusFailed {
level.Debug(p.logger).Log("msg", "peer rejoined", "peer", pr.Node)
p.failedPeers = removeOldPeer(p.failedPeers, pr.Name)
p.failedPeers = removeOldPeer(p.failedPeers, pr.Address())
}
}
@ -700,10 +718,10 @@ func retry(interval time.Duration, stopc <-chan struct{}, f func() error) error
}
}
func removeOldPeer(old []peer, name string) []peer {
func removeOldPeer(old []peer, addr string) []peer {
new := make([]peer, 0, len(old))
for _, p := range old {
if p.Name != name {
if p.Address() != addr {
new = append(new, p)
}
}

View File

@ -178,7 +178,8 @@ func TestRemoveFailedPeers(t *testing.T) {
func TestInitiallyFailingPeers(t *testing.T) {
logger := log.NewNopLogger()
peerAddrs := []string{"1.2.3.4:5000", "2.3.4.5:5000", "3.4.5.6:5000"}
myAddr := "1.2.3.4:5000"
peerAddrs := []string{myAddr, "2.3.4.5:5000", "3.4.5.6:5000"}
p, err := Join(
logger,
prometheus.NewRegistry(),
@ -197,12 +198,21 @@ func TestInitiallyFailingPeers(t *testing.T) {
require.NoError(t, err)
require.NotNil(t, p)
p.setInitialFailed(peerAddrs)
p.setInitialFailed(peerAddrs, myAddr)
require.Equal(t, len(peerAddrs), len(p.failedPeers))
// We shouldn't have added "our" bind addr to the failed peers list
require.Equal(t, len(peerAddrs)-1, len(p.failedPeers))
for _, addr := range peerAddrs {
if addr == myAddr {
continue
}
pr, ok := p.peers[addr]
require.True(t, ok)
require.Equal(t, StatusNone, pr.status)
require.Equal(t, StatusFailed.String(), pr.status.String())
require.Equal(t, addr, pr.Address())
expectedLen := len(p.failedPeers) - 1
p.peerJoin(pr.Node)
require.Equal(t, expectedLen, len(p.failedPeers))
}
}