Add feature flag to enable discovery and use of public IPaddr for clustering. (#2719)

* Add feature flag to enable discovery and use of public IPaddr for clustering.

Before this change, Alertmanager would refuse to start up if it was using an
advertise address bound to any address (0.0.0.0) and the host only
had an interface with a public IP address. After this change, a feature
flag permits the use of a discovered public address for cluster
gossiping.

Signed-off-by: Devin Trejo <dtrejo@palantir.com>
This commit is contained in:
Devin Trejo 2021-11-10 11:40:48 -05:00 committed by GitHub
parent 2f445bcf98
commit fad796931b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 134 additions and 49 deletions

View File

@ -20,10 +20,11 @@ import (
"github.com/pkg/errors"
)
type getPrivateIPFunc func() (string, error)
type getIPFunc func() (string, error)
// This is overridden in unit tests to mock the sockaddr.GetPrivateIP function.
var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
// These are overridden in unit tests to mock the sockaddr functions.
var getPrivateAddress getIPFunc = sockaddr.GetPrivateIP
var getPublicAddress getIPFunc = sockaddr.GetPublicIP
// calculateAdvertiseAddress attempts to clone logic from deep within memberlist
// (NetTransport.FinalAdvertiseAddr) in order to surface its conclusions to the
@ -31,7 +32,7 @@ var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
// inadvertently misconfigured their cluster.
//
// https://github.com/hashicorp/memberlist/blob/022f081/net_transport.go#L126
func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
func calculateAdvertiseAddress(bindAddr, advertiseAddr string, allowInsecureAdvertise bool) (net.IP, error) {
if advertiseAddr != "" {
ip := net.ParseIP(advertiseAddr)
if ip == nil {
@ -44,18 +45,7 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
}
if isAny(bindAddr) {
privateIP, err := getPrivateAddress()
if err != nil {
return nil, errors.Wrap(err, "failed to get private IP")
}
if privateIP == "" {
return nil, errors.New("no private IP found, explicit advertise addr not provided")
}
ip := net.ParseIP(privateIP)
if ip == nil {
return nil, errors.Errorf("failed to parse private IP '%s'", privateIP)
}
return ip, nil
return discoverAdvertiseAddress(allowInsecureAdvertise)
}
ip := net.ParseIP(bindAddr)
@ -64,3 +54,33 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
}
return ip, nil
}
// discoverAdvertiseAddress picks the IP to advertise when the caller did not
// configure one explicitly.
//
// The private address lookup (getPrivateAddress) is always consulted first,
// and any error it reports is returned immediately regardless of
// allowInsecureAdvertise. Only when that lookup succeeds with an empty result
// and allowInsecureAdvertise is true does the function fall back to the public
// address lookup (getPublicAddress). The chosen address string must parse as
// an IP; otherwise an error is returned.
func discoverAdvertiseAddress(allowInsecureAdvertise bool) (net.IP, error) {
	discovered, err := getPrivateAddress()
	if err != nil {
		return nil, errors.Wrap(err, "failed to get private IP")
	}

	if discovered == "" {
		// No private address: a public one may only be used when the
		// caller explicitly opted in.
		if !allowInsecureAdvertise {
			return nil, errors.New("no private IP found, explicit advertise addr not provided")
		}

		discovered, err = getPublicAddress()
		switch {
		case err != nil:
			return nil, errors.Wrap(err, "failed to get public IP")
		case discovered == "":
			return nil, errors.New("no private/public IP found, explicit advertise addr not provided")
		}
	}

	if parsed := net.ParseIP(discovered); parsed != nil {
		return parsed, nil
	}
	return nil, errors.Errorf("failed to parse discovered IP '%s'", discovered)
}

View File

@ -28,13 +28,17 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
}()
cases := []struct {
fn getPrivateIPFunc
bind, advertise string
name string
privateIPFn getIPFunc
publicIPFn getIPFunc
bind, advertise string
allowInsecureAdvertise bool
expectedIP net.IP
err bool
}{
{
name: "use provided bind address",
bind: "192.0.2.1",
advertise: "",
@ -42,6 +46,7 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
err: false,
},
{
name: "use provided advertise address",
bind: "192.0.2.1",
advertise: "192.0.2.2",
@ -49,44 +54,93 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
err: false,
},
{
fn: func() (string, error) { return "192.0.2.1", nil },
bind: "0.0.0.0",
advertise: "",
name: "discover private ip address",
privateIPFn: func() (string, error) { return "192.0.2.1", nil },
bind: "0.0.0.0",
advertise: "",
expectedIP: net.ParseIP("192.0.2.1"),
err: false,
},
{
fn: func() (string, error) { return "", errors.New("some error") },
bind: "0.0.0.0",
advertise: "",
name: "error if getPrivateAddress errors",
privateIPFn: func() (string, error) { return "", errors.New("some error") },
bind: "0.0.0.0",
advertise: "",
err: true,
},
{
fn: func() (string, error) { return "invalid", nil },
bind: "0.0.0.0",
advertise: "",
name: "error if getPrivateAddress returns an invalid address",
privateIPFn: func() (string, error) { return "invalid", nil },
bind: "0.0.0.0",
advertise: "",
err: true,
},
{
fn: func() (string, error) { return "", nil },
bind: "0.0.0.0",
advertise: "",
name: "error if getPrivateAddress returns an empty address",
privateIPFn: func() (string, error) { return "", nil },
bind: "0.0.0.0",
advertise: "",
err: true,
},
{
name: "discover public advertise address",
privateIPFn: func() (string, error) { return "", nil },
publicIPFn: func() (string, error) { return "192.0.2.1", nil },
bind: "0.0.0.0",
advertise: "",
allowInsecureAdvertise: true,
expectedIP: net.ParseIP("192.0.2.1"),
err: false,
},
{
name: "error if getPublicAddress errors",
privateIPFn: func() (string, error) { return "", nil },
publicIPFn: func() (string, error) { return "", errors.New("some error") },
bind: "0.0.0.0",
advertise: "",
allowInsecureAdvertise: true,
err: true,
},
{
name: "error if getPublicAddress returns an invalid address",
privateIPFn: func() (string, error) { return "", nil },
publicIPFn: func() (string, error) { return "invalid", nil },
bind: "0.0.0.0",
advertise: "",
allowInsecureAdvertise: true,
err: true,
},
{
name: "error if getPublicAddress returns an empty address",
privateIPFn: func() (string, error) { return "", nil },
publicIPFn: func() (string, error) { return "", nil },
bind: "0.0.0.0",
advertise: "",
allowInsecureAdvertise: true,
err: true,
},
}
for _, c := range cases {
getPrivateAddress = c.fn
got, err := calculateAdvertiseAddress(c.bind, c.advertise)
if c.err {
require.Error(t, err)
} else {
t.Run(c.name, func(t *testing.T) {
getPrivateAddress = c.privateIPFn
getPublicAddress = c.publicIPFn
got, err := calculateAdvertiseAddress(c.bind, c.advertise, c.allowInsecureAdvertise)
if c.err {
require.Error(t, err)
return
}
require.NoError(t, err)
require.Equal(t, c.expectedIP.String(), got.String())
}
})
}
}

View File

@ -141,6 +141,7 @@ func Create(
probeTimeout time.Duration,
probeInterval time.Duration,
tlsTransportConfig *TLSTransportConfig,
allowInsecureAdvertise bool,
) (*Peer, error) {
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
if err != nil {
@ -172,7 +173,7 @@ func Create(
level.Debug(l).Log("msg", "resolved peers to following addresses", "peers", strings.Join(resolvedPeers, ","))
// Initial validation of user-specified advertise address.
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost)
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost, allowInsecureAdvertise)
if err != nil {
level.Warn(l).Log("err", "couldn't deduce an advertise address: "+err.Error())
} else if hasNonlocal(resolvedPeers) && isUnroutable(addr.String()) {

View File

@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p)
@ -86,6 +87,7 @@ func testJoinLeave(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p2)
@ -120,6 +122,7 @@ func testReconnect(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p)
@ -144,6 +147,7 @@ func testReconnect(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p2)
@ -183,6 +187,7 @@ func testRemoveFailedPeers(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p)
@ -233,6 +238,7 @@ func testInitiallyFailingPeers(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
false,
)
require.NoError(t, err)
require.NotNil(t, p)
@ -279,6 +285,7 @@ func testTLSConnection(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
tlsTransportConfig1,
false,
)
require.NoError(t, err)
require.NotNil(t, p1)
@ -309,6 +316,7 @@ func testTLSConnection(t *testing.T) {
DefaultProbeTimeout,
DefaultProbeInterval,
tlsTransportConfig2,
false,
)
require.NoError(t, err)
require.NotNil(t, p2)

View File

@ -200,18 +200,19 @@ func run() int {
clusterBindAddr = kingpin.Flag("cluster.listen-address", "Listen address for cluster. Set to empty string to disable HA mode.").
Default(defaultClusterAddr).String()
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
allowInsecureAdvertise = kingpin.Flag("cluster.allow-insecure-public-advertise-address-discovery", "[EXPERIMENTAL] Allow alertmanager to discover and listen on a public IP address.").Bool()
)
promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
@ -252,6 +253,7 @@ func run() int {
*probeTimeout,
*probeInterval,
tlsTransportConfig,
*allowInsecureAdvertise,
)
if err != nil {
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)