Add feature flag to enable discovery and use of a public IP address for clustering. (#2719)
* Add feature flag to enable discovery and use of a public IP address for clustering. Before this change, Alertmanager would refuse to start up if the advertise address was derived from a bind to any address (0.0.0.0) and the host only had an interface with a public IP address. After this change, a feature flag permits the use of a discovered public address for cluster gossiping. Signed-off-by: Devin Trejo <dtrejo@palantir.com>
This commit is contained in:
parent
2f445bcf98
commit
fad796931b
|
@ -20,10 +20,11 @@ import (
|
|||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
type getPrivateIPFunc func() (string, error)
|
||||
type getIPFunc func() (string, error)
|
||||
|
||||
// This is overridden in unit tests to mock the sockaddr.GetPrivateIP function.
|
||||
var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
|
||||
// These are overridden in unit tests to mock the sockaddr functions.
|
||||
var getPrivateAddress getIPFunc = sockaddr.GetPrivateIP
|
||||
var getPublicAddress getIPFunc = sockaddr.GetPublicIP
|
||||
|
||||
// calculateAdvertiseAddress attempts to clone logic from deep within memberlist
|
||||
// (NetTransport.FinalAdvertiseAddr) in order to surface its conclusions to the
|
||||
|
@ -31,7 +32,7 @@ var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
|
|||
// inadvertently misconfigured their cluster.
|
||||
//
|
||||
// https://github.com/hashicorp/memberlist/blob/022f081/net_transport.go#L126
|
||||
func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
||||
func calculateAdvertiseAddress(bindAddr, advertiseAddr string, allowInsecureAdvertise bool) (net.IP, error) {
|
||||
if advertiseAddr != "" {
|
||||
ip := net.ParseIP(advertiseAddr)
|
||||
if ip == nil {
|
||||
|
@ -44,18 +45,7 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
|||
}
|
||||
|
||||
if isAny(bindAddr) {
|
||||
privateIP, err := getPrivateAddress()
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "failed to get private IP")
|
||||
}
|
||||
if privateIP == "" {
|
||||
return nil, errors.New("no private IP found, explicit advertise addr not provided")
|
||||
}
|
||||
ip := net.ParseIP(privateIP)
|
||||
if ip == nil {
|
||||
return nil, errors.Errorf("failed to parse private IP '%s'", privateIP)
|
||||
}
|
||||
return ip, nil
|
||||
return discoverAdvertiseAddress(allowInsecureAdvertise)
|
||||
}
|
||||
|
||||
ip := net.ParseIP(bindAddr)
|
||||
|
@ -64,3 +54,33 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
|||
}
|
||||
return ip, nil
|
||||
}
|
||||
|
||||
// discoverAdvertiseAddress will attempt to get a single IP address to use as
|
||||
// the advertise address when one is not explicitly provided. It defaults to
|
||||
// using a private IP address, and if not found then using a public IP if
|
||||
// insecure advertising is allowed.
|
||||
func discoverAdvertiseAddress(allowInsecureAdvertise bool) (net.IP, error) {
|
||||
addr, err := getPrivateAddress()
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "failed to get private IP")
|
||||
}
|
||||
if addr == "" && !allowInsecureAdvertise {
|
||||
return nil, errors.New("no private IP found, explicit advertise addr not provided")
|
||||
}
|
||||
|
||||
if addr == "" {
|
||||
addr, err = getPublicAddress()
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "failed to get public IP")
|
||||
}
|
||||
if addr == "" {
|
||||
return nil, errors.New("no private/public IP found, explicit advertise addr not provided")
|
||||
}
|
||||
}
|
||||
|
||||
ip := net.ParseIP(addr)
|
||||
if ip == nil {
|
||||
return nil, errors.Errorf("failed to parse discovered IP '%s'", addr)
|
||||
}
|
||||
return ip, nil
|
||||
}
|
||||
|
|
|
@ -28,13 +28,17 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
|||
}()
|
||||
|
||||
cases := []struct {
|
||||
fn getPrivateIPFunc
|
||||
bind, advertise string
|
||||
name string
|
||||
privateIPFn getIPFunc
|
||||
publicIPFn getIPFunc
|
||||
bind, advertise string
|
||||
allowInsecureAdvertise bool
|
||||
|
||||
expectedIP net.IP
|
||||
err bool
|
||||
}{
|
||||
{
|
||||
name: "use provided bind address",
|
||||
bind: "192.0.2.1",
|
||||
advertise: "",
|
||||
|
||||
|
@ -42,6 +46,7 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
|||
err: false,
|
||||
},
|
||||
{
|
||||
name: "use provided advertise address",
|
||||
bind: "192.0.2.1",
|
||||
advertise: "192.0.2.2",
|
||||
|
||||
|
@ -49,44 +54,93 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
|||
err: false,
|
||||
},
|
||||
{
|
||||
fn: func() (string, error) { return "192.0.2.1", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
name: "discover private ip address",
|
||||
privateIPFn: func() (string, error) { return "192.0.2.1", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
|
||||
expectedIP: net.ParseIP("192.0.2.1"),
|
||||
err: false,
|
||||
},
|
||||
{
|
||||
fn: func() (string, error) { return "", errors.New("some error") },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
name: "error if getPrivateAddress errors",
|
||||
privateIPFn: func() (string, error) { return "", errors.New("some error") },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
|
||||
err: true,
|
||||
},
|
||||
{
|
||||
fn: func() (string, error) { return "invalid", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
name: "error if getPrivateAddress returns an invalid address",
|
||||
privateIPFn: func() (string, error) { return "invalid", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
|
||||
err: true,
|
||||
},
|
||||
{
|
||||
fn: func() (string, error) { return "", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
name: "error if getPrivateAddress returns an empty address",
|
||||
privateIPFn: func() (string, error) { return "", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
|
||||
err: true,
|
||||
},
|
||||
|
||||
{
|
||||
name: "discover public advertise address",
|
||||
privateIPFn: func() (string, error) { return "", nil },
|
||||
publicIPFn: func() (string, error) { return "192.0.2.1", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
allowInsecureAdvertise: true,
|
||||
|
||||
expectedIP: net.ParseIP("192.0.2.1"),
|
||||
err: false,
|
||||
},
|
||||
{
|
||||
name: "error if getPublicAddress errors",
|
||||
privateIPFn: func() (string, error) { return "", nil },
|
||||
publicIPFn: func() (string, error) { return "", errors.New("some error") },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
allowInsecureAdvertise: true,
|
||||
|
||||
err: true,
|
||||
},
|
||||
{
|
||||
name: "error if getPublicAddress returns an invalid address",
|
||||
privateIPFn: func() (string, error) { return "", nil },
|
||||
publicIPFn: func() (string, error) { return "invalid", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
allowInsecureAdvertise: true,
|
||||
|
||||
err: true,
|
||||
},
|
||||
{
|
||||
name: "error if getPublicAddress returns an empty address",
|
||||
privateIPFn: func() (string, error) { return "", nil },
|
||||
publicIPFn: func() (string, error) { return "", nil },
|
||||
bind: "0.0.0.0",
|
||||
advertise: "",
|
||||
allowInsecureAdvertise: true,
|
||||
|
||||
err: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
getPrivateAddress = c.fn
|
||||
got, err := calculateAdvertiseAddress(c.bind, c.advertise)
|
||||
if c.err {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
getPrivateAddress = c.privateIPFn
|
||||
getPublicAddress = c.publicIPFn
|
||||
got, err := calculateAdvertiseAddress(c.bind, c.advertise, c.allowInsecureAdvertise)
|
||||
if c.err {
|
||||
require.Error(t, err)
|
||||
return
|
||||
}
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, c.expectedIP.String(), got.String())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -141,6 +141,7 @@ func Create(
|
|||
probeTimeout time.Duration,
|
||||
probeInterval time.Duration,
|
||||
tlsTransportConfig *TLSTransportConfig,
|
||||
allowInsecureAdvertise bool,
|
||||
) (*Peer, error) {
|
||||
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
|
||||
if err != nil {
|
||||
|
@ -172,7 +173,7 @@ func Create(
|
|||
level.Debug(l).Log("msg", "resolved peers to following addresses", "peers", strings.Join(resolvedPeers, ","))
|
||||
|
||||
// Initial validation of user-specified advertise address.
|
||||
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost)
|
||||
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost, allowInsecureAdvertise)
|
||||
if err != nil {
|
||||
level.Warn(l).Log("err", "couldn't deduce an advertise address: "+err.Error())
|
||||
} else if hasNonlocal(resolvedPeers) && isUnroutable(addr.String()) {
|
||||
|
|
|
@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p)
|
||||
|
@ -86,6 +87,7 @@ func testJoinLeave(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p2)
|
||||
|
@ -120,6 +122,7 @@ func testReconnect(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p)
|
||||
|
@ -144,6 +147,7 @@ func testReconnect(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p2)
|
||||
|
@ -183,6 +187,7 @@ func testRemoveFailedPeers(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p)
|
||||
|
@ -233,6 +238,7 @@ func testInitiallyFailingPeers(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p)
|
||||
|
@ -279,6 +285,7 @@ func testTLSConnection(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
tlsTransportConfig1,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p1)
|
||||
|
@ -309,6 +316,7 @@ func testTLSConnection(t *testing.T) {
|
|||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
tlsTransportConfig2,
|
||||
false,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, p2)
|
||||
|
|
|
@ -200,18 +200,19 @@ func run() int {
|
|||
|
||||
clusterBindAddr = kingpin.Flag("cluster.listen-address", "Listen address for cluster. Set to empty string to disable HA mode.").
|
||||
Default(defaultClusterAddr).String()
|
||||
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
|
||||
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
|
||||
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
||||
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
||||
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
|
||||
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
|
||||
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
|
||||
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
|
||||
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
|
||||
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
|
||||
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
|
||||
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
|
||||
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
||||
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
||||
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
|
||||
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
|
||||
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
|
||||
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
|
||||
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
|
||||
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
|
||||
allowInsecureAdvertise = kingpin.Flag("cluster.allow-insecure-public-advertise-address-discovery", "[EXPERIMENTAL] Allow alertmanager to discover and listen on a public IP address.").Bool()
|
||||
)
|
||||
|
||||
promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
|
||||
|
@ -252,6 +253,7 @@ func run() int {
|
|||
*probeTimeout,
|
||||
*probeInterval,
|
||||
tlsTransportConfig,
|
||||
*allowInsecureAdvertise,
|
||||
)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
|
||||
|
|
Loading…
Reference in New Issue