Add feature flag to enable discovery and use of a public IP address for clustering. (#2719)
* Add feature flag to enable discovery and use of a public IP address for clustering. Before this change, Alertmanager would refuse to start up if using an advertise address binding to any address (0.0.0.0) and the host only had an interface with a public IP address. After this change, a feature flag permits the use of a discovered public address for cluster gossiping. Signed-off-by: Devin Trejo <dtrejo@palantir.com>
This commit is contained in:
parent
2f445bcf98
commit
fad796931b
|
@ -20,10 +20,11 @@ import (
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
type getPrivateIPFunc func() (string, error)
|
type getIPFunc func() (string, error)
|
||||||
|
|
||||||
// This is overridden in unit tests to mock the sockaddr.GetPrivateIP function.
|
// These are overridden in unit tests to mock the sockaddr functions.
|
||||||
var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
|
var getPrivateAddress getIPFunc = sockaddr.GetPrivateIP
|
||||||
|
var getPublicAddress getIPFunc = sockaddr.GetPublicIP
|
||||||
|
|
||||||
// calculateAdvertiseAddress attempts to clone logic from deep within memberlist
|
// calculateAdvertiseAddress attempts to clone logic from deep within memberlist
|
||||||
// (NetTransport.FinalAdvertiseAddr) in order to surface its conclusions to the
|
// (NetTransport.FinalAdvertiseAddr) in order to surface its conclusions to the
|
||||||
|
@ -31,7 +32,7 @@ var getPrivateAddress getPrivateIPFunc = sockaddr.GetPrivateIP
|
||||||
// inadvertently misconfigured their cluster.
|
// inadvertently misconfigured their cluster.
|
||||||
//
|
//
|
||||||
// https://github.com/hashicorp/memberlist/blob/022f081/net_transport.go#L126
|
// https://github.com/hashicorp/memberlist/blob/022f081/net_transport.go#L126
|
||||||
func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
func calculateAdvertiseAddress(bindAddr, advertiseAddr string, allowInsecureAdvertise bool) (net.IP, error) {
|
||||||
if advertiseAddr != "" {
|
if advertiseAddr != "" {
|
||||||
ip := net.ParseIP(advertiseAddr)
|
ip := net.ParseIP(advertiseAddr)
|
||||||
if ip == nil {
|
if ip == nil {
|
||||||
|
@ -44,18 +45,7 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if isAny(bindAddr) {
|
if isAny(bindAddr) {
|
||||||
privateIP, err := getPrivateAddress()
|
return discoverAdvertiseAddress(allowInsecureAdvertise)
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrap(err, "failed to get private IP")
|
|
||||||
}
|
|
||||||
if privateIP == "" {
|
|
||||||
return nil, errors.New("no private IP found, explicit advertise addr not provided")
|
|
||||||
}
|
|
||||||
ip := net.ParseIP(privateIP)
|
|
||||||
if ip == nil {
|
|
||||||
return nil, errors.Errorf("failed to parse private IP '%s'", privateIP)
|
|
||||||
}
|
|
||||||
return ip, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ip := net.ParseIP(bindAddr)
|
ip := net.ParseIP(bindAddr)
|
||||||
|
@ -64,3 +54,33 @@ func calculateAdvertiseAddress(bindAddr, advertiseAddr string) (net.IP, error) {
|
||||||
}
|
}
|
||||||
return ip, nil
|
return ip, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// discoverAdvertiseAddress will attempt to get a single IP address to use as
|
||||||
|
// the advertise address when one is not explicitly provided. It defaults to
|
||||||
|
// using a private IP address, and if not found then using a public IP if
|
||||||
|
// insecure advertising is allowed.
|
||||||
|
func discoverAdvertiseAddress(allowInsecureAdvertise bool) (net.IP, error) {
|
||||||
|
addr, err := getPrivateAddress()
|
||||||
|
if err != nil {
|
||||||
|
return nil, errors.Wrap(err, "failed to get private IP")
|
||||||
|
}
|
||||||
|
if addr == "" && !allowInsecureAdvertise {
|
||||||
|
return nil, errors.New("no private IP found, explicit advertise addr not provided")
|
||||||
|
}
|
||||||
|
|
||||||
|
if addr == "" {
|
||||||
|
addr, err = getPublicAddress()
|
||||||
|
if err != nil {
|
||||||
|
return nil, errors.Wrap(err, "failed to get public IP")
|
||||||
|
}
|
||||||
|
if addr == "" {
|
||||||
|
return nil, errors.New("no private/public IP found, explicit advertise addr not provided")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ip := net.ParseIP(addr)
|
||||||
|
if ip == nil {
|
||||||
|
return nil, errors.Errorf("failed to parse discovered IP '%s'", addr)
|
||||||
|
}
|
||||||
|
return ip, nil
|
||||||
|
}
|
||||||
|
|
|
@ -28,13 +28,17 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
||||||
}()
|
}()
|
||||||
|
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
fn getPrivateIPFunc
|
name string
|
||||||
bind, advertise string
|
privateIPFn getIPFunc
|
||||||
|
publicIPFn getIPFunc
|
||||||
|
bind, advertise string
|
||||||
|
allowInsecureAdvertise bool
|
||||||
|
|
||||||
expectedIP net.IP
|
expectedIP net.IP
|
||||||
err bool
|
err bool
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
|
name: "use provided bind address",
|
||||||
bind: "192.0.2.1",
|
bind: "192.0.2.1",
|
||||||
advertise: "",
|
advertise: "",
|
||||||
|
|
||||||
|
@ -42,6 +46,7 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
||||||
err: false,
|
err: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
name: "use provided advertise address",
|
||||||
bind: "192.0.2.1",
|
bind: "192.0.2.1",
|
||||||
advertise: "192.0.2.2",
|
advertise: "192.0.2.2",
|
||||||
|
|
||||||
|
@ -49,44 +54,93 @@ func TestCalculateAdvertiseAddress(t *testing.T) {
|
||||||
err: false,
|
err: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
fn: func() (string, error) { return "192.0.2.1", nil },
|
name: "discover private ip address",
|
||||||
bind: "0.0.0.0",
|
privateIPFn: func() (string, error) { return "192.0.2.1", nil },
|
||||||
advertise: "",
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
|
||||||
expectedIP: net.ParseIP("192.0.2.1"),
|
expectedIP: net.ParseIP("192.0.2.1"),
|
||||||
err: false,
|
err: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
fn: func() (string, error) { return "", errors.New("some error") },
|
name: "error if getPrivateAddress errors",
|
||||||
bind: "0.0.0.0",
|
privateIPFn: func() (string, error) { return "", errors.New("some error") },
|
||||||
advertise: "",
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
|
||||||
err: true,
|
err: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
fn: func() (string, error) { return "invalid", nil },
|
name: "error if getPrivateAddress returns an invalid address",
|
||||||
bind: "0.0.0.0",
|
privateIPFn: func() (string, error) { return "invalid", nil },
|
||||||
advertise: "",
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
|
||||||
err: true,
|
err: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
fn: func() (string, error) { return "", nil },
|
name: "error if getPrivateAddress returns an empty address",
|
||||||
bind: "0.0.0.0",
|
privateIPFn: func() (string, error) { return "", nil },
|
||||||
advertise: "",
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
|
||||||
|
err: true,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
name: "discover public advertise address",
|
||||||
|
privateIPFn: func() (string, error) { return "", nil },
|
||||||
|
publicIPFn: func() (string, error) { return "192.0.2.1", nil },
|
||||||
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
allowInsecureAdvertise: true,
|
||||||
|
|
||||||
|
expectedIP: net.ParseIP("192.0.2.1"),
|
||||||
|
err: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error if getPublicAddress errors",
|
||||||
|
privateIPFn: func() (string, error) { return "", nil },
|
||||||
|
publicIPFn: func() (string, error) { return "", errors.New("some error") },
|
||||||
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
allowInsecureAdvertise: true,
|
||||||
|
|
||||||
|
err: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error if getPublicAddress returns an invalid address",
|
||||||
|
privateIPFn: func() (string, error) { return "", nil },
|
||||||
|
publicIPFn: func() (string, error) { return "invalid", nil },
|
||||||
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
allowInsecureAdvertise: true,
|
||||||
|
|
||||||
|
err: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error if getPublicAddress returns an empty address",
|
||||||
|
privateIPFn: func() (string, error) { return "", nil },
|
||||||
|
publicIPFn: func() (string, error) { return "", nil },
|
||||||
|
bind: "0.0.0.0",
|
||||||
|
advertise: "",
|
||||||
|
allowInsecureAdvertise: true,
|
||||||
|
|
||||||
err: true,
|
err: true,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
getPrivateAddress = c.fn
|
t.Run(c.name, func(t *testing.T) {
|
||||||
got, err := calculateAdvertiseAddress(c.bind, c.advertise)
|
getPrivateAddress = c.privateIPFn
|
||||||
if c.err {
|
getPublicAddress = c.publicIPFn
|
||||||
require.Error(t, err)
|
got, err := calculateAdvertiseAddress(c.bind, c.advertise, c.allowInsecureAdvertise)
|
||||||
} else {
|
if c.err {
|
||||||
|
require.Error(t, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.Equal(t, c.expectedIP.String(), got.String())
|
require.Equal(t, c.expectedIP.String(), got.String())
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -141,6 +141,7 @@ func Create(
|
||||||
probeTimeout time.Duration,
|
probeTimeout time.Duration,
|
||||||
probeInterval time.Duration,
|
probeInterval time.Duration,
|
||||||
tlsTransportConfig *TLSTransportConfig,
|
tlsTransportConfig *TLSTransportConfig,
|
||||||
|
allowInsecureAdvertise bool,
|
||||||
) (*Peer, error) {
|
) (*Peer, error) {
|
||||||
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
|
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -172,7 +173,7 @@ func Create(
|
||||||
level.Debug(l).Log("msg", "resolved peers to following addresses", "peers", strings.Join(resolvedPeers, ","))
|
level.Debug(l).Log("msg", "resolved peers to following addresses", "peers", strings.Join(resolvedPeers, ","))
|
||||||
|
|
||||||
// Initial validation of user-specified advertise address.
|
// Initial validation of user-specified advertise address.
|
||||||
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost)
|
addr, err := calculateAdvertiseAddress(bindHost, advertiseHost, allowInsecureAdvertise)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Warn(l).Log("err", "couldn't deduce an advertise address: "+err.Error())
|
level.Warn(l).Log("err", "couldn't deduce an advertise address: "+err.Error())
|
||||||
} else if hasNonlocal(resolvedPeers) && isUnroutable(addr.String()) {
|
} else if hasNonlocal(resolvedPeers) && isUnroutable(addr.String()) {
|
||||||
|
|
|
@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p)
|
require.NotNil(t, p)
|
||||||
|
@ -86,6 +87,7 @@ func testJoinLeave(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p2)
|
require.NotNil(t, p2)
|
||||||
|
@ -120,6 +122,7 @@ func testReconnect(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p)
|
require.NotNil(t, p)
|
||||||
|
@ -144,6 +147,7 @@ func testReconnect(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p2)
|
require.NotNil(t, p2)
|
||||||
|
@ -183,6 +187,7 @@ func testRemoveFailedPeers(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p)
|
require.NotNil(t, p)
|
||||||
|
@ -233,6 +238,7 @@ func testInitiallyFailingPeers(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
nil,
|
nil,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p)
|
require.NotNil(t, p)
|
||||||
|
@ -279,6 +285,7 @@ func testTLSConnection(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
tlsTransportConfig1,
|
tlsTransportConfig1,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p1)
|
require.NotNil(t, p1)
|
||||||
|
@ -309,6 +316,7 @@ func testTLSConnection(t *testing.T) {
|
||||||
DefaultProbeTimeout,
|
DefaultProbeTimeout,
|
||||||
DefaultProbeInterval,
|
DefaultProbeInterval,
|
||||||
tlsTransportConfig2,
|
tlsTransportConfig2,
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, p2)
|
require.NotNil(t, p2)
|
||||||
|
|
|
@ -200,18 +200,19 @@ func run() int {
|
||||||
|
|
||||||
clusterBindAddr = kingpin.Flag("cluster.listen-address", "Listen address for cluster. Set to empty string to disable HA mode.").
|
clusterBindAddr = kingpin.Flag("cluster.listen-address", "Listen address for cluster. Set to empty string to disable HA mode.").
|
||||||
Default(defaultClusterAddr).String()
|
Default(defaultClusterAddr).String()
|
||||||
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
|
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
|
||||||
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
|
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
|
||||||
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
||||||
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
||||||
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||||
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
|
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
|
||||||
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
|
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
|
||||||
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
|
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
|
||||||
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||||
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
|
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
|
||||||
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
|
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
|
||||||
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
|
tlsConfigFile = kingpin.Flag("cluster.tls-config", "[EXPERIMENTAL] Path to config yaml file that can enable mutual TLS within the gossip protocol.").Default("").String()
|
||||||
|
allowInsecureAdvertise = kingpin.Flag("cluster.allow-insecure-public-advertise-address-discovery", "[EXPERIMENTAL] Allow alertmanager to discover and listen on a public IP address.").Bool()
|
||||||
)
|
)
|
||||||
|
|
||||||
promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
|
promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
|
||||||
|
@ -252,6 +253,7 @@ func run() int {
|
||||||
*probeTimeout,
|
*probeTimeout,
|
||||||
*probeInterval,
|
*probeInterval,
|
||||||
tlsTransportConfig,
|
tlsTransportConfig,
|
||||||
|
*allowInsecureAdvertise,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
|
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
|
||||||
|
|
Loading…
Reference in New Issue