haproxy/src/sock_inet.c
Willy Tarreau 785b89f551 MINOR: protocol: move the global reuseport flag to the protocols
Some protocols support SO_REUSEPORT and others do not. Some have such a
limitation in the kernel, and others in haproxy itself (e.g. sock_unix
cannot support multiple bindings since each one will unbind the previous
one). Also it's really protocol-dependent and not just family-dependent
because on Linux for some time it was supported for TCP and not UDP.

Let's move the definition to the protocols instead. Now it's preset in
tcp/udp/quic when SO_REUSEPORT is defined, and is otherwise left unset.
The enabled() config condition test validates IPv4 (generally sufficient),
and -dR / noreuseport apply to all protocols at once.
2023-04-23 09:46:15 +02:00

522 lines
15 KiB
C

/*
* AF_INET/AF_INET6 socket management
*
* Copyright 2000-2020 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
#include <haproxy/api.h>
#include <haproxy/errors.h>
#include <haproxy/fd.h>
#include <haproxy/global.h>
#include <haproxy/namespace.h>
#include <haproxy/receiver-t.h>
#include <haproxy/sock.h>
#include <haproxy/sock_inet.h>
#include <haproxy/tools.h>
/* Address family descriptor for IPv4 (AF_INET) sockets: socket domain/family,
 * address sizes and the per-family callbacks.
 */
struct proto_fam proto_fam_inet4 = {
.name = "inet4",
.sock_domain = PF_INET,
.sock_family = AF_INET,
.sock_addrlen = sizeof(struct sockaddr_in),
.l3_addrlen = 32/8, /* IPv4 address length, in bytes */
.addrcmp = sock_inet4_addrcmp,
.bind = sock_inet_bind_receiver,
.get_src = sock_get_src,
.get_dst = sock_inet_get_dst, /* IPv4-specific: may recover the original destination when built with USE_TPROXY */
.set_port = sock_inet_set_port,
};
/* Address family descriptor for IPv6 (AF_INET6) sockets: socket domain/family,
 * address sizes and the per-family callbacks.
 */
struct proto_fam proto_fam_inet6 = {
.name = "inet6",
.sock_domain = PF_INET6,
.sock_family = AF_INET6,
.sock_addrlen = sizeof(struct sockaddr_in6),
.l3_addrlen = 128/8, /* IPv6 address length, in bytes */
.addrcmp = sock_inet6_addrcmp,
.bind = sock_inet_bind_receiver,
.get_src = sock_get_src,
.get_dst = sock_get_dst, /* generic helper: sock_inet_get_dst() is documented as AF_INET only */
.set_port = sock_inet_set_port,
};
/* PLEASE NOTE for the functions below:
* - sock_inet4_* is solely for AF_INET (IPv4)
* - sock_inet6_* is solely for AF_INET6 (IPv6)
* - sock_inet_* is for either
*
* The address family SHOULD always be checked. In some cases a function will
* be used in a situation where the address family is guaranteed (e.g. protocol
* definitions), so the test may be avoided. This special case must then be
* mentioned in the comment before the function definition.
*/
/* Tells whether the operating system enables IPV6_V6ONLY by default on new
 * AF_INET6 sockets. 0=no, 1=yes. It is switched to 1 by sock_inet_prepare()
 * when the option is reported as enabled on a fresh socket, and remains 0
 * otherwise, including when no AF_INET6 socket could be created (e.g. IPv6
 * not enabled/configured).
 */
int sock_inet6_v6only_default = 0;
/* Default TCPv4/TCPv6 MSS settings, as reported by TCP_MAXSEG on a fresh
 * socket in sock_inet_prepare(). -1=unknown.
 */
int sock_inet_tcp_maxseg_default = -1;
int sock_inet6_tcp_maxseg_default = -1;
/* Tells whether two IPv4 socket addresses designate the same endpoint. Both
 * <a> and <b> must be of family AF_INET, in which case their ports then their
 * addresses are compared. Returns 0 when everything matches, or non-zero
 * otherwise (including when either address is not AF_INET).
 */
int sock_inet4_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b)
{
	const struct sockaddr_in *lhs = (const struct sockaddr_in *)a;
	const struct sockaddr_in *rhs = (const struct sockaddr_in *)b;

	/* both sides must be IPv4, which also implies identical families */
	if (a->ss_family != AF_INET || b->ss_family != AF_INET)
		return -1;

	if (lhs->sin_port != rhs->sin_port)
		return -1;

	return memcmp(&lhs->sin_addr, &rhs->sin_addr, sizeof(lhs->sin_addr));
}
/* Tells whether two IPv6 socket addresses designate the same endpoint. Both
 * <a> and <b> must be of family AF_INET6, in which case their ports then
 * their addresses are compared. Returns 0 when everything matches, or
 * non-zero otherwise (including when either address is not AF_INET6).
 */
int sock_inet6_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b)
{
	const struct sockaddr_in6 *lhs = (const struct sockaddr_in6 *)a;
	const struct sockaddr_in6 *rhs = (const struct sockaddr_in6 *)b;

	/* both sides must be IPv6, which also implies identical families */
	if (a->ss_family != AF_INET6 || b->ss_family != AF_INET6)
		return -1;

	if (lhs->sin6_port != rhs->sin6_port)
		return -1;

	return memcmp(&lhs->sin6_addr, &rhs->sin6_addr, sizeof(lhs->sin6_addr));
}
/* Stores <port>, converted to network byte order, into the port field of the
 * IPv4 or IPv6 address <addr>. The field to write is selected from the
 * address family found in <addr>; addresses of any other family are left
 * unmodified.
 */
void sock_inet_set_port(struct sockaddr_storage *addr, int port)
{
	switch (addr->ss_family) {
	case AF_INET: {
		struct sockaddr_in *in4 = (struct sockaddr_in *)addr;

		in4->sin_port = htons(port);
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;

		in6->sin6_port = htons(port);
		break;
	}
	default:
		/* not an inet address: nothing to do */
		break;
	}
}
/*
 * Retrieves the destination address of socket <fd>, which must be of family
 * AF_INET (not AF_INET6). <dir> tells whether we are a listener (=0) or an
 * initiator (!=0). For an initiator, the destination is the peer's address.
 * For a listener, it is the local address, except when built with USE_TPROXY
 * and SO_ORIGINAL_DST, where the address the connection was originally sent
 * to (before translation) is preferred when available. The destination
 * address is stored in <sa> for at most <salen> bytes. Returns 0 in case of
 * success, -1 in case of error.
 */
int sock_inet_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir)
{
	int rc;

	/* initiator: the destination is simply the peer we are connected to */
	if (dir)
		return getpeername(fd, sa, &salen);

	rc = getsockname(fd, sa, &salen);
	if (rc < 0)
		return rc;

#if defined(USE_TPROXY) && defined(SO_ORIGINAL_DST)
	/* For TPROXY and Netfilter's NAT, we can retrieve the original
	 * IPv4 address before DNAT/REDIRECT. We must not do that with
	 * other families because v6-mapped IPv4 addresses are still
	 * reported as v4. On failure we silently keep what getsockname()
	 * returned.
	 */
	if (getsockopt(fd, IPPROTO_IP, SO_ORIGINAL_DST, sa, &salen) == 0)
		return 0;
#endif
	return rc;
}
/* Returns true if the passed FD corresponds to a socket bound with RX_O_FOREIGN
* according to the various supported socket options. The socket's address family
* must be passed in <family>.
*/
int sock_inet_is_foreign(int fd, sa_family_t family)
{
int val __maybe_unused;
socklen_t len __maybe_unused;
switch (family) {
case AF_INET:
#if defined(IP_TRANSPARENT)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &val, &len) == 0 && val)
return 1;
#endif
#if defined(IP_FREEBIND)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IP, IP_FREEBIND, &val, &len) == 0 && val)
return 1;
#endif
#if defined(IP_BINDANY)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IP, IP_BINDANY, &val, &len) == 0 && val)
return 1;
#endif
#if defined(SO_BINDANY)
val = 0; len = sizeof(val);
if (getsockopt(fd, SOL_SOCKET, SO_BINDANY, &val, &len) == 0 && val)
return 1;
#endif
break;
case AF_INET6:
#if defined(IPV6_TRANSPARENT)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &val, &len) == 0 && val)
return 1;
#endif
#if defined(IP_FREEBIND)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IP, IP_FREEBIND, &val, &len) == 0 && val)
return 1;
#endif
#if defined(IPV6_BINDANY)
val = 0; len = sizeof(val);
if (getsockopt(fd, IPPROTO_IPV6, IPV6_BINDANY, &val, &len) == 0 && val)
return 1;
#endif
#if defined(SO_BINDANY)
val = 0; len = sizeof(val);
if (getsockopt(fd, SOL_SOCKET, SO_BINDANY, &val, &len) == 0 && val)
return 1;
#endif
break;
}
return 0;
}
/* Tries, in turn, each socket option known at build time which allows an
 * AF_INET socket to be bound to a foreign address, and stops at the first one
 * which succeeds. The socket must already exist and must not be bound yet.
 * Returns 1 on success, 0 if every attempt failed or if no such option is
 * available. The caller must check the address family before calling this
 * function.
 */
int sock_inet4_make_foreign(int fd)
{
#if defined(IP_TRANSPARENT)
	if (setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(IP_FREEBIND)
	if (setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(IP_BINDANY)
	if (setsockopt(fd, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(SO_BINDANY)
	if (setsockopt(fd, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) == 0)
		return 1;
#endif
	return 0;
}
/* Tries, in turn, each socket option known at build time which allows an
 * AF_INET6 socket to be bound to a foreign address, and stops at the first
 * one which succeeds. The socket must already exist and must not be bound
 * yet. Returns 1 on success, 0 if every attempt failed or if no such option
 * is available. The caller must check the address family before calling this
 * function.
 */
int sock_inet6_make_foreign(int fd)
{
#if defined(IPV6_TRANSPARENT)
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(IP_FREEBIND)
	/* set at the IPPROTO_IP level even for IPv6 sockets */
	if (setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(IPV6_BINDANY)
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) == 0)
		return 1;
#endif
#if defined(SO_BINDANY)
	if (setsockopt(fd, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) == 0)
		return 1;
#endif
	return 0;
}
/* Binds receiver <rx>, and assigns rx->iocb and rx->owner as the callback and
 * context, respectively. Returns an error code made of ERR_* bits on failure
 * or ERR_NONE on success. On failure, an error message may be passed into
 * <errmsg>; when one was produced, it is suffixed with the address and port
 * that were being bound.
 *
 * A receiver already flagged RX_F_BOUND is left untouched and ERR_NONE is
 * returned. The setsockopt()-based options and the bind() call are only
 * applied to sockets created by this function, not to FDs which were already
 * assigned to the receiver or returned by sock_find_compatible_fd(); the
 * non-blocking mode is set in all cases. Failures to set SO_REUSEADDR, to make
 * the socket foreign or to bind to a device produce a message and ERR_ALERT or
 * ERR_WARN but do not prevent the receiver from being bound.
 */
int sock_inet_bind_receiver(struct receiver *rx, char **errmsg)
{
int fd, err, ext;
/* copy listener addr because sometimes we need to switch family */
struct sockaddr_storage addr_inet = rx->addr;
/* force to classic sock family, not AF_CUST_* */
addr_inet.ss_family = rx->proto->fam->sock_family;
/* ensure we never return garbage */
if (errmsg)
*errmsg = 0;
err = ERR_NONE;
/* nothing to do for a receiver which is already bound */
if (rx->flags & RX_F_BOUND)
return ERR_NONE;
if (rx->flags & RX_F_MUST_DUP) {
/* this is a secondary receiver that is an exact copy of a
 * reference which must already be bound (or has failed).
 * We'll try to dup() the other one's FD and take it. We
 * try hard not to reconfigure the socket since it's shared.
 */
BUG_ON(!rx->shard_info);
if (!(rx->shard_info->ref->flags & RX_F_BOUND)) {
/* it's assumed that the first one has already reported
 * the error, let's not spam with another one, and do
 * not set ERR_ALERT.
 */
err |= ERR_RETRYABLE;
goto bind_ret_err;
}
/* taking the other one's FD will result in it being marked
 * extern and being dup()ed. Let's mark the receiver as
 * inherited so that it properly bypasses all second-stage
 * setup and avoids being passed to new processes.
 */
rx->flags |= RX_F_INHERITED;
rx->fd = rx->shard_info->ref->fd;
}
/* if no FD was assigned yet, we'll have to either find a compatible
 * one or create a new one.
 */
if (rx->fd == -1)
rx->fd = sock_find_compatible_fd(rx);
/* if the receiver now has an fd assigned, then we were offered the fd
 * by an external process (most likely the parent), and we don't want
 * to create a new socket. However we still want to set a few flags on
 * the socket.
 */
fd = rx->fd;
/* <ext> is non-zero when the FD pre-existed: every setsockopt() and the
 * bind() below are then skipped.
 */
ext = (fd >= 0);
if (!ext) {
fd = my_socketat(rx->settings->netns, rx->proto->fam->sock_domain,
rx->proto->sock_type, rx->proto->sock_prot);
if (fd == -1) {
err |= ERR_RETRYABLE | ERR_ALERT;
memprintf(errmsg, "cannot create receiving socket (%s)", strerror(errno));
goto bind_return;
}
}
if (ext && fd < global.maxsock && fdtab[fd].owner) {
/* This FD was already bound so this means that it was already
 * known and registered before parsing, hence it's an inherited
 * FD. The only reason why it's already known here is that it
 * has been registered multiple times (multiple listeners on the
 * same, or a "shards" directive on the line). There cannot be
 * multiple listeners on one FD but at least we can create a
 * new one from the original one. We won't reconfigure it,
 * however, as this was already done for the first one.
 */
fd = dup(fd);
if (fd == -1) {
err |= ERR_RETRYABLE | ERR_ALERT;
memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno));
goto bind_return;
}
}
/* the FD is used as an index into fdtab[] so it must fit below maxsock */
if (fd >= global.maxsock) {
err |= ERR_FATAL | ERR_ABORT | ERR_ALERT;
memprintf(errmsg, "not enough free sockets (raise '-n' parameter)");
goto bind_close_return;
}
/* applied to both new and pre-existing FDs */
if (fd_set_nonblock(fd) == -1) {
err |= ERR_FATAL | ERR_ALERT;
memprintf(errmsg, "cannot make socket non-blocking");
goto bind_close_return;
}
if (!ext && setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) {
/* not fatal but should be reported */
memprintf(errmsg, "cannot do so_reuseaddr");
err |= ERR_ALERT;
}
#ifdef SO_REUSEPORT
/* OpenBSD and Linux 3.9 support this. As it's present in old libc versions of
 * Linux, it might return an error that we will silently ignore.
 */
if (!ext && (rx->proto->flags & PROTO_F_REUSEPORT_SUPPORTED))
setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
#endif
#ifdef SO_REUSEPORT_LB
/* FreeBSD 12 and above use this to load-balance incoming connections.
 * This is limited to 256 listeners per group however.
 */
if (!ext && (rx->proto->flags & PROTO_F_REUSEPORT_SUPPORTED))
setsockopt(fd, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one));
#endif
/* binding to a foreign address was requested: a failure to set it up is
 * reported but the bind() is still attempted.
 */
if (!ext && (rx->settings->options & RX_O_FOREIGN)) {
switch (addr_inet.ss_family) {
case AF_INET:
if (!sock_inet4_make_foreign(fd)) {
memprintf(errmsg, "cannot make receiving socket transparent");
err |= ERR_ALERT;
}
break;
case AF_INET6:
if (!sock_inet6_make_foreign(fd)) {
memprintf(errmsg, "cannot make receiving socket transparent");
err |= ERR_ALERT;
}
break;
}
}
#ifdef SO_BINDTODEVICE
/* Note: this might fail if not CAP_NET_RAW */
if (!ext && rx->settings->interface) {
if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
rx->settings->interface,
strlen(rx->settings->interface) + 1) == -1) {
memprintf(errmsg, "cannot bind receiver to device '%s' (%s)", rx->settings->interface, strerror(errno));
err |= ERR_WARN;
}
}
#endif
#if defined(IPV6_V6ONLY)
if (addr_inet.ss_family == AF_INET6 && !ext) {
/* Prepare to match the v6only option against what we really want. Note
 * that sadly the two options are not exclusive to each other and that
 * v6only is stronger than v4v6.
 */
if ((rx->settings->options & RX_O_V6ONLY) ||
(sock_inet6_v6only_default && !(rx->settings->options & RX_O_V4V6)))
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one));
else
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &zero, sizeof(zero));
}
#endif
if (!ext && bind(fd, (struct sockaddr *)&addr_inet, rx->proto->fam->sock_addrlen) == -1) {
err |= ERR_RETRYABLE | ERR_ALERT;
memprintf(errmsg, "cannot bind socket (%s)", strerror(errno));
goto bind_close_return;
}
/* success: attach the FD to the receiver and register it with its
 * owner and I/O callback.
 */
rx->fd = fd;
rx->flags |= RX_F_BOUND;
fd_insert(fd, rx->owner, rx->iocb, rx->bind_tgroup, rx->bind_thread);
/* for now, all regularly bound TCP listeners are exportable */
if (!(rx->flags & RX_F_INHERITED))
HA_ATOMIC_OR(&fdtab[fd].state, FD_EXPORTED);
/* common exit, also reached on success: if a message was produced, append
 * the address and port we were dealing with.
 */
bind_return:
if (errmsg && *errmsg) {
char pn[INET6_ADDRSTRLEN];
addr_to_str(&addr_inet, pn, sizeof(pn));
memprintf(errmsg, "%s for [%s:%d]", *errmsg, pn, get_host_port(&addr_inet));
}
/* silent exit: returns <err> without touching the message */
bind_ret_err:
return err;
/* error exit for the paths which hold an open FD: close it then format the
 * message. NOTE(review): when <fd> is a pre-existing FD which was not
 * dup()ed above, this closes the FD that rx->fd still references - confirm
 * this is intended.
 */
bind_close_return:
close(fd);
goto bind_return;
}
/* Probes the operating system's defaults using two throw-away TCP sockets,
 * one per address family, and records them into the global variables:
 *   - sock_inet_tcp_maxseg_default  : default MSS of a TCPv4 socket
 *   - sock_inet6_tcp_maxseg_default : default MSS of a TCPv6 socket
 *   - sock_inet6_v6only_default     : set to 1 if IPV6_V6ONLY is enabled by
 *                                     default on AF_INET6 sockets
 * Each value is only updated when the matching option exists at build time
 * and getsockopt() succeeds; otherwise the variable keeps its initial value.
 * Nothing is reported on failure, and a family for which no socket can be
 * created is simply skipped.
 */
static void sock_inet_prepare(void)
{
	int fd, val;
	socklen_t len;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd >= 0) {
#ifdef TCP_MAXSEG
		/* retrieve the OS' default mss for TCPv4 */
		len = sizeof(val);
		if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len) == 0)
			sock_inet_tcp_maxseg_default = val;
#endif
		close(fd);
	}

	fd = socket(AF_INET6, SOCK_STREAM, 0);
	if (fd >= 0) {
#if defined(IPV6_V6ONLY)
		/* retrieve the OS' bindv6only value */
		len = sizeof(val);
		if (getsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, &len) == 0 && val > 0)
			sock_inet6_v6only_default = 1;
#endif
#ifdef TCP_MAXSEG
		/* retrieve the OS' default mss for TCPv6 */
		len = sizeof(val);
		if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len) == 0)
			sock_inet6_tcp_maxseg_default = val;
#endif
		close(fd);
	}
}
/* Registers sock_inet_prepare() through the INITCALL mechanism for the
 * STG_PREPARE stage (see the definition of INITCALL0 for the exact
 * semantics).
 */
INITCALL0(STG_PREPARE, sock_inet_prepare);
/* Registers a build-options string listing the socket options usable for
 * transparent/foreign binding which were found at build time: each name is
 * appended only when the matching macro is defined, so the list is empty when
 * none of them is available. Presumably displayed along with the other build
 * options; see the definition of REGISTER_BUILD_OPTS.
 */
REGISTER_BUILD_OPTS("Built with transparent proxy support using:"
#if defined(IP_TRANSPARENT)
" IP_TRANSPARENT"
#endif
#if defined(IPV6_TRANSPARENT)
" IPV6_TRANSPARENT"
#endif
#if defined(IP_FREEBIND)
" IP_FREEBIND"
#endif
#if defined(IP_BINDANY)
" IP_BINDANY"
#endif
#if defined(IPV6_BINDANY)
" IPV6_BINDANY"
#endif
#if defined(SO_BINDANY)
" SO_BINDANY"
#endif
"");