haproxy/include/proto/connection.h

1151 lines
37 KiB
C
Raw Normal View History

/*
* include/proto/connection.h
* This file contains connection function prototypes
*
* Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, version 2.1
* exclusively.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _PROTO_CONNECTION_H
#define _PROTO_CONNECTION_H
#include <common/config.h>
#include <common/ist.h>
#include <common/memory.h>
#include <types/connection.h>
#include <types/listener.h>
#include <proto/fd.h>
#include <proto/obj_type.h>
#include <proto/task.h>
extern struct pool_head *pool_head_connection;
extern struct pool_head *pool_head_connstream;
extern struct xprt_ops *registered_xprt[XPRT_ENTRIES];
extern struct mux_proto_list mux_proto_list;
/* I/O callback for fd-based connections. It calls the read/write handlers
* provided by the connection's sock_ops.
*/
void conn_fd_handler(int fd);
/* receive a PROXY protocol header over a connection */
int conn_recv_proxy(struct connection *conn, int flag);
int make_proxy_line(char *buf, int buf_len, struct server *srv, struct connection *remote);
int make_proxy_line_v1(char *buf, int buf_len, struct sockaddr_storage *src, struct sockaddr_storage *dst);
int make_proxy_line_v2(char *buf, int buf_len, struct server *srv, struct connection *remote);
int conn_subscribe(struct connection *conn, int event_type, void *param);
int conn_unsubscribe(struct connection *conn, int event_type, void *param);
/* receive a NetScaler Client IP insertion header over a connection */
int conn_recv_netscaler_cip(struct connection *conn, int flag);
/* raw send() directly on the socket */
int conn_sock_send(struct connection *conn, const void *buf, int len, int flags);
/* drains any pending bytes from the socket */
int conn_sock_drain(struct connection *conn);
/* returns true is the transport layer is ready */
static inline int conn_xprt_ready(const struct connection *conn)
{
return (conn->flags & CO_FL_XPRT_READY);
}
/* returns true is the control layer is ready */
static inline int conn_ctrl_ready(const struct connection *conn)
{
return (conn->flags & CO_FL_CTRL_READY);
}
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
/* Calls the init() function of the transport layer if any and if not done yet,
* and sets the CO_FL_XPRT_READY flag to indicate it was properly initialized.
REORG: connection: rename the data layer the "transport layer" While working on the changes required to make the health checks use the new connections, it started to become obvious that some naming was not logical at all in the connections. Specifically, it is not logical to call the "data layer" the layer which is in charge for all the handshake and which does not yet provide a data layer once established until a session has allocated all the required buffers. In fact, it's more a transport layer, which makes much more sense. The transport layer offers a medium on which data can transit, and it offers the functions to move these data when the upper layer requests this. And it is the upper layer which iterates over the transport layer's functions to move data which should be called the data layer. The use case where it's obvious is with embryonic sessions : an incoming SSL connection is accepted. Only the connection is allocated, not the buffers nor stream interface, etc... The connection handles the SSL handshake by itself. Once this handshake is complete, we can't use the data functions because the buffers and stream interface are not there yet. Hence we have to first call a specific function to complete the session initialization, after which we'll be able to use the data functions. This clearly proves that SSL here is only a transport layer and that the stream interface constitutes the data layer. A similar change will be performed to rename app_cb => data, but the two could not be in the same commit for obvious reasons.
2012-10-02 22:19:48 +00:00
* Returns <0 in case of error.
*/
REORG: connection: rename the data layer the "transport layer" While working on the changes required to make the health checks use the new connections, it started to become obvious that some naming was not logical at all in the connections. Specifically, it is not logical to call the "data layer" the layer which is in charge for all the handshake and which does not yet provide a data layer once established until a session has allocated all the required buffers. In fact, it's more a transport layer, which makes much more sense. The transport layer offers a medium on which data can transit, and it offers the functions to move these data when the upper layer requests this. And it is the upper layer which iterates over the transport layer's functions to move data which should be called the data layer. The use case where it's obvious is with embryonic sessions : an incoming SSL connection is accepted. Only the connection is allocated, not the buffers nor stream interface, etc... The connection handles the SSL handshake by itself. Once this handshake is complete, we can't use the data functions because the buffers and stream interface are not there yet. Hence we have to first call a specific function to complete the session initialization, after which we'll be able to use the data functions. This clearly proves that SSL here is only a transport layer and that the stream interface constitutes the data layer. A similar change will be performed to rename app_cb => data, but the two could not be in the same commit for obvious reasons.
2012-10-02 22:19:48 +00:00
static inline int conn_xprt_init(struct connection *conn)
{
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
int ret = 0;
if (!conn_xprt_ready(conn) && conn->xprt && conn->xprt->init)
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
ret = conn->xprt->init(conn);
if (ret >= 0)
conn->flags |= CO_FL_XPRT_READY;
return ret;
}
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
/* Calls the close() function of the transport layer if any and if not done
* yet, and clears the CO_FL_XPRT_READY flag. However this is not done if the
* CO_FL_XPRT_TRACKED flag is set, which allows logs to take data from the
* transport layer very late if needed.
*/
REORG: connection: rename the data layer the "transport layer" While working on the changes required to make the health checks use the new connections, it started to become obvious that some naming was not logical at all in the connections. Specifically, it is not logical to call the "data layer" the layer which is in charge for all the handshake and which does not yet provide a data layer once established until a session has allocated all the required buffers. In fact, it's more a transport layer, which makes much more sense. The transport layer offers a medium on which data can transit, and it offers the functions to move these data when the upper layer requests this. And it is the upper layer which iterates over the transport layer's functions to move data which should be called the data layer. The use case where it's obvious is with embryonic sessions : an incoming SSL connection is accepted. Only the connection is allocated, not the buffers nor stream interface, etc... The connection handles the SSL handshake by itself. Once this handshake is complete, we can't use the data functions because the buffers and stream interface are not there yet. Hence we have to first call a specific function to complete the session initialization, after which we'll be able to use the data functions. This clearly proves that SSL here is only a transport layer and that the stream interface constitutes the data layer. A similar change will be performed to rename app_cb => data, but the two could not be in the same commit for obvious reasons.
2012-10-02 22:19:48 +00:00
static inline void conn_xprt_close(struct connection *conn)
{
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
if ((conn->flags & (CO_FL_XPRT_READY|CO_FL_XPRT_TRACKED)) == CO_FL_XPRT_READY) {
if (conn->xprt->close)
conn->xprt->close(conn);
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
conn->flags &= ~CO_FL_XPRT_READY;
}
}
/* Initializes the connection's control layer which essentially consists in
* registering the file descriptor for polling and setting the CO_FL_CTRL_READY
* flag. The caller is responsible for ensuring that the control layer is
* already assigned to the connection prior to the call.
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
*/
static inline void conn_ctrl_init(struct connection *conn)
{
if (!conn_ctrl_ready(conn)) {
int fd = conn->handle.fd;
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
fd_insert(fd, conn, conn_fd_handler, tid_bit);
MAJOR: polling: rework the whole polling system This commit heavily changes the polling system in order to definitely fix the frequent breakage of SSL which needs to remember the last EAGAIN before deciding whether to poll or not. Now we have a state per direction for each FD, as opposed to a previous and current state previously. An FD can have up to 8 different states for each direction, each of which being the result of a 3-bit combination. These 3 bits indicate a wish to access the FD, the readiness of the FD and the subscription of the FD to the polling system. This means that it will now be possible to remember the state of a file descriptor across disable/enable sequences that generally happen during forwarding, where enabling reading on a previously disabled FD would result in forgetting the EAGAIN flag it met last time. Several new state manipulation functions have been introduced or adapted : - fd_want_{recv,send} : enable receiving/sending on the FD regardless of its state (sets the ACTIVE flag) ; - fd_stop_{recv,send} : stop receiving/sending on the FD regardless of its state (clears the ACTIVE flag) ; - fd_cant_{recv,send} : report a failure to receive/send on the FD corresponding to EAGAIN (clears the READY flag) ; - fd_may_{recv,send} : report the ability to receive/send on the FD as reported by poll() (sets the READY flag) ; Some functions are used to report the current FD status : - fd_{recv,send}_active - fd_{recv,send}_ready - fd_{recv,send}_polled Some functions were removed : - fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai() The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers knows it can try to access the file descriptor to get this information. In order to simplify the conditions to add/remove cache entries, a new function fd_alloc_or_release_cache_entry() was created to be used from pollers while scanning for updates. The following pollers have been updated : ev_select() : done, built, tested on Linux 3.10 ev_poll() : done, built, tested on Linux 3.10 ev_epoll() : done, built, tested on Linux 3.10 & 3.13 ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 15:58:45 +00:00
/* mark the fd as ready so as not to needlessly poll at the beginning */
fd_may_recv(fd);
fd_may_send(fd);
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
conn->flags |= CO_FL_CTRL_READY;
}
}
/* Deletes the FD if the transport layer is already gone. Once done,
* it then removes the CO_FL_CTRL_READY flag.
*/
static inline void conn_ctrl_close(struct connection *conn)
{
if ((conn->flags & (CO_FL_XPRT_READY|CO_FL_CTRL_READY)) == CO_FL_CTRL_READY) {
fd_delete(conn->handle.fd);
conn->handle.fd = DEAD_FD_MAGIC;
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
conn->flags &= ~CO_FL_CTRL_READY;
}
}
/* If the connection still has a transport layer, then call its close() function
* if any, and delete the file descriptor if a control layer is set. This is
* used to close everything at once and atomically. However this is not done if
* the CO_FL_XPRT_TRACKED flag is set, which allows logs to take data from the
* transport layer very late if needed.
*/
static inline void conn_full_close(struct connection *conn)
{
MAJOR: connection: add two new flags to indicate readiness of control/transport Currently the control and transport layers of a connection are supposed to be initialized when their respective pointers are not NULL. This will not work anymore when we plan to reuse connections, because there is an asymmetry between the accept() side and the connect() side : - on accept() side, the fd is set first, then the ctrl layer then the transport layer ; upon error, they must be undone in the reverse order, then the FD must be closed. The FD must not be deleted if the control layer was not yet initialized ; - on the connect() side, the fd is set last and there is no reliable way to know if it has been initialized or not. In practice it's initialized to -1 first but this is hackish and supposes that local FDs only will be used forever. Also, there are even less solutions for keeping trace of the transport layer's state. Also it is possible to support delayed close() when something (eg: logs) tracks some information requiring the transport and/or control layers, making it even more difficult to clean them. So the proposed solution is to add two flags to the connection : - CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert) and cleared after it's released (fd_delete). - CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init) and cleared after it's released (xprt->close). The functions have been adapted to rely on this and not on the pointers anymore. conn_xprt_close() was unused and dangerous : it did not close the control layer (eg: the socket itself) but still marks the transport layer as closed, preventing any future call to conn_full_close() from finishing the job. The problem comes from conn_full_close() in fact. It needs to close the xprt and ctrl layers independantly. After that we're still having an issue : we don't know based on ->ctrl alone whether the fd was registered or not. For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what remains to be done on the connection. In order not to miss some flag assignments, we introduce conn_ctrl_init() to initialize the control layer, register the fd using fd_insert() and set the flag, and conn_ctrl_close() which unregisters the fd and removes the flag, but only if the transport layer was closed. Similarly, at the transport layer, conn_xprt_init() calls ->init and sets the flag, while conn_xprt_close() checks the flag, calls ->close and clears the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init and the ->close functions are called only once each and in the correct order. Note that conn_xprt_close() does nothing if the transport layer is still tracked. conn_full_close() now simply calls conn_xprt_close() then conn_full_close() in turn, which do nothing if CO_FL_XPRT_TRACKED is set. In order to handle the error path, we also provide conn_force_close() which ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers in turns. All relevant instances of fd_delete() have been replaced with conn_force_close(). Now we always know what state the connection is in and we can expect to split its initialization.
2013-10-21 14:30:56 +00:00
conn_xprt_close(conn);
conn_ctrl_close(conn);
}
/* stop tracking a connection, allowing conn_full_close() to always
* succeed.
*/
static inline void conn_stop_tracking(struct connection *conn)
{
conn->flags &= ~CO_FL_XPRT_TRACKED;
}
/* Update polling on connection <c>'s file descriptor depending on its current
* state as reported in the connection's CO_FL_CURR_* flags, reports of EAGAIN
* in CO_FL_WAIT_*, and the sock layer expectations indicated by CO_FL_SOCK_*.
* The connection flags are updated with the new flags at the end of the
* operation. Polling is totally disabled if an error was reported.
*/
void conn_update_sock_polling(struct connection *c);
/* Update polling on connection <c>'s file descriptor depending on its current
* state as reported in the connection's CO_FL_CURR_* flags, reports of EAGAIN
* in CO_FL_WAIT_*, and the upper layer expectations indicated by CO_FL_XPRT_*.
* The connection flags are updated with the new flags at the end of the
* operation. Polling is totally disabled if an error was reported.
*/
void conn_update_xprt_polling(struct connection *c);
/* Refresh the connection's polling flags from its file descriptor status.
* This should be called at the beginning of a connection handler. It does
* nothing if CO_FL_WILL_UPDATE is present, indicating that an upper caller
* has already done it.
*/
static inline void conn_refresh_polling_flags(struct connection *conn)
{
if (conn_ctrl_ready(conn) && !(conn->flags & CO_FL_WILL_UPDATE)) {
unsigned int flags = conn->flags;
flags &= ~(CO_FL_CURR_RD_ENA | CO_FL_CURR_WR_ENA | CO_FL_WAIT_ROOM);
if (fd_recv_active(conn->handle.fd))
flags |= CO_FL_CURR_RD_ENA;
if (fd_send_active(conn->handle.fd))
flags |= CO_FL_CURR_WR_ENA;
conn->flags = flags;
}
}
/* inspects c->flags and returns non-zero if XPRT ENA changes from the CURR ENA
* or if the WAIT flags are set with their respective ENA flags. Additionally,
* non-zero is also returned if an error was reported on the connection. This
* function is used quite often and is inlined. In order to proceed optimally
* with very little code and CPU cycles, the bits are arranged so that a change
* can be detected by a few left shifts, a xor, and a mask. These operations
* detect when W&D are both enabled for either direction, when C&D differ for
* either direction and when Error is set. The trick consists in first keeping
* only the bits we're interested in, since they don't collide when shifted,
* and to perform the AND at the end. In practice, the compiler is able to
* replace the last AND with a TEST in boolean conditions. This results in
* checks that are done in 4-6 cycles and less than 30 bytes.
*/
static inline unsigned int conn_xprt_polling_changes(const struct connection *c)
{
unsigned int f = c->flags;
f &= CO_FL_XPRT_WR_ENA | CO_FL_XPRT_RD_ENA | CO_FL_CURR_WR_ENA |
CO_FL_CURR_RD_ENA | CO_FL_ERROR;
f = (f ^ (f << 1)) & (CO_FL_CURR_WR_ENA|CO_FL_CURR_RD_ENA); /* test C ^ D */
return f & (CO_FL_CURR_WR_ENA | CO_FL_CURR_RD_ENA | CO_FL_ERROR);
}
/* inspects c->flags and returns non-zero if SOCK ENA changes from the CURR ENA
* or if the WAIT flags are set with their respective ENA flags. Additionally,
* non-zero is also returned if an error was reported on the connection. This
* function is used quite often and is inlined. In order to proceed optimally
* with very little code and CPU cycles, the bits are arranged so that a change
* can be detected by a few left shifts, a xor, and a mask. These operations
* detect when W&S are both enabled for either direction, when C&S differ for
* either direction and when Error is set. The trick consists in first keeping
* only the bits we're interested in, since they don't collide when shifted,
* and to perform the AND at the end. In practice, the compiler is able to
* replace the last AND with a TEST in boolean conditions. This results in
* checks that are done in 4-6 cycles and less than 30 bytes.
*/
static inline unsigned int conn_sock_polling_changes(const struct connection *c)
{
unsigned int f = c->flags;
f &= CO_FL_SOCK_WR_ENA | CO_FL_SOCK_RD_ENA | CO_FL_CURR_WR_ENA |
CO_FL_CURR_RD_ENA | CO_FL_ERROR;
f = (f ^ (f << 2)) & (CO_FL_CURR_WR_ENA|CO_FL_CURR_RD_ENA); /* test C ^ S */
return f & (CO_FL_CURR_WR_ENA | CO_FL_CURR_RD_ENA | CO_FL_ERROR);
}
/* Automatically updates polling on connection <c> depending on the XPRT flags
* if no handshake is in progress. It does nothing if CO_FL_WILL_UPDATE is
* present, indicating that an upper caller is going to do it again later.
*/
static inline void conn_cond_update_xprt_polling(struct connection *c)
{
if (!(c->flags & CO_FL_WILL_UPDATE))
if (!(c->flags & CO_FL_POLL_SOCK) && conn_xprt_polling_changes(c))
conn_update_xprt_polling(c);
}
/* Automatically updates polling on connection <c> depending on the SOCK flags
* if a handshake is in progress. It does nothing if CO_FL_WILL_UPDATE is
* present, indicating that an upper caller is going to do it again later.
*/
static inline void conn_cond_update_sock_polling(struct connection *c)
{
if (!(c->flags & CO_FL_WILL_UPDATE))
if ((c->flags & CO_FL_POLL_SOCK) && conn_sock_polling_changes(c))
conn_update_sock_polling(c);
}
/* Stop all polling on the fd. This might be used when an error is encountered
* for example. It does not propage the change to the fd layer if
* CO_FL_WILL_UPDATE is present, indicating that an upper caller is going to do
* it later.
*/
static inline void conn_stop_polling(struct connection *c)
{
c->flags &= ~(CO_FL_CURR_RD_ENA | CO_FL_CURR_WR_ENA |
CO_FL_SOCK_RD_ENA | CO_FL_SOCK_WR_ENA |
CO_FL_XPRT_RD_ENA | CO_FL_XPRT_WR_ENA);
if (!(c->flags & CO_FL_WILL_UPDATE) && conn_ctrl_ready(c))
fd_stop_both(c->handle.fd);
}
/* Automatically update polling on connection <c> depending on the XPRT and
* SOCK flags, and on whether a handshake is in progress or not. This may be
* called at any moment when there is a doubt about the effectiveness of the
* polling state, for instance when entering or leaving the handshake state.
* It does nothing if CO_FL_WILL_UPDATE is present, indicating that an upper
* caller is going to do it again later.
*/
static inline void conn_cond_update_polling(struct connection *c)
{
if (unlikely(c->flags & CO_FL_ERROR))
conn_stop_polling(c);
else if (!(c->flags & CO_FL_WILL_UPDATE)) {
if (!(c->flags & CO_FL_POLL_SOCK) && conn_xprt_polling_changes(c))
conn_update_xprt_polling(c);
else if ((c->flags & CO_FL_POLL_SOCK) && conn_sock_polling_changes(c))
conn_update_sock_polling(c);
}
}
/***** Event manipulation primitives for use by DATA I/O callbacks *****/
/* The __conn_* versions do not propagate to lower layers and are only meant
* to be used by handlers called by the connection handler. The other ones
* may be used anywhere.
*/
static inline void __conn_xprt_want_recv(struct connection *c)
{
c->flags |= CO_FL_XPRT_RD_ENA;
}
static inline void __conn_xprt_stop_recv(struct connection *c)
{
c->flags &= ~CO_FL_XPRT_RD_ENA;
}
OPTIM: stream-int: don't disable polling anymore on DONT_READ Commit 5fddab0 ("OPTIM: stream_interface: disable reading when CF_READ_DONTWAIT is set") improved the connection layer's efficiency back in 1.5-dev13 by avoiding successive read attempts on an active FD. But by disabling this on a polled FD, it causes an unpleasant side effect which is that the FD that was subscribed to polling is suddenly stopped and may need to be re-enabled once the kernel starts to slow down on data eviction (eg: saturated server at the other end, bursty traffic caused by too large maxpollevents). This behaviour is observable with persistent connections when there is a large enough connection count so that there's no data in the early connection and polling is required, because there are then up to 4 epoll_ctl() calls per request. It's important that the server is slower than haproxy to cause some delays when reading response. The current connection layer as designed in 1.6 with the FD cache doesn't require this trick anymore, though it still benefits from it when it saves an FD from being uselessly polled. But compared to the increased cost of enabling and disabling poll all the time, it's still better to disable it. In some cases it's possible to observe a performance increase as high as 30% by avoiding this epoll_ctl() dance. In the end we only want to disable it when the FD is speculatively read and not when it's polled. For this we introduce a new function __conn_data_done_recv() which is used to indicate that we're done with recv() and not interested in new attempts. If/when we later support event-triggered epoll, this function will have to change a bit to do the same even in the polled case. A quick test with keep-alive requests run on a dual-core / dual- thread Atom shows a significant improvement : single process, 0 bytes : before: Requests per second: 12243.20 [#/sec] (mean) after: Requests per second: 13354.54 [#/sec] (mean) single process, 4k : before: Requests per second: 9639.81 [#/sec] (mean) after: Requests per second: 10991.89 [#/sec] (mean) dual process, 0 bytes (unstable) : before: Requests per second: 16900-19800 ~ 17600 [#/sec] (mean) after: Requests per second: 18600-21400 ~ 20500 [#/sec] (mean)
2016-11-29 20:47:02 +00:00
/* this one is used only to stop speculative recv(). It doesn't stop it if the
* fd is already polled in order to avoid expensive polling status changes.
* Since it might require the upper layer to re-enable reading, we'll return 1
* if we've really stopped something otherwise zero.
*/
static inline int __conn_xprt_done_recv(struct connection *c)
OPTIM: stream-int: don't disable polling anymore on DONT_READ Commit 5fddab0 ("OPTIM: stream_interface: disable reading when CF_READ_DONTWAIT is set") improved the connection layer's efficiency back in 1.5-dev13 by avoiding successive read attempts on an active FD. But by disabling this on a polled FD, it causes an unpleasant side effect which is that the FD that was subscribed to polling is suddenly stopped and may need to be re-enabled once the kernel starts to slow down on data eviction (eg: saturated server at the other end, bursty traffic caused by too large maxpollevents). This behaviour is observable with persistent connections when there is a large enough connection count so that there's no data in the early connection and polling is required, because there are then up to 4 epoll_ctl() calls per request. It's important that the server is slower than haproxy to cause some delays when reading response. The current connection layer as designed in 1.6 with the FD cache doesn't require this trick anymore, though it still benefits from it when it saves an FD from being uselessly polled. But compared to the increased cost of enabling and disabling poll all the time, it's still better to disable it. In some cases it's possible to observe a performance increase as high as 30% by avoiding this epoll_ctl() dance. In the end we only want to disable it when the FD is speculatively read and not when it's polled. For this we introduce a new function __conn_data_done_recv() which is used to indicate that we're done with recv() and not interested in new attempts. If/when we later support event-triggered epoll, this function will have to change a bit to do the same even in the polled case. A quick test with keep-alive requests run on a dual-core / dual- thread Atom shows a significant improvement : single process, 0 bytes : before: Requests per second: 12243.20 [#/sec] (mean) after: Requests per second: 13354.54 [#/sec] (mean) single process, 4k : before: Requests per second: 9639.81 [#/sec] (mean) after: Requests per second: 10991.89 [#/sec] (mean) dual process, 0 bytes (unstable) : before: Requests per second: 16900-19800 ~ 17600 [#/sec] (mean) after: Requests per second: 18600-21400 ~ 20500 [#/sec] (mean)
2016-11-29 20:47:02 +00:00
{
if (!conn_ctrl_ready(c) || !fd_recv_polled(c->handle.fd)) {
c->flags &= ~CO_FL_XPRT_RD_ENA;
OPTIM: stream-int: don't disable polling anymore on DONT_READ Commit 5fddab0 ("OPTIM: stream_interface: disable reading when CF_READ_DONTWAIT is set") improved the connection layer's efficiency back in 1.5-dev13 by avoiding successive read attempts on an active FD. But by disabling this on a polled FD, it causes an unpleasant side effect which is that the FD that was subscribed to polling is suddenly stopped and may need to be re-enabled once the kernel starts to slow down on data eviction (eg: saturated server at the other end, bursty traffic caused by too large maxpollevents). This behaviour is observable with persistent connections when there is a large enough connection count so that there's no data in the early connection and polling is required, because there are then up to 4 epoll_ctl() calls per request. It's important that the server is slower than haproxy to cause some delays when reading response. The current connection layer as designed in 1.6 with the FD cache doesn't require this trick anymore, though it still benefits from it when it saves an FD from being uselessly polled. But compared to the increased cost of enabling and disabling poll all the time, it's still better to disable it. In some cases it's possible to observe a performance increase as high as 30% by avoiding this epoll_ctl() dance. In the end we only want to disable it when the FD is speculatively read and not when it's polled. For this we introduce a new function __conn_data_done_recv() which is used to indicate that we're done with recv() and not interested in new attempts. If/when we later support event-triggered epoll, this function will have to change a bit to do the same even in the polled case. A quick test with keep-alive requests run on a dual-core / dual- thread Atom shows a significant improvement : single process, 0 bytes : before: Requests per second: 12243.20 [#/sec] (mean) after: Requests per second: 13354.54 [#/sec] (mean) single process, 4k : before: Requests per second: 9639.81 [#/sec] (mean) after: Requests per second: 10991.89 [#/sec] (mean) dual process, 0 bytes (unstable) : before: Requests per second: 16900-19800 ~ 17600 [#/sec] (mean) after: Requests per second: 18600-21400 ~ 20500 [#/sec] (mean)
2016-11-29 20:47:02 +00:00
return 1;
}
return 0;
}
static inline void __conn_xprt_want_send(struct connection *c)
{
c->flags |= CO_FL_XPRT_WR_ENA;
}
static inline void __conn_xprt_stop_send(struct connection *c)
{
c->flags &= ~CO_FL_XPRT_WR_ENA;
}
static inline void __conn_xprt_stop_both(struct connection *c)
{
c->flags &= ~(CO_FL_XPRT_WR_ENA | CO_FL_XPRT_RD_ENA);
}
static inline void conn_xprt_want_recv(struct connection *c)
{
__conn_xprt_want_recv(c);
conn_cond_update_xprt_polling(c);
}
static inline void conn_xprt_stop_recv(struct connection *c)
{
__conn_xprt_stop_recv(c);
conn_cond_update_xprt_polling(c);
}
static inline void conn_xprt_want_send(struct connection *c)
{
__conn_xprt_want_send(c);
conn_cond_update_xprt_polling(c);
}
static inline void conn_xprt_stop_send(struct connection *c)
{
__conn_xprt_stop_send(c);
conn_cond_update_xprt_polling(c);
}
static inline void conn_xprt_stop_both(struct connection *c)
{
__conn_xprt_stop_both(c);
conn_cond_update_xprt_polling(c);
}
/***** Event manipulation primitives for use by handshake I/O callbacks *****/
/* The __conn_* versions do not propagate to lower layers and are only meant
* to be used by handlers called by the connection handler. The other ones
* may be used anywhere.
*/
static inline void __conn_sock_want_recv(struct connection *c)
{
c->flags |= CO_FL_SOCK_RD_ENA;
}
static inline void __conn_sock_stop_recv(struct connection *c)
{
c->flags &= ~CO_FL_SOCK_RD_ENA;
}
static inline void __conn_sock_want_send(struct connection *c)
{
c->flags |= CO_FL_SOCK_WR_ENA;
}
static inline void __conn_sock_stop_send(struct connection *c)
{
c->flags &= ~CO_FL_SOCK_WR_ENA;
}
static inline void __conn_sock_stop_both(struct connection *c)
{
c->flags &= ~(CO_FL_SOCK_WR_ENA | CO_FL_SOCK_RD_ENA);
}
static inline void conn_sock_want_recv(struct connection *c)
{
__conn_sock_want_recv(c);
conn_cond_update_sock_polling(c);
}
static inline void conn_sock_stop_recv(struct connection *c)
{
__conn_sock_stop_recv(c);
conn_cond_update_sock_polling(c);
}
static inline void conn_sock_want_send(struct connection *c)
{
__conn_sock_want_send(c);
conn_cond_update_sock_polling(c);
}
static inline void conn_sock_stop_send(struct connection *c)
{
__conn_sock_stop_send(c);
conn_cond_update_sock_polling(c);
}
static inline void conn_sock_stop_both(struct connection *c)
{
__conn_sock_stop_both(c);
conn_cond_update_sock_polling(c);
}
/* read shutdown, called from the rcv_buf/rcv_pipe handlers when
* detecting an end of connection.
*/
static inline void conn_sock_read0(struct connection *c)
{
c->flags |= CO_FL_SOCK_RD_SH;
__conn_sock_stop_recv(c);
/* we don't risk keeping ports unusable if we found the
* zero from the other side.
*/
if (conn_ctrl_ready(c))
fdtab[c->handle.fd].linger_risk = 0;
}
/* write shutdown, indication that the upper layer is not willing to send
BUG/MAJOR: connection: refine the situations where we don't send shutw() Since commit f9ce57e ("MEDIUM: connection: make conn_sock_shutw() aware of lingering"), we refrain from performing the shutw() on the socket if there is no lingering risk. But there is a problem with this in tunnel and in TCP modes where a client is explicitly allowed to send a shutw to the server, eventhough it it risky. Not doing it creates this situation reported by Ricardo Fraile and diagnosed by Christopher : a typical HTTP client (eg: curl) connecting via the config below to an HTTP server would receive its response, immediately close while the server remains in keep-alive mode. The shutr() received by haproxy from the client is "propagated" to the server side but not acted upon because fdtab[fd].linger_risk is set, so we expect that the next close will immediately complete this operation. listen proxy-tcp bind 127.0.0.1:8888 mode tcp timeout connect 5s timeout server 10s timeout client 10s server server1 127.0.0.1:8000 But since the whole stream will not end until the server closes in turn, the server doesn't close and haproxy expires on server timeout. This problem has already struck by waking up an older bug and was partially fixed with commit 8059351 ("BUG/MEDIUM: http: don't disable lingering on requests with tunnelled responses") though it was not enough. The problem is that linger_risk is not suited here. In fact we need to know whether or not it is desired to close normally or silently, and whether or not a shutr() has already been received on this connection. This is the approach this patch takes, and it solves the problem for the various difficult modes (tcp, http-server-close, pretend-keepalive). This fix needs to be backported to 1.8. Many thanks to Ricardo for providing very detailed traces and configurations.
2017-12-22 17:46:33 +00:00
* anything anymore and wants to close after pending data are sent. The
* <clean> argument will allow not to perform the socket layer shutdown if
* equal to 0.
*/
BUG/MAJOR: connection: refine the situations where we don't send shutw() Since commit f9ce57e ("MEDIUM: connection: make conn_sock_shutw() aware of lingering"), we refrain from performing the shutw() on the socket if there is no lingering risk. But there is a problem with this in tunnel and in TCP modes where a client is explicitly allowed to send a shutw to the server, eventhough it it risky. Not doing it creates this situation reported by Ricardo Fraile and diagnosed by Christopher : a typical HTTP client (eg: curl) connecting via the config below to an HTTP server would receive its response, immediately close while the server remains in keep-alive mode. The shutr() received by haproxy from the client is "propagated" to the server side but not acted upon because fdtab[fd].linger_risk is set, so we expect that the next close will immediately complete this operation. listen proxy-tcp bind 127.0.0.1:8888 mode tcp timeout connect 5s timeout server 10s timeout client 10s server server1 127.0.0.1:8000 But since the whole stream will not end until the server closes in turn, the server doesn't close and haproxy expires on server timeout. This problem has already struck by waking up an older bug and was partially fixed with commit 8059351 ("BUG/MEDIUM: http: don't disable lingering on requests with tunnelled responses") though it was not enough. The problem is that linger_risk is not suited here. In fact we need to know whether or not it is desired to close normally or silently, and whether or not a shutr() has already been received on this connection. This is the approach this patch takes, and it solves the problem for the various difficult modes (tcp, http-server-close, pretend-keepalive). This fix needs to be backported to 1.8. Many thanks to Ricardo for providing very detailed traces and configurations.
2017-12-22 17:46:33 +00:00
static inline void conn_sock_shutw(struct connection *c, int clean)
{
c->flags |= CO_FL_SOCK_WR_SH;
conn_refresh_polling_flags(c);
__conn_sock_stop_send(c);
conn_cond_update_sock_polling(c);
BUG/MAJOR: connection: refine the situations where we don't send shutw() Since commit f9ce57e ("MEDIUM: connection: make conn_sock_shutw() aware of lingering"), we refrain from performing the shutw() on the socket if there is no lingering risk. But there is a problem with this in tunnel and in TCP modes where a client is explicitly allowed to send a shutw to the server, eventhough it it risky. Not doing it creates this situation reported by Ricardo Fraile and diagnosed by Christopher : a typical HTTP client (eg: curl) connecting via the config below to an HTTP server would receive its response, immediately close while the server remains in keep-alive mode. The shutr() received by haproxy from the client is "propagated" to the server side but not acted upon because fdtab[fd].linger_risk is set, so we expect that the next close will immediately complete this operation. listen proxy-tcp bind 127.0.0.1:8888 mode tcp timeout connect 5s timeout server 10s timeout client 10s server server1 127.0.0.1:8000 But since the whole stream will not end until the server closes in turn, the server doesn't close and haproxy expires on server timeout. This problem has already struck by waking up an older bug and was partially fixed with commit 8059351 ("BUG/MEDIUM: http: don't disable lingering on requests with tunnelled responses") though it was not enough. The problem is that linger_risk is not suited here. In fact we need to know whether or not it is desired to close normally or silently, and whether or not a shutr() has already been received on this connection. This is the approach this patch takes, and it solves the problem for the various difficult modes (tcp, http-server-close, pretend-keepalive). This fix needs to be backported to 1.8. Many thanks to Ricardo for providing very detailed traces and configurations.
2017-12-22 17:46:33 +00:00
/* don't perform a clean shutdown if we're going to reset or
* if the shutr was already received.
*/
if (conn_ctrl_ready(c) && !(c->flags & CO_FL_SOCK_RD_SH) && clean)
shutdown(c->handle.fd, SHUT_WR);
}
static inline void conn_xprt_shutw(struct connection *c)
{
__conn_xprt_stop_send(c);
/* clean data-layer shutdown */
if (c->xprt && c->xprt->shutw)
c->xprt->shutw(c, 1);
}
static inline void conn_xprt_shutw_hard(struct connection *c)
{
__conn_xprt_stop_send(c);
/* unclean data-layer shutdown */
if (c->xprt && c->xprt->shutw)
c->xprt->shutw(c, 0);
}
/* shut read */
static inline void cs_shutr(struct conn_stream *cs, enum cs_shr_mode mode)
{
/* clean data-layer shutdown */
if (cs->conn->mux && cs->conn->mux->shutr)
cs->conn->mux->shutr(cs, mode);
cs->flags |= (mode == CS_SHR_DRAIN) ? CS_FL_SHRD : CS_FL_SHRR;
}
/* shut write */
static inline void cs_shutw(struct conn_stream *cs, enum cs_shw_mode mode)
{
/* clean data-layer shutdown */
if (cs->conn->mux && cs->conn->mux->shutw)
cs->conn->mux->shutw(cs, mode);
cs->flags |= (mode == CS_SHW_NORMAL) ? CS_FL_SHWN : CS_FL_SHWS;
}
/* completely close a conn_stream (but do not detach it) */
static inline void cs_close(struct conn_stream *cs)
{
cs_shutw(cs, CS_SHW_SILENT);
cs_shutr(cs, CS_SHR_RESET);
cs->flags = CS_FL_NONE;
}
/* detect sock->data read0 transition */
static inline int conn_xprt_read0_pending(struct connection *c)
{
return (c->flags & CO_FL_SOCK_RD_SH) != 0;
}
/* prepares a connection to work with protocol <proto> and transport <xprt>.
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
* The transport's is initialized as well, and the mux and its context are
* cleared. The target is not reinitialized and it is recommended that it is
* set prior to calling this function so that the function may make use of it
* in the future to refine the mux choice if needed.
*/
static inline void conn_prepare(struct connection *conn, const struct protocol *proto, const struct xprt_ops *xprt)
{
conn->ctrl = proto;
conn->xprt = xprt;
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
conn->mux = NULL;
conn->xprt_st = 0;
conn->xprt_ctx = NULL;
conn->ctx = NULL;
}
/*
* Initializes all required fields for a new conn_strema.
*/
static inline void cs_init(struct conn_stream *cs, struct connection *conn)
{
cs->obj_type = OBJ_TYPE_CS;
cs->flags = CS_FL_NONE;
cs->conn = conn;
}
/* Initializes all required fields for a new connection. Note that it does the
* minimum acceptable initialization for a connection that already exists and
* is about to be reused. It also leaves the addresses untouched, which makes
* it usable across connection retries to reset a connection to a known state.
*/
static inline void conn_init(struct connection *conn)
{
conn->obj_type = OBJ_TYPE_CONN;
conn->flags = CO_FL_NONE;
conn->tmp_early_data = -1;
conn->sent_early_data = 0;
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
conn->mux = NULL;
conn->ctx = NULL;
conn->owner = NULL;
conn->send_proxy_ofs = 0;
conn->handle.fd = DEAD_FD_MAGIC;
conn->err_code = CO_ER_NONE;
conn->target = NULL;
conn->xprt_done_cb = NULL;
conn->destroy_cb = NULL;
MAJOR: namespace: add Linux network namespace support This patch makes it possible to create binds and servers in separate namespaces. This can be used to proxy between multiple completely independent virtual networks (with possibly overlapping IP addresses) and a non-namespace-aware proxy implementation that supports the proxy protocol (v2). The setup is something like this: net1 on VLAN 1 (namespace 1) -\ net2 on VLAN 2 (namespace 2) -- haproxy ==== proxy (namespace 0) net3 on VLAN 3 (namespace 3) -/ The proxy is configured to make server connections through haproxy and sending the expected source/target addresses to haproxy using the proxy protocol. The network namespace setup on the haproxy node is something like this: = 8< = $ cat setup.sh ip netns add 1 ip link add link eth1 type vlan id 1 ip link set eth1.1 netns 1 ip netns exec 1 ip addr add 192.168.91.2/24 dev eth1.1 ip netns exec 1 ip link set eth1.$id up ... = 8< = = 8< = $ cat haproxy.cfg frontend clients bind 127.0.0.1:50022 namespace 1 transparent default_backend scb backend server mode tcp server server1 192.168.122.4:2222 namespace 2 send-proxy-v2 = 8< = A bind line creates the listener in the specified namespace, and connections originating from that listener also have their network namespace set to that of the listener. A server line either forces the connection to be made in a specified namespace or may use the namespace from the client-side connection if that was set. For more documentation please read the documentation included in the patch itself. Signed-off-by: KOVACS Tamas <ktamas@balabit.com> Signed-off-by: Sarkozi Laszlo <laszlo.sarkozi@balabit.com> Signed-off-by: KOVACS Krisztian <hidden@balabit.com>
2014-11-17 14:11:45 +00:00
conn->proxy_netns = NULL;
LIST_INIT(&conn->list);
LIST_INIT(&conn->session_list);
conn->send_wait = NULL;
conn->recv_wait = NULL;
conn->idle_time = 0;
}
/* sets <owner> as the connection's owner */
static inline void conn_set_owner(struct connection *conn, void *owner, void (*cb)(struct connection *))
{
conn->owner = owner;
conn->destroy_cb = cb;
}
/* registers <cb> as a callback to notify for transport's readiness or failure */
static inline void conn_set_xprt_done_cb(struct connection *conn, int (*cb)(struct connection *))
{
conn->xprt_done_cb = cb;
}
/* unregisters the callback to notify for transport's readiness or failure */
static inline void conn_clear_xprt_done_cb(struct connection *conn)
{
conn->xprt_done_cb = NULL;
}
/* Tries to allocate a new connection and initialized its main fields. The
* connection is returned on success, NULL on failure. The connection must
* be released using pool_free() or conn_free().
*/
static inline struct connection *conn_new()
{
struct connection *conn;
conn = pool_alloc(pool_head_connection);
if (likely(conn != NULL))
conn_init(conn);
return conn;
}
/* Releases a conn_stream previously allocated by cs_new(), as well as any
* buffer it would still hold.
*/
static inline void cs_free(struct conn_stream *cs)
{
pool_free(pool_head_connstream, cs);
}
/* Tries to allocate a new conn_stream and initialize its main fields. If
* <conn> is NULL, then a new connection is allocated on the fly, initialized,
* and assigned to cs->conn ; this connection will then have to be released
* using pool_free() or conn_free(). The conn_stream is initialized and added
* to the mux's stream list on success, then returned. On failure, nothing is
* allocated and NULL is returned.
*/
static inline struct conn_stream *cs_new(struct connection *conn)
{
struct conn_stream *cs;
cs = pool_alloc(pool_head_connstream);
if (!likely(cs))
return NULL;
if (!conn) {
conn = conn_new();
if (!likely(conn)) {
cs_free(cs);
return NULL;
}
conn_init(conn);
}
cs_init(cs, conn);
return cs;
}
/* Retrieves any valid conn_stream from this connection, preferably the first
* valid one. The purpose is to be able to figure one other end of a private
* connection for purposes like source binding or proxy protocol header
* emission. In such cases, any conn_stream is expected to be valid so the
* mux is encouraged to return the first one it finds. If the connection has
* no mux or the mux has no get_first_cs() method or the mux has no valid
* conn_stream, NULL is returned. The output pointer is purposely marked
* const to discourage the caller from modifying anything there.
*/
static inline const struct conn_stream *cs_get_first(const struct connection *conn)
{
if (!conn || !conn->mux || !conn->mux->get_first_cs)
return NULL;
return conn->mux->get_first_cs(conn);
}
static inline void conn_force_unsubscribe(struct connection *conn)
{
if (conn->recv_wait) {
conn->recv_wait->events &= ~SUB_RETRY_RECV;
conn->recv_wait = NULL;
}
if (conn->send_wait) {
conn->send_wait->events &= ~SUB_RETRY_SEND;
conn->send_wait = NULL;
}
}
/* Releases a connection previously allocated by conn_new() */
static inline void conn_free(struct connection *conn)
{
/* Remove ourself from the session's connections list, if any. */
if (!LIST_ISEMPTY(&conn->session_list)) {
struct session *sess = conn->owner;
sess->resp_conns--;
LIST_DEL(&conn->session_list);
}
/* By convention we always place a NULL where the ctx points to if the
* mux is null. It may have been used to store the connection as a
* stream_interface's end point for example.
*/
if (conn->ctx != NULL && conn->mux == NULL)
*(void **)conn->ctx = NULL;
/* The connection is currently in the server's idle list, so tell it
* there's one less connection available in that list.
*/
if (conn->idle_time > 0) {
struct server *srv = __objt_server(conn->target);
srv->curr_idle_conns--;
}
conn_force_unsubscribe(conn);
LIST_DEL(&conn->list);
LIST_INIT(&conn->list);
pool_free(pool_head_connection, conn);
}
/* Release a conn_stream */
static inline void cs_destroy(struct conn_stream *cs)
{
if (cs->conn->mux)
cs->conn->mux->detach(cs);
else {
/* It's too early to have a mux, let's just destroy
* the connection
*/
struct connection *conn = cs->conn;
conn_stop_tracking(conn);
conn_full_close(conn);
if (conn->destroy_cb)
conn->destroy_cb(conn);
conn_free(conn);
}
cs_free(cs);
}
/* Returns the conn from a cs. If cs is NULL, returns NULL */
static inline struct connection *cs_conn(const struct conn_stream *cs)
{
return cs ? cs->conn : NULL;
}
/* Retrieves the connection's source address */
static inline void conn_get_from_addr(struct connection *conn)
{
if (conn->flags & CO_FL_ADDR_FROM_SET)
return;
if (!conn_ctrl_ready(conn) || !conn->ctrl->get_src)
return;
if (conn->ctrl->get_src(conn->handle.fd, (struct sockaddr *)&conn->addr.from,
sizeof(conn->addr.from),
obj_type(conn->target) != OBJ_TYPE_LISTENER) == -1)
return;
conn->flags |= CO_FL_ADDR_FROM_SET;
}
/* Retrieves the connection's original destination address */
static inline void conn_get_to_addr(struct connection *conn)
{
if (conn->flags & CO_FL_ADDR_TO_SET)
return;
if (!conn_ctrl_ready(conn) || !conn->ctrl->get_dst)
return;
if (conn->ctrl->get_dst(conn->handle.fd, (struct sockaddr *)&conn->addr.to,
sizeof(conn->addr.to),
obj_type(conn->target) != OBJ_TYPE_LISTENER) == -1)
return;
conn->flags |= CO_FL_ADDR_TO_SET;
}
/* Sets the TOS header in IPv4 and the traffic class header in IPv6 packets
* (as per RFC3260 #4 and BCP37 #4.2 and #5.2). The connection is tested and if
* it is null, nothing is done.
*/
static inline void conn_set_tos(const struct connection *conn, int tos)
{
if (!conn || !conn_ctrl_ready(conn))
return;
#ifdef IP_TOS
if (conn->addr.from.ss_family == AF_INET)
setsockopt(conn->handle.fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
#endif
#ifdef IPV6_TCLASS
if (conn->addr.from.ss_family == AF_INET6) {
if (IN6_IS_ADDR_V4MAPPED(&((struct sockaddr_in6 *)&conn->addr.from)->sin6_addr))
/* v4-mapped addresses need IP_TOS */
setsockopt(conn->handle.fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
else
setsockopt(conn->handle.fd, IPPROTO_IPV6, IPV6_TCLASS, &tos, sizeof(tos));
}
#endif
}
/* Sets the netfilter mark on the connection's socket. The connection is tested
* and if it is null, nothing is done.
*/
static inline void conn_set_mark(const struct connection *conn, int mark)
{
if (!conn || !conn_ctrl_ready(conn))
return;
#ifdef SO_MARK
setsockopt(conn->handle.fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
#endif
}
/* Sets adjust the TCP quick-ack feature on the connection's socket. The
* connection is tested and if it is null, nothing is done.
*/
static inline void conn_set_quickack(const struct connection *conn, int value)
{
if (!conn || !conn_ctrl_ready(conn))
return;
#ifdef TCP_QUICKACK
setsockopt(conn->handle.fd, IPPROTO_TCP, TCP_QUICKACK, &value, sizeof(value));
#endif
}
/* Attaches a conn_stream to a data layer and sets the relevant callbacks */
static inline void cs_attach(struct conn_stream *cs, void *data, const struct data_cb *data_cb)
{
cs->data_cb = data_cb;
cs->data = data;
}
static inline struct wait_event *wl_set_waitcb(struct wait_event *wl, struct task *(*cb)(struct task *, void *, unsigned short), void *ctx)
{
if (!wl->task->process) {
wl->task->process = cb;
wl->task->context = ctx;
}
return wl;
}
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
/* Installs the connection's mux layer for upper context <ctx>.
* Returns < 0 on error.
*/
static inline int conn_install_mux(struct connection *conn, const struct mux_ops *mux,
void *ctx, struct proxy *prx, struct session *sess)
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
{
conn->mux = mux;
conn->ctx = ctx;
return mux->init ? mux->init(conn, prx, sess) : 0;
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
}
/* returns a human-readable error code for conn->err_code, or NULL if the code
* is unknown.
*/
static inline const char *conn_err_code_str(struct connection *c)
{
switch (c->err_code) {
case CO_ER_NONE: return "Success";
case CO_ER_CONF_FDLIM: return "Reached configured maxconn value";
case CO_ER_PROC_FDLIM: return "Too many sockets on the process";
case CO_ER_SYS_FDLIM: return "Too many sockets on the system";
case CO_ER_SYS_MEMLIM: return "Out of system buffers";
case CO_ER_NOPROTO: return "Protocol or address family not supported";
case CO_ER_SOCK_ERR: return "General socket error";
case CO_ER_PORT_RANGE: return "Source port range exhausted";
case CO_ER_CANT_BIND: return "Can't bind to source address";
case CO_ER_FREE_PORTS: return "Out of local source ports on the system";
case CO_ER_ADDR_INUSE: return "Local source address already in use";
case CO_ER_PRX_EMPTY: return "Connection closed while waiting for PROXY protocol header";
case CO_ER_PRX_ABORT: return "Connection error while waiting for PROXY protocol header";
case CO_ER_PRX_TIMEOUT: return "Timeout while waiting for PROXY protocol header";
case CO_ER_PRX_TRUNCATED: return "Truncated PROXY protocol header received";
case CO_ER_PRX_NOT_HDR: return "Received something which does not look like a PROXY protocol header";
case CO_ER_PRX_BAD_HDR: return "Received an invalid PROXY protocol header";
case CO_ER_PRX_BAD_PROTO: return "Received an unhandled protocol in the PROXY protocol header";
case CO_ER_CIP_EMPTY: return "Connection closed while waiting for NetScaler Client IP header";
case CO_ER_CIP_ABORT: return "Connection error while waiting for NetScaler Client IP header";
case CO_ER_CIP_TRUNCATED: return "Truncated NetScaler Client IP header received";
case CO_ER_CIP_BAD_MAGIC: return "Received an invalid NetScaler Client IP magic number";
case CO_ER_CIP_BAD_PROTO: return "Received an unhandled protocol in the NetScaler Client IP header";
case CO_ER_SSL_EMPTY: return "Connection closed during SSL handshake";
case CO_ER_SSL_ABORT: return "Connection error during SSL handshake";
case CO_ER_SSL_TIMEOUT: return "Timeout during SSL handshake";
case CO_ER_SSL_TOO_MANY: return "Too many SSL connections";
case CO_ER_SSL_NO_MEM: return "Out of memory when initializing an SSL connection";
case CO_ER_SSL_RENEG: return "Rejected a client-initiated SSL renegociation attempt";
case CO_ER_SSL_CA_FAIL: return "SSL client CA chain cannot be verified";
case CO_ER_SSL_CRT_FAIL: return "SSL client certificate not trusted";
case CO_ER_SSL_MISMATCH: return "Server presented an SSL certificate different from the configured one";
case CO_ER_SSL_MISMATCH_SNI: return "Server presented an SSL certificate different from the expected one";
case CO_ER_SSL_HANDSHAKE: return "SSL handshake failure";
case CO_ER_SSL_HANDSHAKE_HB: return "SSL handshake failure after heartbeat";
case CO_ER_SSL_KILLED_HB: return "Stopped a TLSv1 heartbeat attack (CVE-2014-0160)";
case CO_ER_SSL_NO_TARGET: return "Attempt to use SSL on an unknown target (internal error)";
}
return NULL;
}
static inline const char *conn_get_ctrl_name(const struct connection *conn)
{
if (!conn_ctrl_ready(conn))
return "NONE";
return conn->ctrl->name;
}
static inline const char *conn_get_xprt_name(const struct connection *conn)
{
if (!conn_xprt_ready(conn))
return "NONE";
return conn->xprt->name;
}
MEDIUM: connection: start to introduce a mux layer between xprt and data For HTTP/2 and QUIC, we'll need to deal with multiplexed streams inside a connection. After quite a long brainstorming, it appears that the connection interface to the existing streams is appropriate just like the connection interface to the lower layers. In fact we need to have the mux layer in the middle of the connection, between the transport and the data layer. A mux can exist on two directions/sides. On the inbound direction, it instanciates new streams from incoming connections, while on the outbound direction it muxes streams into outgoing connections. The difference is visible on the mux->init() call : in one case, an upper context is already known (outgoing connection), and in the other case, the upper context is not yet known (incoming connection) and will have to be allocated by the mux. The session doesn't have to create the new streams anymore, as this is performed by the mux itself. This patch introduces this and creates a pass-through mux called "mux_pt" which is used for all new connections and which only calls the data layer's recv,send,wake() calls. One incoming stream is immediately created when init() is called on the inbound direction. There should not be any visible impact. Note that the connection's mux is purposely not set until the session is completed so that we don't accidently run with the wrong mux. This must not cause any issue as the xprt_done_cb function is always called prior to using mux's recv/send functions.
2017-08-28 08:53:00 +00:00
static inline const char *conn_get_mux_name(const struct connection *conn)
{
if (!conn->mux)
return "NONE";
return conn->mux->name;
}
static inline const char *cs_get_data_name(const struct conn_stream *cs)
{
if (!cs->data_cb)
return "NONE";
return cs->data_cb->name;
}
/* registers pointer to transport layer <id> (XPRT_*) */
static inline void xprt_register(int id, struct xprt_ops *xprt)
{
if (id >= XPRT_ENTRIES)
return;
registered_xprt[id] = xprt;
}
/* returns pointer to transport layer <id> (XPRT_*) or NULL if not registered */
static inline struct xprt_ops *xprt_get(int id)
{
if (id >= XPRT_ENTRIES)
return NULL;
return registered_xprt[id];
}
static inline int conn_get_alpn(const struct connection *conn, const char **str, int *len)
{
if (!conn_xprt_ready(conn) || !conn->xprt->get_alpn)
return 0;
return conn->xprt->get_alpn(conn, str, len);
}
/* registers proto mux list <list>. Modifies the list element! */
static inline void register_mux_proto(struct mux_proto_list *list)
{
LIST_ADDQ(&mux_proto_list.list, &list->list);
}
/* unregisters proto mux list <list> */
static inline void unregister_mux_proto(struct mux_proto_list *list)
{
LIST_DEL(&list->list);
LIST_INIT(&list->list);
}
static inline struct mux_proto_list *get_mux_proto(const struct ist proto)
{
struct mux_proto_list *item;
list_for_each_entry(item, &mux_proto_list.list, list) {
if (isteq(proto, item->token))
return item;
}
return NULL;
}
/* Lists the known proto mux on <out> */
static inline void list_mux_proto(FILE *out)
{
struct mux_proto_list *item;
struct ist proto;
char *mode, *side;
fprintf(out, "Available multiplexer protocols :\n"
"(protocols marked as <default> cannot be specified using 'proto' keyword)\n");
list_for_each_entry(item, &mux_proto_list.list, list) {
proto = item->token;
if (item->mode == PROTO_MODE_ANY)
mode = "TCP|HTTP";
else if (item->mode == PROTO_MODE_TCP)
mode = "TCP";
else if (item->mode == PROTO_MODE_HTTP)
mode = "HTTP";
else if (item->mode == PROTO_MODE_HTX)
mode = "HTX";
else if (item->mode == (PROTO_MODE_HTTP | PROTO_MODE_HTX))
mode = "HTTP|HTX";
else
mode = "NONE";
if (item->side == PROTO_SIDE_BOTH)
side = "FE|BE";
else if (item->side == PROTO_SIDE_FE)
side = "FE";
else if (item->side == PROTO_SIDE_BE)
side = "BE";
else
side = "NONE";
fprintf(out, " %15s : mode=%-10s side=%s\n",
(proto.len ? proto.ptr : "<default>"), mode, side);
}
}
/* returns the first mux entry in the list matching the exact same <mux_proto>
* and compatible with the <proto_side> (FE or BE) and the <proto_mode> (TCP or
* HTTP). <mux_proto> can be empty. Will fall back to the first compatible mux
* with exactly the same <proto_mode> or with an empty name. May return
* null if the code improperly registered the default mux to use as a fallback.
*/
static inline const struct mux_proto_list *conn_get_best_mux_entry(
const struct ist mux_proto,
int proto_side, int proto_mode)
{
struct mux_proto_list *item;
struct mux_proto_list *fallback = NULL;
list_for_each_entry(item, &mux_proto_list.list, list) {
if (!(item->side & proto_side) || !(item->mode & proto_mode))
continue;
if (istlen(mux_proto) && isteq(mux_proto, item->token))
return item;
else if (!istlen(item->token)) {
if (!fallback || (item->mode == proto_mode && fallback->mode != proto_mode))
fallback = item;
}
}
return fallback;
}
/* returns the first mux in the list matching the exact same <mux_proto> and
* compatible with the <proto_side> (FE or BE) and the <proto_mode> (TCP or
* HTTP). <mux_proto> can be empty. Will fall back to the first compatible mux
* with exactly the same <proto_mode> or with an empty name. May return
* null if the code improperly registered the default mux to use as a fallback.
*/
static inline const struct mux_ops *conn_get_best_mux(struct connection *conn,
const struct ist mux_proto,
int proto_side, int proto_mode)
{
const struct mux_proto_list *item;
item = conn_get_best_mux_entry(mux_proto, proto_side, proto_mode);
return item ? item->mux : NULL;
}
/* returns 0 if the connection is valid and is a frontend connection, otherwise
* returns 1 indicating it's a backend connection. And uninitialized connection
* also returns 1 to better handle the usage in the middle of initialization.
*/
static inline int conn_is_back(const struct connection *conn)
{
return !objt_listener(conn->target);
}
/* returns a pointer to the proxy associated with this connection. For a front
* connection it returns a pointer to the frontend ; for a back connection, it
* returns a pointer to the backend.
*/
static inline struct proxy *conn_get_proxy(const struct connection *conn)
{
struct listener *l;
struct server *s;
/* check if it's a frontend connection */
l = objt_listener(conn->target);
if (l)
return l->bind_conf->frontend;
/* check if it's a backend connection */
s = objt_server(conn->target);
if (s)
return s->proxy;
return objt_proxy(conn->target);
}
/* installs the best mux for incoming connection <conn> using the upper context
* <ctx>. If the mux protocol is forced, we use it to find the best
* mux. Otherwise we use the ALPN name, if any. Returns < 0 on error.
*/
static inline int conn_install_mux_fe(struct connection *conn, void *ctx)
{
struct bind_conf *bind_conf = __objt_listener(conn->target)->bind_conf;
const struct mux_ops *mux_ops;
if (bind_conf->mux_proto)
mux_ops = bind_conf->mux_proto->mux;
else {
struct ist mux_proto;
const char *alpn_str = NULL;
int alpn_len = 0;
int mode;
if (bind_conf->frontend->mode == PR_MODE_TCP)
mode = PROTO_MODE_TCP;
else if (bind_conf->frontend->options2 & PR_O2_USE_HTX)
mode = PROTO_MODE_HTX;
else
mode = PROTO_MODE_HTTP;
conn_get_alpn(conn, &alpn_str, &alpn_len);
mux_proto = ist2(alpn_str, alpn_len);
mux_ops = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_FE, mode);
if (!mux_ops)
return -1;
}
return conn_install_mux(conn, mux_ops, ctx, bind_conf->frontend, conn->owner);
}
/* installs the best mux for outgoing connection <conn> using the upper context
* <ctx>. If the mux protocol is forced, we use it to find the best mux. Returns
* < 0 on error.
*/
static inline int conn_install_mux_be(struct connection *conn, void *ctx, struct session *sess)
{
struct server *srv = objt_server(conn->target);
struct proxy *prx = objt_proxy(conn->target);
const struct mux_ops *mux_ops;
if (srv)
prx = srv->proxy;
if (!prx) // target must be either proxy or server
return -1;
if (srv && srv->mux_proto)
mux_ops = srv->mux_proto->mux;
else {
struct ist mux_proto;
const char *alpn_str = NULL;
int alpn_len = 0;
int mode;
if (prx->mode == PR_MODE_TCP)
mode = PROTO_MODE_TCP;
else if (prx->options2 & PR_O2_USE_HTX)
mode = PROTO_MODE_HTX;
else
mode = PROTO_MODE_HTTP;
conn_get_alpn(conn, &alpn_str, &alpn_len);
mux_proto = ist2(alpn_str, alpn_len);
mux_ops = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_BE, mode);
if (!mux_ops)
return -1;
}
return conn_install_mux(conn, mux_ops, ctx, prx, sess);
}
#endif /* _PROTO_CONNECTION_H */
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/