mirror of
http://git.haproxy.org/git/haproxy.git/
synced 2025-04-28 05:48:01 +00:00
OPTIM: global: move byte counts out of global and make them per-thread
During multiple tests we've already noticed that shared stats counters have become a real bottleneck under large thread counts. With QUIC it's pretty visible, with qc_snd_buf() taking 2.5% of the CPU on a 48-thread machine at only 25 Gbps, and this CPU time is entirely spent in the atomic increments of the byte count and byte rate. It's also visible in H1/H2, though slightly less so since we're working with larger buffers there, hence less frequent updates. These counters are exclusively used to report the byte count in "show info" and the byte rate in the stats. Let's move them to the thread_ctx struct and make the stats reader just collect each thread's stats when requested. That's way more efficient than having all threads compete on a single cache line. After this, qc_snd_buf() has totally disappeared from the perf profile, and tests made with H1 show roughly a 1% performance increase on small objects.
This commit is contained in:
parent
522841c47b
commit
6be8d09a61
@ -198,10 +198,7 @@ struct global {
|
||||
struct freq_ctr ssl_be_keys_per_sec;
|
||||
struct freq_ctr comp_bps_in; /* bytes per second, before http compression */
|
||||
struct freq_ctr comp_bps_out; /* bytes per second, after http compression */
|
||||
struct freq_ctr out_32bps; /* #of 32-byte blocks emitted per second */
|
||||
uint sslconns, totalsslconns; /* active, total # of SSL conns */
|
||||
unsigned long long out_bytes; /* total #of bytes emitted */
|
||||
unsigned long long spliced_out_bytes; /* total #of bytes emitted through a kernel pipe */
|
||||
int cps_lim, cps_max;
|
||||
int sps_lim, sps_max;
|
||||
int ssl_lim, ssl_max;
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <import/ebtree-t.h>
|
||||
|
||||
#include <haproxy/api-t.h>
|
||||
#include <haproxy/freq_ctr-t.h>
|
||||
#include <haproxy/thread-t.h>
|
||||
|
||||
/* tasklet classes */
|
||||
@ -138,6 +139,10 @@ struct thread_ctx {
|
||||
struct eb_root rqueue_shared; /* run queue fed by other threads */
|
||||
__decl_thread(HA_SPINLOCK_T rqsh_lock); /* lock protecting the shared runqueue */
|
||||
|
||||
struct freq_ctr out_32bps; /* #of 32-byte blocks emitted per second */
|
||||
unsigned long long out_bytes; /* total #of bytes emitted */
|
||||
unsigned long long spliced_out_bytes; /* total #of bytes emitted through a kernel pipe */
|
||||
|
||||
ALWAYS_ALIGN(128);
|
||||
};
|
||||
|
||||
|
@ -552,8 +552,8 @@ int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz,
|
||||
* The reason for the latter is that freq_ctr are limited to 4GB and
|
||||
* that it's not enough per second.
|
||||
*/
|
||||
_HA_ATOMIC_ADD(&global.out_bytes, ret);
|
||||
update_freq_ctr(&global.out_32bps, (ret + 16) / 32);
|
||||
_HA_ATOMIC_ADD(&th_ctx->out_bytes, ret);
|
||||
update_freq_ctr(&th_ctx->out_32bps, (ret + 16) / 32);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -152,9 +152,9 @@ int raw_sock_to_pipe(struct connection *conn, void *xprt_ctx, struct pipe *pipe,
|
||||
* blocks. The reason for the latter is that freq_ctr are
|
||||
* limited to 4GB and that it's not enough per second.
|
||||
*/
|
||||
_HA_ATOMIC_ADD(&global.out_bytes, retval);
|
||||
_HA_ATOMIC_ADD(&global.spliced_out_bytes, retval);
|
||||
update_freq_ctr(&global.out_32bps, (retval + 16) / 32);
|
||||
_HA_ATOMIC_ADD(&th_ctx->out_bytes, retval);
|
||||
_HA_ATOMIC_ADD(&th_ctx->spliced_out_bytes, retval);
|
||||
update_freq_ctr(&th_ctx->out_32bps, (retval + 16) / 32);
|
||||
}
|
||||
return retval;
|
||||
|
||||
@ -421,8 +421,8 @@ static size_t raw_sock_from_buf(struct connection *conn, void *xprt_ctx, const s
|
||||
* blocks. The reason for the latter is that freq_ctr are
|
||||
* limited to 4GB and that it's not enough per second.
|
||||
*/
|
||||
_HA_ATOMIC_ADD(&global.out_bytes, done);
|
||||
update_freq_ctr(&global.out_32bps, (done + 16) / 32);
|
||||
_HA_ATOMIC_ADD(&th_ctx->out_bytes, done);
|
||||
update_freq_ctr(&th_ctx->out_32bps, (done + 16) / 32);
|
||||
}
|
||||
return done;
|
||||
}
|
||||
|
23
src/stats.c
23
src/stats.c
@ -3461,7 +3461,11 @@ static void stats_dump_html_info(struct stconn *sc, struct uri_auth *uri)
|
||||
unsigned int up = (now.tv_sec - start_date.tv_sec);
|
||||
char scope_txt[STAT_SCOPE_TXT_MAXLEN + sizeof STAT_SCOPE_PATTERN];
|
||||
const char *scope_ptr = stats_scope_ptr(appctx, sc);
|
||||
unsigned long long bps = (unsigned long long)read_freq_ctr(&global.out_32bps) * 32;
|
||||
unsigned long long bps;
|
||||
int thr;
|
||||
|
||||
for (bps = thr = 0; thr < global.nbthread; thr++)
|
||||
bps += 32ULL * read_freq_ctr(&ha_thread_ctx[thr].out_32bps);
|
||||
|
||||
/* Turn the bytes per second to bits per second and take care of the
|
||||
* usual ethernet overhead in order to help figure how far we are from
|
||||
@ -4505,6 +4509,8 @@ int stats_fill_info(struct field *info, int len, uint flags)
|
||||
{
|
||||
struct timeval up;
|
||||
struct buffer *out = get_trash_chunk();
|
||||
uint64_t glob_out_bytes, glob_spl_bytes, glob_out_b32;
|
||||
int thr;
|
||||
|
||||
#ifdef USE_OPENSSL
|
||||
double ssl_sess_rate = read_freq_ctr_flt(&global.ssl_per_sec);
|
||||
@ -4515,6 +4521,15 @@ int stats_fill_info(struct field *info, int len, uint flags)
|
||||
ssl_reuse = 100.0 * (1.0 - ssl_key_rate / ssl_sess_rate);
|
||||
#endif
|
||||
|
||||
/* sum certain per-thread totals (mostly byte counts) */
|
||||
glob_out_bytes = glob_spl_bytes = glob_out_b32 = 0;
|
||||
for (thr = 0; thr < global.nbthread; thr++) {
|
||||
glob_out_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].out_bytes);
|
||||
glob_spl_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].spliced_out_bytes);
|
||||
glob_out_b32 += read_freq_ctr(&ha_thread_ctx[thr].out_32bps);
|
||||
}
|
||||
glob_out_b32 *= 32; // values are 32-byte units
|
||||
|
||||
tv_remain(&start_date, &now, &up);
|
||||
|
||||
if (len < INF_TOTAL_FIELDS)
|
||||
@ -4601,9 +4616,9 @@ int stats_fill_info(struct field *info, int len, uint flags)
|
||||
info[INF_DROPPED_LOGS] = mkf_u32(0, dropped_logs);
|
||||
info[INF_BUSY_POLLING] = mkf_u32(0, !!(global.tune.options & GTUNE_BUSY_POLLING));
|
||||
info[INF_FAILED_RESOLUTIONS] = mkf_u32(0, resolv_failed_resolutions);
|
||||
info[INF_TOTAL_BYTES_OUT] = mkf_u64(0, global.out_bytes);
|
||||
info[INF_TOTAL_SPLICED_BYTES_OUT] = mkf_u64(0, global.spliced_out_bytes);
|
||||
info[INF_BYTES_OUT_RATE] = mkf_u64(FN_RATE, (unsigned long long)read_freq_ctr(&global.out_32bps) * 32);
|
||||
info[INF_TOTAL_BYTES_OUT] = mkf_u64(0, glob_out_bytes);
|
||||
info[INF_TOTAL_SPLICED_BYTES_OUT] = mkf_u64(0, glob_spl_bytes);
|
||||
info[INF_BYTES_OUT_RATE] = mkf_u64(FN_RATE, glob_out_b32);
|
||||
info[INF_DEBUG_COMMANDS_ISSUED] = mkf_u32(0, debug_commands_issued);
|
||||
info[INF_CUM_LOG_MSGS] = mkf_u32(FN_COUNTER, cum_log_messages);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user