From 87d269707bf804c60da1aaf67d8fd91ade3069e3 Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Mon, 8 Jul 2024 18:28:54 +0200
Subject: [PATCH] OPTIM: pool: improve needed_avg cache line access pattern

On an AMD EPYC 3rd gen, 20% of the CPU is spent calculating the amount
of pools needed when using QUIC, because pool allocations/releases are
quite frequent and the inter-CCX communication is super slow. Still,
there's a way to save between 0.5 and 1% CPU by using fetch-add and
sub-fetch, which are converted to XADD, so that the result is directly
fed into the swrate_add argument without having to re-read the memory
area. That's what this patch does.
---
 src/pool.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/pool.c b/src/pool.c
index 1fbbe5079c..7caed7bf6b 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -424,15 +424,17 @@ void *pool_alloc_nocache(struct pool_head *pool, const void *caller)
 {
 	void *ptr = NULL;
 	uint bucket;
+	uint used;
 
 	ptr = pool_get_from_os_noinc(pool);
 	if (!ptr)
 		return NULL;
 
 	bucket = pool_pbucket(ptr);
-	swrate_add_scaled_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used, POOL_AVG_SAMPLES/4);
+
 	_HA_ATOMIC_INC(&pool->buckets[bucket].allocated);
-	_HA_ATOMIC_INC(&pool->buckets[bucket].used);
+	used = _HA_ATOMIC_FETCH_ADD(&pool->buckets[bucket].used, 1);
+	swrate_add_scaled_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, used, POOL_AVG_SAMPLES/4);
 
 	/* keep track of where the element was allocated from */
 	POOL_DEBUG_SET_MARK(pool, ptr);
@@ -447,10 +449,11 @@ void *pool_alloc_nocache(struct pool_head *pool, const void *caller)
 void pool_free_nocache(struct pool_head *pool, void *ptr)
 {
 	uint bucket = pool_pbucket(ptr);
+	uint used;
 
-	_HA_ATOMIC_DEC(&pool->buckets[bucket].used);
+	used = _HA_ATOMIC_SUB_FETCH(&pool->buckets[bucket].used, 1);
 	_HA_ATOMIC_DEC(&pool->buckets[bucket].allocated);
-	swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used);
+	swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, used);
 
 	pool_put_to_os_nodec(pool, ptr);
 }
@@ -519,6 +522,7 @@ static void pool_evict_last_items(struct pool_head *pool, struct pool_cache_head
 	uint cluster = 0;
 	uint to_free_max;
 	uint bucket;
+	uint used;
 
 	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
 
@@ -534,8 +538,8 @@
 		LIST_DELETE(&item->by_lru);
 
 		bucket = pool_pbucket(item);
-		_HA_ATOMIC_DEC(&pool->buckets[bucket].used);
-		swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used);
+		used = _HA_ATOMIC_SUB_FETCH(&pool->buckets[bucket].used, 1);
+		swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, used);
 
 		if (to_free_max > released || cluster) {
 			/* will never match when global pools are disabled */
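
For readers outside the HAProxy tree, here is a minimal standalone sketch of the pattern the patch relies on (illustrative only, not part of the patch). It assumes the GCC/Clang __atomic builtins as stand-ins for HAProxy's _HA_ATOMIC_FETCH_ADD / _HA_ATOMIC_SUB_FETCH macros, and swrate_feed() is a simplified stand-in for the swrate_add*() helpers. The point is only that the atomic's return value (a single XADD on x86) is what feeds the average, so the contended per-bucket counter is never loaded a second time:

/* sketch.c - illustrative only */
static unsigned int used;       /* shared counter, one per bucket in the real code */
static unsigned int needed_avg; /* sliding average fed by the swrate helpers */

/* simplified stand-in for swrate_add_opportunistic(): folds one sample
 * into a sliding weighted average over 'samples' samples.
 */
static void swrate_feed(unsigned int *avg, unsigned int samples, unsigned int v)
{
	*avg += v - *avg / samples;
}

void alloc_side(void)
{
	/* fetch-add returns the previous value in a register, so the
	 * average update does not re-read the shared cache line.
	 */
	unsigned int prev = __atomic_fetch_add(&used, 1, __ATOMIC_RELAXED);
	swrate_feed(&needed_avg, 1024, prev);
}

void free_side(void)
{
	/* sub-fetch returns the new value after the decrement; same idea
	 * on the release path.
	 */
	unsigned int now = __atomic_sub_fetch(&used, 1, __ATOMIC_RELAXED);
	swrate_feed(&needed_avg, 1024, now);
}

Relaxed ordering is sufficient in this sketch since the average is only a statistic; the real swrate_*_opportunistic() helpers additionally avoid insisting on the update when the target cache line is contended.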