7fe0ac7c11

There already is a test to verify the mempool sharding works, in the sense that it uses at least half of the variables available to count the number of allocated objects and their total size. This new test verifies that, with sharding, object counting is at least twice as fast as without sharding. It also collects cacheline contention data with the perf c2c tool. Manual analysis of this data shows that the optimization gain is indeed related to cacheline contention.

Fixes: https://tracker.ceph.com/issues/49896
Signed-off-by: Loïc Dachary <loic@dachary.org>
#!/usr/bin/env bash
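#
# Exercise the mempool sharding optimization with ceph_test_c2c and collect
# cacheline contention data with the perf c2c tool, then verify that object
# counting with sharding is at least twice as fast as without it
# (see https://tracker.ceph.com/issues/49896).
#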

set -ex

function run_perf_c2c() {
    # First get some background system info
    uname -a > uname.out
    lscpu > lscpu.out
    cat /proc/cmdline > cmdline.out
    timeout -s INT 10 vmstat -w 1 > vmstat.out || true
    sudo dmesg >& dmesg.out
    cat /proc/cpuinfo > cpuinfo.out
    ps axo psr,time,stat,ppid,pid,pcpu,comm > ps.1.out
    ps -eafT > ps.2.out
    sudo sysctl -a > sysctl.out
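    # Per-NUMA-node memory info and per-process NUMA maps, as additional
    # background for interpreting the contention data.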
    nodecnt=$(lscpu | grep "NUMA node(" | awk '{print $3}')
    for ((i=0; i<$nodecnt; i++))
    do
        sudo cat /sys/devices/system/node/node${i}/meminfo > meminfo.$i.out
    done
    sudo more $(sudo find /proc -name status) > proc_parent_child_status.out
    sudo more /proc/*/numa_maps > numa_maps.out

    #
    # Get separate kernel and user perf-c2c stats
    #
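    # perf c2c samples memory loads/stores to find contended cachelines
    # (e.g. false sharing). -a records system-wide, --ldlat=70 keeps only
    # loads with a latency of at least 70 cycles, and --all-user/--all-kernel
    # restrict sampling to user or kernel space.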
    sudo perf c2c record -a --ldlat=70 --all-user -o perf_c2c_a_all_user.data sleep 5
    sudo perf c2c report --stdio -i perf_c2c_a_all_user.data > perf_c2c_a_all_user.out 2>&1
    sudo perf c2c report --full-symbols --stdio -i perf_c2c_a_all_user.data > perf_c2c_full-sym_a_all_user.out 2>&1

    sudo perf c2c record --call-graph dwarf -a --ldlat=70 --all-user -o perf_c2c_g_a_all_user.data sleep 5
    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_user.data > perf_c2c_g_a_all_user.out 2>&1

    sudo perf c2c record -a --ldlat=70 --all-kernel -o perf_c2c_a_all_kernel.data sleep 4
    sudo perf c2c report --stdio -i perf_c2c_a_all_kernel.data > perf_c2c_a_all_kernel.out 2>&1

    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_all_kernel.data sleep 4
    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_kernel.data > perf_c2c_g_a_all_kernel.out 2>&1

    #
    # Get combined kernel and user perf-c2c stats
    #
    sudo perf c2c record -a --ldlat=70 -o perf_c2c_a_both.data sleep 4
    sudo perf c2c report --stdio -i perf_c2c_a_both.data > perf_c2c_a_both.out 2>&1

    sudo perf c2c record --call-graph dwarf --ldlat=70 -a -o perf_c2c_g_a_both.data sleep 4
    sudo perf c2c report -g --stdio -i perf_c2c_g_a_both.data > perf_c2c_g_a_both.out 2>&1

    #
    # Get all-user physical addr stats, in case multiple threads or processes
    # are accessing shared memory with different vaddrs.
    #
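    # --phys-data also records the physical address of each sample, so the
    # report can correlate accesses to the same cacheline even when it is
    # mapped at different virtual addresses.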
    sudo perf c2c record --phys-data -a --ldlat=70 --all-user -o perf_c2c_a_all_user_phys_data.data sleep 5
    sudo perf c2c report --stdio -i perf_c2c_a_all_user_phys_data.data > perf_c2c_a_all_user_phys_data.out 2>&1
}
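
# Run ceph_test_c2c in the background with twice as many threads as CPUs,
# give it 30 seconds to warm up, then capture the perf c2c profiles while it
# is still running.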
function run() {
    local dir=$1
    shift
    (
        rm -fr "$dir"
        mkdir "$dir"
        cd "$dir"
        ceph_test_c2c --threads $(($(nproc) * 2)) "$@" &
        sleep 30 # let it warm up
        run_perf_c2c
        kill $! || { echo "ceph_test_c2c WAS NOT RUNNING" ; exit 1 ; }
    ) || exit 1
}
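
# Compare the figure printed by ceph_test_c2c after a 30 second run with and
# without --sharding: the sharded run is expected to report at least twice
# the value of the unsharded one, otherwise the test fails.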
function bench() {
    optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) --sharding 2> /dev/null || true)
    not_optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) 2> /dev/null || true)
    if ! (( $optimized > ( $not_optimized * 2 ) )) ; then
        echo "the optimization is expected to be at least x2 faster"
        exit 1
    fi
}
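
# Collect perf c2c data for both configurations, then verify the expected
# speedup from sharding.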
run with-sharding --sharding
run without-sharding
bench