From 3d9217f7ab6423a3317e166c36f5698ff6ee4a78 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 16 Feb 2023 04:41:27 +0100 Subject: [PATCH] btrfs-progs: hash-speedtest: select implementation by features Now put all the recent changes into action. Add a callback that will reinitialize the implementation pointers according to the desired feature. Reference implementations use the NONE CPU flag to distinguish them from the rest. Example results: $ hash-speedtest CPU flags: 0xff CPU features: SSE2 SSSE3 SSE41 SSE42 SHA AVX AVX2 Block size: 4096 Iterations: 1000000 Implementation: builtin Units: CPU cycles NULL-NOP: cycles: 67129026, cycles/i 67 NULL-MEMCPY: cycles: 231303654, cycles/i 231, 60792.500 MiB/s CRC32C-ref: cycles: 23982698042, cycles/i 23982, 586.322 MiB/s CRC32C-NI: cycles: 1168017624, cycles/i 1168, 12038.828 MiB/s XXHASH: cycles: 838434468, cycles/i 838, 16771.152 MiB/s SHA256-ref: cycles: 68296865380, cycles/i 68296, 205.889 MiB/s SHA256-NI: cycles: 29748853920, cycles/i 29748, 472.676 MiB/s BLAKE2-ref: cycles: 14532177414, cycles/i 14532, 967.617 MiB/s BLAKE2-SSE2: cycles: 17762215810, cycles/i 17762, 791.657 MiB/s BLAKE2-SSE41: cycles: 12370044656, cycles/i 12370, 1136.744 MiB/s BLAKE2-AVX2: cycles: 9472823338, cycles/i 9472, 1484.412 MiB/s Previously: Block size: 4096 Iterations: 1000000 Implementation: builtin Units: CPU cycles NULL-NOP: cycles: 67714016, cycles/i 67 NULL-MEMCPY: cycles: 234140818, cycles/i 234, 60055.762 MiB/s CRC32C: cycles: 1187358432, cycles/i 1187, 11842.733 MiB/s XXHASH: cycles: 1897530684, cycles/i 1897, 7410.448 MiB/s SHA256: cycles: 69855340702, cycles/i 69855, 201.296 MiB/s BLAKE2: cycles: 14713130972, cycles/i 14713, 955.716 MiB/s The CPU is an i7-11700 3.60GHz, which is not the one used for the results mentioned in previous changelogs, so those results cannot be compared with these. Otherwise, the updated xxhash implementation is twice as fast; there are no significant changes for the rest. 
Signed-off-by: David Sterba --- crypto/hash-speedtest.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/crypto/hash-speedtest.c b/crypto/hash-speedtest.c index 3657970c..25dad11c 100644 --- a/crypto/hash-speedtest.c +++ b/crypto/hash-speedtest.c @@ -183,21 +183,27 @@ int main(int argc, char **argv) { u64 cycles; u64 time; unsigned long cpu_flag; + void (*init_accel)(void); } contestants[] = { { .name = "NULL-NOP", .digest = hash_null_nop, .digest_size = 32 }, { .name = "NULL-MEMCPY", .digest = hash_null_memcpy, .digest_size = 32 }, - { .name = "CRC32C", .digest = hash_crc32c, .digest_size = 4 }, + { .name = "CRC32C-ref", .digest = hash_crc32c, .digest_size = 4, + .cpu_flag = CPU_FLAG_NONE, .init_accel = hash_init_crc32c }, + { .name = "CRC32C-NI", .digest = hash_crc32c, .digest_size = 4, + .cpu_flag = CPU_FLAG_SSE42, .init_accel = hash_init_crc32c }, { .name = "XXHASH", .digest = hash_xxhash, .digest_size = 8 }, - { .name = "SHA256-ref", .digest = hash_sha256, .digest_size = 32 }, + { .name = "SHA256-ref", .digest = hash_sha256, .digest_size = 32, + .cpu_flag = CPU_FLAG_NONE, .init_accel = hash_init_sha256 }, { .name = "SHA256-NI", .digest = hash_sha256, .digest_size = 32, - .cpu_flag = CPU_FLAG_SHA }, - { .name = "BLAKE2-ref", .digest = hash_blake2b, .digest_size = 32 }, + .cpu_flag = CPU_FLAG_SHA, .init_accel = hash_init_sha256 }, + { .name = "BLAKE2-ref", .digest = hash_blake2b, .digest_size = 32, + .cpu_flag = CPU_FLAG_NONE, .init_accel = hash_init_blake2 }, { .name = "BLAKE2-SSE2", .digest = hash_blake2b, .digest_size = 32, - .cpu_flag = CPU_FLAG_SSE2 }, + .cpu_flag = CPU_FLAG_SSE2, .init_accel = hash_init_blake2 }, { .name = "BLAKE2-SSE41", .digest = hash_blake2b, .digest_size = 32, - .cpu_flag = CPU_FLAG_SSE41 }, + .cpu_flag = CPU_FLAG_SSE41, .init_accel = hash_init_blake2 }, { .name = "BLAKE2-AVX2", .digest = hash_blake2b, .digest_size = 32, - .cpu_flag = CPU_FLAG_AVX2 }, + .cpu_flag = CPU_FLAG_AVX2, .init_accel 
= hash_init_blake2 }, }; int units = UNITS_CYCLES; @@ -270,7 +276,10 @@ int main(int argc, char **argv) { printf("%12s: ", c->name); fflush(stdout); - cpu_set_level(c->cpu_flag); + if (c->cpu_flag) { + cpu_set_level(c->cpu_flag); + c->init_accel(); + } tstart = get_time(); start = get_cycles(units); for (iter = 0; iter < iterations; iter++) {