lavu/riscv: use Zbb REV8 at run-time

This adds runtime support to use Zbb REV8 for 32- and 64-bit byte-wise
swaps. The result is about five times slower than if targeting Zbb
statically, but still a lot faster than the default bespoke C code or a
call to GCC run-time functions.

For 16-bit swap, this is however unsurprisingly a lot worse, and so this
sticks to the baseline. In fact, even using REV8 statically does not
seem to be beneficial in that case.

         Zbb static    Zbb dynamic   I baseline
bswap16:  0.668184765   3.340764069   0.668029012
bswap32:  0.668174014   3.340763319   9.353855435
bswap64:  0.668221765   3.340496313  14.698672283
(seconds for 1 billion iterations on a SiFive-U74 core)
This commit is contained in:
Rémi Denis-Courmont 2024-05-30 22:04:10 +03:00
parent 378d1b06c3
commit 324899b748
1 changed files with 42 additions and 2 deletions

View File

@ -22,11 +22,51 @@
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/riscv/cpu.h"
#if defined (__GNUC__) || defined (__clang__)
/* 16-bit swaps always go through the compiler builtin. */
#define av_bswap16 __builtin_bswap16
/*
 * av_bswap32 is defined further down as av_bswap32_rv, and av_bswap64 as
 * av_bswap64_rv when XLEN >= 64. Defining them here as well would redefine
 * the same macro with a different replacement list, so only the case that
 * is not handled below (64-bit swap with XLEN < 64) keeps the builtin.
 */
#if __riscv_xlen < 64
#define av_bswap64 __builtin_bswap64
#endif
/**
 * Reverse the byte order of a 32-bit value.
 *
 * When HAVE_RV is set and the build does not already target Zbb
 * (__riscv_zbb undefined), a non-constant argument is swapped with the REV8
 * instruction, provided ff_rv_zbb_support() reports it. Every other case
 * goes through __builtin_bswap32().
 *
 * @param x value to byte-swap
 * @return x with its four bytes in reverse order
 */
static av_always_inline av_const uint32_t av_bswap32_rv(uint32_t x)
{
#if HAVE_RV && !defined(__riscv_zbb)
    uintptr_t swapped;

    /* Compile-time constants are left to the builtin, which is also the
     * fallback when ff_rv_zbb_support() returns zero (the unexpected case). */
    if (__builtin_constant_p(x) ||
        !__builtin_expect(ff_rv_zbb_support(), 1))
        return __builtin_bswap32(x);

    /* Zbb is enabled for this single instruction only (push/pop). */
    __asm__ (
        ".option push\n"
        ".option arch, +zbb\n"
        "rev8 %0, %1\n"
        ".option pop" : "=r" (swapped) : "r" (x));

    /* REV8 reverses the whole XLEN-bit register, so the swapped 32 bits land
     * in the most significant bits; shift them back down. */
    return swapped >> (__riscv_xlen - 32);
#else
    return __builtin_bswap32(x);
#endif
}
#define av_bswap32 av_bswap32_rv
#if __riscv_xlen >= 64
/**
 * Reverse the byte order of a 64-bit value.
 *
 * When HAVE_RV is set and the build does not already target Zbb
 * (__riscv_zbb undefined), a non-constant argument is swapped with the REV8
 * instruction, provided ff_rv_zbb_support() reports it. Every other case
 * goes through __builtin_bswap64().
 *
 * @param x value to byte-swap
 * @return x with its eight bytes in reverse order
 */
static av_always_inline av_const uint64_t av_bswap64_rv(uint64_t x)
{
#if HAVE_RV && !defined(__riscv_zbb)
    uintptr_t swapped;

    /* Compile-time constants are left to the builtin, which is also the
     * fallback when ff_rv_zbb_support() returns zero (the unexpected case). */
    if (__builtin_constant_p(x) ||
        !__builtin_expect(ff_rv_zbb_support(), 1))
        return __builtin_bswap64(x);

    /* Zbb is enabled for this single instruction only (push/pop). */
    __asm__ (
        ".option push\n"
        ".option arch, +zbb\n"
        "rev8 %0, %1\n"
        ".option pop" : "=r" (swapped) : "r" (x));

    /* REV8 reverses the whole XLEN-bit register; with XLEN == 64 the shift
     * amount is zero and the result is returned as is. */
    return swapped >> (__riscv_xlen - 64);
#else
    return __builtin_bswap64(x);
#endif
}
#define av_bswap64 av_bswap64_rv
#endif
#endif
#endif /* AVUTIL_RISCV_BSWAP_H */