From f21899db7dae114e4519c0d14dd047efe022e16b Mon Sep 17 00:00:00 2001 From: Lynne Date: Sat, 24 Sep 2022 03:51:48 +0200 Subject: [PATCH] x86/tx_float: enable AVX-only split-radix FFT codelets Sandy Bridge, Ivy Bridge and Bulldozer cores don't support FMA3. --- libavutil/x86/tx_float.asm | 2 ++ libavutil/x86/tx_float_init.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index 0fbab99e45..5ed0007530 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -1379,6 +1379,8 @@ cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab %endmacro %if ARCH_X86_64 +FFT_SPLIT_RADIX_FN avx, 0 +FFT_SPLIT_RADIX_FN avx, 1 FFT_SPLIT_RADIX_FN fma3, 0 FFT_SPLIT_RADIX_FN fma3, 1 %if HAVE_AVX2_EXTERNAL diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 20c1ad6869..8e2babb539 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -38,6 +38,8 @@ TX_DECL_FN(fft32, avx) TX_DECL_FN(fft32_ns, avx) TX_DECL_FN(fft32, fma3) TX_DECL_FN(fft32_ns, fma3) +TX_DECL_FN(fft_sr, avx) +TX_DECL_FN(fft_sr_ns, avx) TX_DECL_FN(fft_sr, fma3) TX_DECL_FN(fft_sr_ns, fma3) TX_DECL_FN(fft_sr, avx2) @@ -57,6 +59,7 @@ TX_DECL_FN(fft16_asm, avx) TX_DECL_FN(fft16_asm, fma3) TX_DECL_FN(fft32_asm, avx) TX_DECL_FN(fft32_asm, fma3) +TX_DECL_FN(fft_sr_asm, avx) TX_DECL_FN(fft_sr_asm, fma3) TX_DECL_FN(fft_sr_asm, avx2) @@ -214,6 +217,11 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + AV_CPU_FLAG_AVXSLOW), TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),