mirror of https://git.ffmpeg.org/ffmpeg.git
x86/tx_float: replace fft_sr_avx with fft_sr_fma3
Adding the SLOW_GATHER flag to the AVX2 version had the side effect that the FMA3 features were no longer enabled on Zen CPUs. Since FMA3 gives a 6-7% improvement on all platforms that support it, this commit, in the interest of saving space, removes the AVX version and replaces it with an FMA3 version. The only CPUs affected are Sandy Bridge and Bulldozer, which support AVX but not FMA3. In the future, if there is demand for it, a duplicate of the function built for AVX can be added.
This commit is contained in:
parent
0938ff9701
commit
27cffd16aa
|
@ -1285,8 +1285,8 @@ FFT_SPLIT_RADIX_DEF 131072
|
|||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
FFT_SPLIT_RADIX_FN avx, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx, ns_float, 1
|
||||
FFT_SPLIT_RADIX_FN fma3, float, 0
|
||||
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
FFT_SPLIT_RADIX_FN avx2, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
|
||||
|
|
|
@ -38,8 +38,8 @@ TX_DECL_FN(fft32, avx)
|
|||
TX_DECL_FN(fft32_ns, avx)
|
||||
TX_DECL_FN(fft32, fma3)
|
||||
TX_DECL_FN(fft32_ns, fma3)
|
||||
TX_DECL_FN(fft_sr, avx)
|
||||
TX_DECL_FN(fft_sr_ns, avx)
|
||||
TX_DECL_FN(fft_sr, fma3)
|
||||
TX_DECL_FN(fft_sr_ns, fma3)
|
||||
TX_DECL_FN(fft_sr, avx2)
|
||||
TX_DECL_FN(fft_sr_ns, avx2)
|
||||
|
||||
|
@ -88,13 +88,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
|
|||
TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
|
||||
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
AV_CPU_FLAG_AVXSLOW),
|
||||
#if HAVE_AVX2_EXTERNAL
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
AV_CPU_FLAG_AVXSLOW),
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0,
|
||||
#if HAVE_AVX2_EXTERNAL
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
|
||||
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
|
||||
#endif
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue