mirror of https://git.ffmpeg.org/ffmpeg.git
Revert "x86/tx_float: remove vgatherdpd usage"
This reverts commit 82a68a8771.
Smarter slow-ISA penalties make gathers still useful.
The intention is to use gathers in the final stage of non-power-of-two iMDCTs,
where they provide a benefit.
This commit is contained in:
parent
9ab20b1614
commit
9e94c35941
|
@ -97,7 +97,13 @@ SECTION .text
|
|||
; %4 - LUT offset
|
||||
; %5 - temporary GPR (only used if vgather is not used)
|
||||
; %6 - temporary register (for avx only)
|
||||
%macro LOAD64_LUT 5-6
|
||||
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
|
||||
%macro LOAD64_LUT 5-7
|
||||
%if %0 > 6 && cpuflag(avx2)
|
||||
pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
|
||||
movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
|
||||
vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
|
||||
%else
|
||||
mov %5d, [%3 + %4 + 0]
|
||||
movsd xmm%1, [%2 + %5q*8]
|
||||
%if mmsize == 32
|
||||
|
@ -111,6 +117,7 @@ SECTION .text
|
|||
movhps xmm%6, [%2 + %5q*8]
|
||||
vinsertf128 %1, %1, xmm%6, 1
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
|
||||
|
@ -813,10 +820,10 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
|
|||
movaps m7, [inq + 7*mmsize]
|
||||
%else
|
||||
mov ctxq, [ctxq + AVTXContext.map]
|
||||
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8
|
||||
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9
|
||||
LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10
|
||||
LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11
|
||||
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
|
||||
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
|
||||
LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13
|
||||
LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
FFT8 m4, m5, m6, m7, m8, m9
|
||||
|
@ -827,10 +834,10 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
|
|||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9
|
||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10
|
||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11
|
||||
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9
|
||||
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11
|
||||
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
|
||||
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
movaps m8, [tab_32_float]
|
||||
|
@ -932,10 +939,10 @@ ALIGN 16
|
|||
movaps m6, [inq + 6*mmsize]
|
||||
movaps m7, [inq + 7*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8
|
||||
LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9
|
||||
LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10
|
||||
LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11
|
||||
LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9
|
||||
LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11
|
||||
LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13
|
||||
LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
FFT8 m4, m5, m6, m7, m8, m9
|
||||
|
@ -946,10 +953,10 @@ ALIGN 16
|
|||
movaps m2, [inq + 2*mmsize]
|
||||
movaps m3, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8
|
||||
LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9
|
||||
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10
|
||||
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11
|
||||
LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9
|
||||
LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11
|
||||
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
|
||||
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
|
||||
%endif
|
||||
|
||||
movaps m8, [tab_32_float]
|
||||
|
@ -1006,10 +1013,10 @@ ALIGN 16
|
|||
movaps tx1_o0, [inq + 2*mmsize]
|
||||
movaps tx1_o1, [inq + 3*mmsize]
|
||||
%else
|
||||
LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e
|
||||
LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o
|
||||
LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tmp1
|
||||
LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp2
|
||||
LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2
|
||||
%endif
|
||||
|
||||
FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
|
||||
|
@ -1020,10 +1027,10 @@ ALIGN 16
|
|||
movaps tx2_o0, [inq + 6*mmsize]
|
||||
movaps tx2_o1, [inq + 7*mmsize]
|
||||
%else
|
||||
LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1
|
||||
LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tmp2
|
||||
LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_o
|
||||
LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e
|
||||
LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o
|
||||
LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2
|
||||
LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o
|
||||
%endif
|
||||
|
||||
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
|
||||
|
@ -1280,6 +1287,8 @@ FFT_SPLIT_RADIX_DEF 131072
|
|||
%if ARCH_X86_64
|
||||
FFT_SPLIT_RADIX_FN avx, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx, ns_float, 1
|
||||
FFT_SPLIT_RADIX_FN fma3, float, 0
|
||||
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
FFT_SPLIT_RADIX_FN avx2, float, 0
|
||||
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
|
||||
%endif
|
||||
%endif
|
||||
|
|
|
@ -40,8 +40,8 @@ TX_DECL_FN(fft32, fma3)
|
|||
TX_DECL_FN(fft32_ns, fma3)
|
||||
TX_DECL_FN(fft_sr, avx)
|
||||
TX_DECL_FN(fft_sr_ns, avx)
|
||||
TX_DECL_FN(fft_sr, fma3)
|
||||
TX_DECL_FN(fft_sr_ns, fma3)
|
||||
TX_DECL_FN(fft_sr, avx2)
|
||||
TX_DECL_FN(fft_sr_ns, avx2)
|
||||
|
||||
#define DECL_INIT_FN(basis, interleave) \
|
||||
static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
|
||||
|
@ -83,10 +83,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
|
|||
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||
TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, 0),
|
||||
TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||
#if HAVE_AVX2_EXTERNAL
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 256, b8_i2, avx, AVX, 0, 0),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, 0),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
|
||||
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW),
|
||||
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
|
||||
AV_CPU_FLAG_AVXSLOW),
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NULL,
|
||||
|
|
Loading…
Reference in New Issue