diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index b644db49be..b3a85a7cb9 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -682,15 +682,27 @@ SECTION .text %endmacro INIT_XMM sse3 +cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride + movaps m0, [inq] + FFT2 m0, m1 + movaps [outq], m0 + add inq, mmsize*1 + add outq, mmsize*1 + ret + cglobal fft2_float, 4, 4, 2, ctx, out, in, stride movaps m0, [inq] FFT2 m0, m1 movaps [outq], m0 RET -%macro FFT4 2 +%macro FFT4_FN 3 INIT_XMM sse2 +%if %3 +cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride +%else cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride +%endif movaps m0, [inq + 0*mmsize] movaps m1, [inq + 1*mmsize] @@ -708,11 +720,19 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride movaps [outq + 0*mmsize], m2 movaps [outq + 1*mmsize], m0 +%if %3 + add inq, mmsize*2 + add outq, mmsize*2 + ret +%else RET +%endif %endmacro -FFT4 fwd, 0 -FFT4 inv, 1 +FFT4_FN fwd, 0, 0 +FFT4_FN fwd, 0, 1 +FFT4_FN inv, 1, 0 +FFT4_FN inv, 1, 1 %macro FFT8_SSE_FN 1 INIT_XMM sse3 diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 25de7b3ec6..06df749fa9 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -45,6 +45,9 @@ TX_DECL_FN(fft_sr_ns, avx2) TX_DECL_FN(mdct_sr_inv, avx2) +TX_DECL_FN(fft2_asm, sse3) +TX_DECL_FN(fft4_fwd_asm, sse2) +TX_DECL_FN(fft4_inv_asm, sse2) TX_DECL_FN(fft8_asm, sse3) TX_DECL_FN(fft8_asm, avx) TX_DECL_FN(fft16_asm, avx) @@ -101,8 +104,14 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0), + TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0), + TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), + TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, + AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0), TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),