diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index 0061829581..6f83555ce5 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -1515,6 +1515,69 @@ FFT_SPLIT_RADIX_FN avx2, 1 %endif %endif +%macro FFT15_FN 2 +INIT_YMM avx2 +cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5 + mov lutq, [ctxq + AVTXContext.map] + + imul stride3q, strideq, 3 + imul stride5q, strideq, 5 + + movaps m11, [mask_mmppmmmm] ; mmppmmmm + movaps m10, [tab_53_float] ; tab5 + movaps xm9, [tab_53_float + 32] ; tab3 + vpermpd m9, m9, q1110 ; tab[23232323] + movaps m8, [s15_perm] + +%if %1 + movups xm0, [inq] + movddup xm5, [inq + 16] + movups m2, [inq + mmsize*0 + 24] + movups m3, [inq + mmsize*1 + 24] + movups m4, [inq + mmsize*2 + 24] +%else + LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 + LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7 + LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 + LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7 + mov tmpd, [lutq + 8] + movddup xm5, [inq + tmpq*8] +%endif + + FFT15 + + lea tgt5q, [outq + stride5q] + lea tmpq, [outq + stride5q*2] + + movhps [outq], xm14 ; out[0] + movhps [outq + stride5q*1], xm15 ; out[5] + movlps [outq + stride5q*2], xm15 ; out[10] + + vextractf128 xm3, m0, 1 + vextractf128 xm4, m1, 1 + vextractf128 xm5, m2, 1 + + movlps [outq + strideq*1], xm1 + movhps [outq + strideq*2], xm2 + movlps [outq + stride3q*1], xm3 + movhps [outq + strideq*4], xm4 + movlps [outq + stride3q*2], xm0 + movlps [outq + strideq*8], xm5 + movhps [outq + stride3q*4], xm0 + movhps [tgt5q + strideq*2], xm1 + movhps [tgt5q + strideq*4], xm3 + movlps [tmpq + strideq*1], xm2 + movlps [tmpq + stride3q*1], xm4 + movhps [tmpq + strideq*4], xm5 + + RET +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +FFT15_FN 0, float +FFT15_FN 1, ns_float +%endif + %macro IMDCT_FN 1 INIT_YMM %1 cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, 
t1, t2, t3, \ diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 8e2babb539..523b43a689 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -30,6 +30,8 @@ TX_DECL_FN(fft8, sse3) TX_DECL_FN(fft8_ns, sse3) TX_DECL_FN(fft8, avx) TX_DECL_FN(fft8_ns, avx) +TX_DECL_FN(fft15, avx2) +TX_DECL_FN(fft15_ns, avx2) TX_DECL_FN(fft16, avx) TX_DECL_FN(fft16_ns, avx) TX_DECL_FN(fft16, fma3) @@ -85,6 +87,53 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ DECL_INIT_FN(8, 0) DECL_INIT_FN(8, 2) +static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + TX_TAB(ff_tx_init_tabs)(len); + + s->map = av_malloc(len*sizeof(*s->map)); + s->map[0] = 0; /* DC is always at the start */ + if (inv) /* Reversing the ACs flips the transform direction */ + for (int i = 1; i < len; i++) + s->map[i] = len - i; + else + for (int i = 1; i < len; i++) + s->map[i] = i; + + if (len == 15) { + int cnt = 0, tmp[15]; + + /* Our 15-point transform is actually a 5x3 PFA, so embed its input map.
*/ + memcpy(tmp, s->map, 15*sizeof(*tmp)); + for (int i = 0; i < 5; i++) + for (int j = 0; j < 3; j++) + s->map[i*3 + j] = tmp[(i*3 + j*5) % 15]; + + /* Special 15-point assembly permutation */ + memcpy(tmp, s->map, 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + for (int i = 2; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + for (int i = 0; i < 15; i += 3) { + s->map[cnt] = tmp[i]; + cnt++; + } + memmove(&s->map[7], &s->map[6], 4*sizeof(int)); + memmove(&s->map[3], &s->map[1], 4*sizeof(int)); + s->map[1] = tmp[2]; + s->map[2] = tmp[0]; + } + + return 0; +} + static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale) @@ -229,6 +278,11 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { AV_CPU_FLAG_AVXSLOW), #if HAVE_AVX2_EXTERNAL + TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2, + AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2, + AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2,