mirror of https://git.ffmpeg.org/ffmpeg.git
x86/tx_float: add a standalone 15-point AVX2 transform
Enables its use everywhere else in the framework.
parent 877e575b5d
commit cc1df4045e
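Once the new codelets are registered (see the TX_DEF hunk at the bottom), any caller of the libavutil tx framework picks this path up simply by requesting a 15-point single-precision FFT. A minimal usage sketch against the public libavutil/tx.h API follows; the sketch itself is not part of the commit.

    #include <libavutil/tx.h>

    /* Sketch: forward 15-point float FFT through the public API. On an
     * AVX2-capable x86_64 build this can now be served by the standalone
     * fft15 codelet added below instead of a generic fallback. */
    int run_fft15(AVComplexFloat out[15], AVComplexFloat in[15])
    {
        AVTXContext *ctx = NULL;
        av_tx_fn fn = NULL;
        float scale = 1.0f;
        int ret = av_tx_init(&ctx, &fn, AV_TX_FLOAT_FFT, 0 /* forward */, 15, &scale, 0);
        if (ret < 0)
            return ret;
        fn(ctx, out, in, sizeof(AVComplexFloat)); /* stride in bytes per complex sample */
        av_tx_uninit(&ctx);
        return 0;
    }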
@@ -1515,6 +1515,69 @@ FFT_SPLIT_RADIX_FN avx2, 1
%endif
%endif

%macro FFT15_FN 2
INIT_YMM avx2
cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
    mov lutq, [ctxq + AVTXContext.map]

    imul stride3q, strideq, 3
    imul stride5q, strideq, 5

    movaps m11, [mask_mmppmmmm]     ; mmppmmmm
    movaps m10, [tab_53_float]      ; tab5
    movaps xm9, [tab_53_float + 32] ; tab3
    vpermpd m9, m9, q1110           ; tab[23232323]
    movaps m8, [s15_perm]

%if %1
    movups xm0, [inq]
    movddup xm5, [inq + 16]
    movups m2, [inq + mmsize*0 + 24]
    movups m3, [inq + mmsize*1 + 24]
    movups m4, [inq + mmsize*2 + 24]
%else
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
    mov tmpd, [lutq + 8]
    movddup xm5, [inq + tmpq*8]
%endif

    FFT15

    lea tgt5q, [outq + stride5q]
    lea tmpq, [outq + stride5q*2]

    movhps [outq], xm14              ; out[0]
    movhps [outq + stride5q*1], xm15 ; out[5]
    movlps [outq + stride5q*2], xm15 ; out[10]

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    movlps [outq + strideq*1],  xm1
    movhps [outq + strideq*2],  xm2
    movlps [outq + stride3q*1], xm3
    movhps [outq + strideq*4],  xm4
    movlps [outq + stride3q*2], xm0
    movlps [outq + strideq*8],  xm5
    movhps [outq + stride3q*4], xm0
    movhps [tgt5q + strideq*2], xm1
    movhps [tgt5q + strideq*4], xm3
    movlps [tmpq + strideq*1],  xm2
    movlps [tmpq + stride3q*1], xm4
    movhps [tmpq + strideq*4],  xm5

    RET
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
FFT15_FN 0, float
FFT15_FN 1, ns_float
%endif

%macro IMDCT_FN 1
INIT_YMM %1
cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
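For orientation, the load stage above has two paths: the pre-shuffled _ns variant (%if %1) reads its 15 complex inputs linearly, while the plain variant gathers them through the lookup table that factor_init() builds further down. A conceptual C model of just that gather; the names dst/src are assumptions for illustration, not the actual implementation:

    #include <libavutil/tx.h>

    /* Conceptual model of the two input-load paths in FFT15_FN: either a
     * straight linear read (input already pre-shuffled by the caller) or a
     * gather through the map stored in AVTXContext. */
    static void load_fft15_inputs(AVComplexFloat dst[15], const AVComplexFloat *src,
                                  const int *map, int preshuffled)
    {
        for (int i = 0; i < 15; i++)
            dst[i] = preshuffled ? src[i] : src[map[i]];
    }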
@@ -30,6 +30,8 @@ TX_DECL_FN(fft8, sse3)
TX_DECL_FN(fft8_ns, sse3)
TX_DECL_FN(fft8, avx)
TX_DECL_FN(fft8_ns, avx)
TX_DECL_FN(fft15, avx2)
TX_DECL_FN(fft15_ns, avx2)
TX_DECL_FN(fft16, avx)
TX_DECL_FN(fft16_ns, avx)
TX_DECL_FN(fft16, fma3)
@@ -85,6 +87,53 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
DECL_INIT_FN(8, 0)
DECL_INIT_FN(8, 2)

static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
                               uint64_t flags, FFTXCodeletOptions *opts,
                               int len, int inv, const void *scale)
{
    TX_TAB(ff_tx_init_tabs)(len);

    s->map = av_malloc(len*sizeof(s->map));
    s->map[0] = 0; /* DC is always at the start */
    if (inv) /* Reversing the ACs flips the transform direction */
        for (int i = 1; i < len; i++)
            s->map[i] = len - i;
    else
        for (int i = 1; i < len; i++)
            s->map[i] = i;

    if (len == 15) {
        int cnt = 0, tmp[15];

        /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
        memcpy(tmp, s->map, 15*sizeof(*tmp));
        for (int i = 0; i < 5; i++)
            for (int j = 0; j < 3; j++)
                s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];

        /* Special 15-point assembly permutation */
        memcpy(tmp, s->map, 15*sizeof(*tmp));
        for (int i = 1; i < 15; i += 3) {
            s->map[cnt] = tmp[i];
            cnt++;
        }
        for (int i = 2; i < 15; i += 3) {
            s->map[cnt] = tmp[i];
            cnt++;
        }
        for (int i = 0; i < 15; i += 3) {
            s->map[cnt] = tmp[i];
            cnt++;
        }
        memmove(&s->map[7], &s->map[6], 4*sizeof(int));
        memmove(&s->map[3], &s->map[1], 4*sizeof(int));
        s->map[1] = tmp[2];
        s->map[2] = tmp[0];
    }

    return 0;
}

static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
                              uint64_t flags, FFTXCodeletOptions *opts,
                              int len, int inv, const void *scale)
@@ -229,6 +278,11 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
           AV_CPU_FLAG_AVXSLOW),

#if HAVE_AVX2_EXTERNAL
    TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
           AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW),

    TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2,