From b9fa32082c71013e90eab9e9997967d2939cf4a6 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sun, 13 Jul 2008 15:03:58 +0000 Subject: [PATCH] exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.c | 15 ++++++-- libavcodec/dsputil.h | 8 +++++ libavcodec/fft.c | 3 ++ libavcodec/i386/dsputil_mmx.c | 68 +++++++++++++++++++++++++++-------- libavcodec/i386/fft_3dn2.c | 57 ++++++++++++++++++++++++++--- libavcodec/i386/fft_sse.c | 58 +++++++++++++++++++++++++++--- libavcodec/mdct.c | 57 +++++++++++++++++++++++------ libavcodec/vorbis_dec.c | 20 +++++------ 8 files changed, 241 insertions(+), 45 deletions(-) diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 55929d0e76..212e2415da 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3931,9 +3931,18 @@ void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, } void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ - int i; - for(i=0; ifft_calc = ff_fft_calc_c; s->imdct_calc = ff_imdct_calc; + s->imdct_half = ff_imdct_half; s->exptab1 = NULL; #ifdef HAVE_MMX @@ -67,6 +68,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) if (has_vectors & MM_3DNOWEXT) { /* 3DNowEx for K7/K8 */ s->imdct_calc = ff_imdct_calc_3dn2; + s->imdct_half = ff_imdct_half_3dn2; s->fft_calc = ff_fft_calc_3dn2; } else if (has_vectors & MM_3DNOW) { /* 3DNow! 
for K6-2/3 */ @@ -74,6 +76,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) } else if (has_vectors & MM_SSE) { /* SSE for P3/P4 */ s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; s->fft_calc = ff_fft_calc_sse; } else { shuffle = 0; diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 5ee168b3e3..db8be862dd 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -2022,33 +2022,71 @@ static void vector_fmul_add_add_sse(float *dst, const float *src0, const float * ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); } +static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len){ +#ifdef HAVE_6REGS + if(add_bias == 0){ + x86_reg i = -len*4; + x86_reg j = len*4-8; + asm volatile( + "1: \n" + "pswapd (%5,%1), %%mm1 \n" + "movq (%5,%0), %%mm0 \n" + "pswapd (%4,%1), %%mm5 \n" + "movq (%3,%0), %%mm4 \n" + "movq %%mm0, %%mm2 \n" + "movq %%mm1, %%mm3 \n" + "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] + "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] + "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] + "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] + "pfadd %%mm3, %%mm2 \n" + "pfsub %%mm0, %%mm1 \n" + "pswapd %%mm2, %%mm2 \n" + "movq %%mm1, (%2,%0) \n" + "movq %%mm2, (%2,%1) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + "femms \n" + :"+r"(i), "+r"(j) + :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) + ); + }else +#endif + ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); +} + static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ #ifdef HAVE_6REGS if(add_bias == 0){ - x86_reg i = -len*2; - x86_reg j = len*2-16; + x86_reg i = -len*4; + x86_reg j = len*4-16; asm volatile( "1: \n" - "movaps (%5,%0), %%xmm0 \n" "movaps (%5,%1), %%xmm1 \n" + "movaps (%5,%0), %%xmm0 \n" + "movaps (%4,%1), %%xmm5 \n" + 
"movaps (%3,%0), %%xmm4 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "shufps $0x1b, %%xmm5, %%xmm5 \n" "movaps %%xmm0, %%xmm2 \n" "movaps %%xmm1, %%xmm3 \n" + "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] + "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] + "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] + "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] + "addps %%xmm3, %%xmm2 \n" + "subps %%xmm0, %%xmm1 \n" "shufps $0x1b, %%xmm2, %%xmm2 \n" - "shufps $0x1b, %%xmm3, %%xmm3 \n" - "mulps (%4,%0), %%xmm0 \n" - "mulps (%4,%1), %%xmm1 \n" - "mulps (%3,%0), %%xmm3 \n" - "mulps (%3,%1), %%xmm2 \n" - "addps %%xmm3, %%xmm0 \n" - "addps %%xmm2, %%xmm1 \n" - "movaps %%xmm0, (%2,%0) \n" - "movaps %%xmm1, (%2,%1) \n" + "movaps %%xmm1, (%2,%0) \n" + "movaps %%xmm2, (%2,%1) \n" "sub $16, %1 \n" "add $16, %0 \n" "jl 1b \n" :"+r"(i), "+r"(j) - :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2) + :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) ); }else #endif @@ -2638,8 +2676,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } } - if(mm_flags & MM_3DNOWEXT) + if(mm_flags & MM_3DNOWEXT){ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; + c->vector_fmul_window = vector_fmul_window_3dnow2; + } if(mm_flags & MM_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->vector_fmul = vector_fmul_sse; diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c index 32c4be369b..9068dff24b 100644 --- a/libavcodec/i386/fft_3dn2.c +++ b/libavcodec/i386/fft_3dn2.c @@ -124,10 +124,9 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) asm volatile("femms"); } -void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) +static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp) { - long n8, n4, n2, n; + long n4, n2, n; x86_reg k; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; @@ 
-138,7 +137,6 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, n = 1 << s->nbits; n2 = n >> 1; n4 = n >> 2; - n8 = n >> 3; /* pre rotation */ in1 = input; @@ -182,6 +180,20 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, :"m"(tcos[k]), "m"(tsin[k]) ); } +} + +void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg k; + long n8, n2, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n8 = n >> 3; + + imdct_3dn2(s, input, tmp); k = n-8; asm volatile("movd %0, %%mm7" ::"r"(1<<31)); @@ -212,3 +224,40 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, asm volatile("femms"); } +void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg j, k; + long n8, n4, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n4 = n >> 2; + n8 = n >> 3; + + imdct_3dn2(s, input, tmp); + + j = -n; + k = n-8; + asm volatile("movd %0, %%mm7" ::"r"(1<<31)); + asm volatile( + "1: \n\t" + "movq (%3,%1), %%mm0 \n\t" // z[n8+k] + "pswapd (%3,%0), %%mm1 \n\t" // z[n8-1-k] + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm1, %%mm0 \n\t" + "punpckhdq %%mm2, %%mm1 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "pxor %%mm7, %%mm1 \n\t" + "movq %%mm0, (%2,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im } + "movq %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im } + "sub $8, %1 \n\t" + "add $8, %0 \n\t" + "jl 1b \n\t" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(z+n8) + :"memory" + ); + asm volatile("femms"); +} + diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c index 83cbd87088..305f44a0ce 100644 --- a/libavcodec/i386/fft_sse.c +++ b/libavcodec/i386/fft_sse.c @@ -142,11 +142,10 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) } while (nblocks != 0); } -void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) +static void 
imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp) { x86_reg k; - long n8, n4, n2, n; + long n4, n2, n; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; @@ -156,7 +155,6 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, n = 1 << s->nbits; n2 = n >> 1; n4 = n >> 2; - n8 = n >> 3; #ifdef ARCH_X86_64 asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1)); @@ -260,6 +258,20 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, #endif ); } +} + +void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg k; + long n8, n2, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n8 = n >> 3; + + imdct_sse(s, input, tmp); /* Mnemonics: @@ -301,3 +313,41 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, ); } +void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + x86_reg j, k; + long n8, n4, n; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n4 = n >> 2; + n8 = n >> 3; + + imdct_sse(s, input, tmp); + + j = -n; + k = n-16; + asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); + asm volatile( + "1: \n\t" + "movaps (%3,%1), %%xmm0 \n\t" + "movaps (%3,%0), %%xmm1 \n\t" + "xorps %%xmm7, %%xmm0 \n\t" + "movaps %%xmm0, %%xmm2 \n\t" + "shufps $141,%%xmm1, %%xmm0 \n\t" + "shufps $216,%%xmm1, %%xmm2 \n\t" + "shufps $54, %%xmm0, %%xmm0 \n\t" + "shufps $156,%%xmm2, %%xmm2 \n\t" + "xorps %%xmm7, %%xmm0 \n\t" + "movaps %%xmm2, (%2,%1) \n\t" + "movaps %%xmm0, (%2,%0) \n\t" + "sub $16, %1 \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(z+n8) + :"memory" + ); +} + diff --git a/libavcodec/mdct.c b/libavcodec/mdct.c index 07eef2b3d4..6a3b69a014 100644 --- a/libavcodec/mdct.c +++ b/libavcodec/mdct.c @@ -100,16 +100,9 @@ int ff_mdct_init(MDCTContext *s, int nbits, int inverse) (pim) = _are * _bim + _aim * _bre;\ } 
-/** - * Compute inverse MDCT of size N = 2^nbits - * @param output N samples - * @param input N/2 samples - * @param tmp N/2 samples - */ -void ff_imdct_calc(MDCTContext *s, FFTSample *output, - const FFTSample *input, FFTSample *tmp) +static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp) { - int k, n8, n4, n2, n, j; + int k, n4, n2, n, j; const uint16_t *revtab = s->fft.revtab; const FFTSample *tcos = s->tcos; const FFTSample *tsin = s->tsin; @@ -119,7 +112,6 @@ void ff_imdct_calc(MDCTContext *s, FFTSample *output, n = 1 << s->nbits; n2 = n >> 1; n4 = n >> 2; - n8 = n >> 3; /* pre rotation */ in1 = input; @@ -137,6 +129,25 @@ void ff_imdct_calc(MDCTContext *s, FFTSample *output, for(k = 0; k < n4; k++) { CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); } +} + +/** + * Compute inverse MDCT of size N = 2^nbits + * @param output N samples + * @param input N/2 samples + * @param tmp N/2 samples + */ +void ff_imdct_calc(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + int k, n8, n2, n; + FFTComplex *z = (FFTComplex *)tmp; + n = 1 << s->nbits; + n2 = n >> 1; + n8 = n >> 3; + + imdct_c(s, input, tmp); + for(k = 0; k < n8; k++) { output[2*k] = -z[n8 + k].im; output[n2-1-2*k] = z[n8 + k].im; @@ -152,6 +163,32 @@ void ff_imdct_calc(MDCTContext *s, FFTSample *output, } } +/** + * Compute the middle half of the inverse MDCT of size N = 2^nbits, + * thus excluding the parts that can be derived by symmetry + * @param output N/2 samples + * @param input N/2 samples + * @param tmp N/2 samples + */ +void ff_imdct_half(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + int k, n8, n4, n; + FFTComplex *z = (FFTComplex *)tmp; + n = 1 << s->nbits; + n4 = n >> 2; + n8 = n >> 3; + + imdct_c(s, input, tmp); + + for(k = 0; k < n8; k++) { + output[n4-1-2*k] = z[n8+k].im; + output[n4-1-2*k-1] = -z[n8-k-1].re; + output[n4 + 2*k] = -z[n8+k].re; + output[n4 + 2*k+1] = z[n8-k-1].im; + } +} + /** * 
Compute MDCT of size N = 2^nbits * @param input N samples diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c index ed13229cd4..d4a5402d86 100644 --- a/libavcodec/vorbis_dec.c +++ b/libavcodec/vorbis_dec.c @@ -899,10 +899,10 @@ static int vorbis_parse_id_hdr(vorbis_context *vc){ vc->channel_residues= av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float)); vc->channel_floors = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float)); - vc->saved = av_mallocz((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float)); + vc->saved = av_mallocz((vc->blocksize[1]/4)*vc->audio_channels * sizeof(float)); vc->ret = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float)); - vc->buf = av_malloc( vc->blocksize[1] * sizeof(float)); - vc->buf_tmp = av_malloc( vc->blocksize[1] * sizeof(float)); + vc->buf = av_malloc( vc->blocksize[1]/2 * sizeof(float)); + vc->buf_tmp = av_malloc( vc->blocksize[1]/2 * sizeof(float)); vc->previous_window=0; ff_mdct_init(&vc->mdct[0], bl0, 1); @@ -1520,23 +1520,23 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) { for(j=0;j<vc->audio_channels;++j) { uint_fast16_t bs0=vc->blocksize[0]; uint_fast16_t bs1=vc->blocksize[1]; - float *saved=vc->saved+j*bs1/2; + float *saved=vc->saved+j*bs1/4; float *ret=vc->ret+j*retlen; float *buf=vc->buf; const float *win=vc->win[blockflag&previous_window]; - vc->mdct[0].fft.imdct_calc(&vc->mdct[blockflag], buf, vc->channel_floors+j*blocksize/2, vc->buf_tmp); + vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, vc->channel_floors+j*blocksize/2, vc->buf_tmp); if(blockflag == previous_window) { - vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/2); + vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4); } else if(blockflag > previous_window) { - vc->dsp.vector_fmul_window(ret, saved, buf+(bs1-bs0)/4, win, fadd_bias, bs0/2); - copy_normalize(ret+bs0/2, buf+(bs1+bs0)/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias); + 
vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, bs0/4); + copy_normalize(ret+bs0/2, buf+bs0/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias); } else { copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias); - vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/2); + vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/4); } - memcpy(saved, buf+blocksize/2, blocksize/2*sizeof(float)); + memcpy(saved, buf+blocksize/4, blocksize/4*sizeof(float)); } vc->previous_window = blockflag;