From f99a5ef92e5aba87a2d861822274147c994041d5 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Thu, 30 Jun 2011 17:48:44 -0400 Subject: [PATCH] ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). --- libavcodec/x86/ac3dsp.asm | 102 ++++++++++++++++++++++++++++++++++++ libavcodec/x86/ac3dsp_mmx.c | 9 ++++ 2 files changed, 111 insertions(+) diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm index 6892ec2765..c1b0906a85 100644 --- a/libavcodec/x86/ac3dsp.asm +++ b/libavcodec/x86/ac3dsp.asm @@ -32,6 +32,11 @@ cextern ac3_bap_bits pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 +; used in ff_ac3_extract_exponents() +pd_1: times 4 dd 1 +pd_151: times 4 dd 151 +pb_shuf_4dwb: db 0, 4, 8, 12 + SECTION .text ;----------------------------------------------------------------------------- @@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum movd eax, m0 add eax, sumd RET + +;------------------------------------------------------------------------------ +; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs) +;------------------------------------------------------------------------------ + +%macro PABSD_MMX 2 ; src/dst, tmp + pxor %2, %2 + pcmpgtd %2, %1 + pxor %1, %2 + psubd %1, %2 +%endmacro + +%macro PABSD_SSSE3 1-2 ; src/dst, unused + pabsd %1, %1 +%endmacro + +%ifdef HAVE_AMD3DNOW +INIT_MMX +cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len + add expq, lenq + lea coefq, [coefq+4*lenq] + neg lenq + movq m3, [pd_1] + movq m4, [pd_151] +.loop: + movq m0, [coefq+4*lenq ] + movq m1, [coefq+4*lenq+8] + PABSD_MMX m0, m2 + PABSD_MMX m1, m2 + pslld m0, 1 + por m0, m3 + pi2fd m2, m0 + psrld m2, 23 + movq m0, m4 + psubd m0, m2 + pslld m1, 1 + por m1, m3 + pi2fd m2, m1 + psrld m2, 23 + movq m1, m4 + psubd m1, m2 + packssdw m0, m0 + packuswb m0, m0 + packssdw m1, m1 + packuswb m1, m1 + punpcklwd m0, m1 + movd [expq+lenq], m0 + add lenq, 4 + jl .loop + REP_RET +%endif + +%macro AC3_EXTRACT_EXPONENTS 1 +cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len + add expq, lenq + lea coefq, [coefq+4*lenq] + neg lenq + mova m2, [pd_1] + mova m3, [pd_151] +%ifidn %1, ssse3 ; + movd m4, [pb_shuf_4dwb] +%endif +.loop: + ; move 4 32-bit coefs to xmm0 + mova m0, [coefq+4*lenq] + ; absolute value + PABSD m0, m1 + ; convert to float and extract exponents + pslld m0, 1 + por m0, m2 + cvtdq2ps m1, m0 + psrld m1, 23 + mova m0, m3 + psubd m0, m1 + ; move the lowest byte in each of 4 dwords to the low dword +%ifidn %1, ssse3 + pshufb m0, m4 +%else + packssdw m0, m0 + packuswb m0, m0 +%endif + movd [expq+lenq], m0 + + add lenq, 4 + jl .loop + REP_RET +%endmacro + +%ifdef HAVE_SSE +INIT_XMM +%define PABSD PABSD_MMX +AC3_EXTRACT_EXPONENTS sse2 +%ifdef HAVE_SSSE3 +%define PABSD PABSD_SSSE3 +AC3_EXTRACT_EXPONENTS ssse3 +%endif +%endif diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c index 2664736bb6..692d240d4c 100644 --- a/libavcodec/x86/ac3dsp_mmx.c +++ b/libavcodec/x86/ac3dsp_mmx.c @@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); +extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); +extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); +extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); + av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) { int mm_flags = av_get_cpu_flags(); @@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; } if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { + c->extract_exponents = ff_ac3_extract_exponents_3dnow; if (!bit_exact) { c->float_to_fixed24 = ff_float_to_fixed24_3dnow; } @@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; c->float_to_fixed24 = ff_float_to_fixed24_sse2; c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; + c->extract_exponents = ff_ac3_extract_exponents_sse2; if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; @@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) } if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) { c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; + if (!(mm_flags & AV_CPU_FLAG_ATOM)) { + c->extract_exponents = ff_ac3_extract_exponents_ssse3; + } } #endif }