diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 4d9db9be50..9bfa7300e3 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -85,13 +85,35 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
     } while (len > 0);
 }
 
-av_cold void ff_ac3dsp_init(AC3DSPContext *c)
+/**
+ * C reference implementation of AC3DSPContext.float_to_fixed24.
+ * Scales each input by 2^24 and rounds with lrintf(). The loop is unrolled
+ * by 8 and len is unsigned, so len must be a non-zero multiple of 8.
+ */
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = 1 << 24;
+    do {
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        len -= 8;
+    } while (len > 0);
+}
+
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
 {
     c->ac3_exponent_min = ac3_exponent_min_c;
     c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
     c->ac3_lshift_int16 = ac3_lshift_int16_c;
     c->ac3_rshift_int32 = ac3_rshift_int32_c;
+    c->float_to_fixed24 = float_to_fixed24_c;
 
     if (HAVE_MMX)
-        ff_ac3dsp_init_x86(c);
+        ff_ac3dsp_init_x86(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 31a0af375d..0a2dedf478 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
      *               constraints: range [0,31]
      */
     void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
+
+    /**
+     * Convert an array of float in range [-1.0,1.0] to int32_t with range
+     * [-(1<<24),(1<<24)]
+     *
+     * @param dst destination array of int32_t.
+     *            constraints: 16-byte aligned
+     * @param src source array of float.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 32 greater than zero
+     */
+    void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
 } AC3DSPContext;
 
-void ff_ac3dsp_init    (AC3DSPContext *c);
-void ff_ac3dsp_init_x86(AC3DSPContext *c);
+void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 4c01fe3cbd..5b76ae6735 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
     avctx->coded_frame= avcodec_alloc_frame();
 
     dsputil_init(&s->dsp, avctx);
-    ff_ac3dsp_init(&s->ac3dsp);
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     return 0;
 init_fail:
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 8668b2e033..4b13e4c723 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s)
  */
 static void scale_coefficients(AC3EncodeContext *s)
 {
-    int i;
-    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
-        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
+                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
 }
 
 
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e281791b1e..8b7e826a2d 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -22,6 +22,11 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+SECTION_RODATA
+
+; 16777216.0f - used in ff_float_to_fixed24()
+pf_1_24: times 4 dd 0x4B800000
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -178,3 +183,119 @@ INIT_MMX
 AC3_SHIFT r, 32, psrad, mmx
 INIT_XMM
 AC3_SHIFT r, 32, psrad, sse2
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest.
+INIT_MMX
+cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
+    movq   m0, [pf_1_24]
+.loop:
+    movq   m1, [srcq   ]
+    movq   m2, [srcq+8 ]
+    movq   m3, [srcq+16]
+    movq   m4, [srcq+24]
+    pfmul  m1, m0
+    pfmul  m2, m0
+    pfmul  m3, m0
+    pfmul  m4, m0
+    pf2id  m1, m1
+    pf2id  m2, m2
+    pf2id  m3, m3
+    pf2id  m4, m4
+    movq  [dstq   ], m1
+    movq  [dstq+8 ], m2
+    movq  [dstq+16], m3
+    movq  [dstq+24], m4
+    add  srcq, 32
+    add  dstq, 32
+    sub  lend, 8
+    ja .loop
+    ; clear the MMX state before returning to code that may use the x87 FPU
+    femms
+    RET
+
+INIT_XMM
+cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
+    movaps     m0, [pf_1_24]
+.loop:
+    movaps     m1, [srcq   ]
+    movaps     m2, [srcq+16]
+    mulps      m1, m0
+    mulps      m2, m0
+    cvtps2pi  mm0, m1
+    movhlps    m1, m1
+    cvtps2pi  mm1, m1
+    cvtps2pi  mm2, m2
+    movhlps    m2, m2
+    cvtps2pi  mm3, m2
+    movq  [dstq   ], mm0
+    movq  [dstq+ 8], mm1
+    movq  [dstq+16], mm2
+    movq  [dstq+24], mm3
+    add      srcq, 32
+    add      dstq, 32
+    sub      lend, 8
+    ja .loop
+    ; cvtps2pi wrote to MMX registers, so clear the MMX state before returning
+    emms
+    RET
+
+; len is a 32-bit unsigned int, so the loop counter uses lend: on x86-64 the
+; upper half of the argument register is not guaranteed to be zero.
+INIT_XMM
+cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
+    movaps     m0, [pf_1_24]
+.loop:
+    movaps     m1, [srcq    ]
+    movaps     m2, [srcq+16 ]
+    movaps     m3, [srcq+32 ]
+    movaps     m4, [srcq+48 ]
+%ifdef m8
+    movaps     m5, [srcq+64 ]
+    movaps     m6, [srcq+80 ]
+    movaps     m7, [srcq+96 ]
+    movaps     m8, [srcq+112]
+%endif
+    mulps      m1, m0
+    mulps      m2, m0
+    mulps      m3, m0
+    mulps      m4, m0
+%ifdef m8
+    mulps      m5, m0
+    mulps      m6, m0
+    mulps      m7, m0
+    mulps      m8, m0
+%endif
+    cvtps2dq   m1, m1
+    cvtps2dq   m2, m2
+    cvtps2dq   m3, m3
+    cvtps2dq   m4, m4
+%ifdef m8
+    cvtps2dq   m5, m5
+    cvtps2dq   m6, m6
+    cvtps2dq   m7, m7
+    cvtps2dq   m8, m8
+%endif
+    movdqa  [dstq    ], m1
+    movdqa  [dstq+16 ], m2
+    movdqa  [dstq+32 ], m3
+    movdqa  [dstq+48 ], m4
+%ifdef m8
+    movdqa  [dstq+64 ], m5
+    movdqa  [dstq+80 ], m6
+    movdqa  [dstq+96 ], m7
+    movdqa  [dstq+112], m8
+    add      srcq, 128
+    add      dstq, 128
+    sub      lend, 32
+%else
+    add      srcq, 64
+    add      dstq, 64
+    sub      lend, 16
+%endif
+    ja .loop
+    REP_RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 835b10696d..97d0657aa6 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
 extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
 extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
 
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
 {
     int mm_flags = av_get_cpu_flags();
 
@@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
     }
+    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+        if (!bit_exact) {
+            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+        }
+    }
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+        c->float_to_fixed24 = ff_float_to_fixed24_sse;
+    }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
         if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
             c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
             c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;