ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext

and use in scale_coefficients() for the floating-point AC-3 encoder.
This commit is contained in:
Justin Ruggles 2011-03-15 22:29:04 -04:00
parent 487fef2dcc
commit 0f999cfddb
6 changed files with 166 additions and 9 deletions

View File

@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
} while (len > 0);
}
av_cold void ff_ac3dsp_init(AC3DSPContext *c)
static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
{
const float scale = 1 << 24;
do {
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
*dst++ = lrintf(*src++ * scale);
len -= 8;
} while (len > 0);
}
/**
 * Initialize an AC3DSPContext.
 *
 * Every function pointer is first set to its C implementation; the context
 * is then passed to ff_ac3dsp_init_x86().
 *
 * @param c         context whose function pointers are assigned
 * @param bit_exact forwarded unchanged to ff_ac3dsp_init_x86()
 */
av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
{
/* C implementations for every entry of the context */
c->ac3_exponent_min = ac3_exponent_min_c;
c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
c->ac3_lshift_int16 = ac3_lshift_int16_c;
c->ac3_rshift_int32 = ac3_rshift_int32_c;
c->float_to_fixed24 = float_to_fixed24_c;
/* NOTE(review): both the one-argument call and the two-argument call are
 * listed below; presumably only the two-argument call is meant to remain
 * under the HAVE_MMX guard -- confirm against the actual file. */
if (HAVE_MMX)
ff_ac3dsp_init_x86(c);
ff_ac3dsp_init_x86(c, bit_exact);
}

View File

@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
* constraints: range [0,31]
*/
void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
/**
* Convert an array of float in range [-1.0,1.0] to int32_t with range
* [-(1<<24),(1<<24)]
*
* @param dst destination array of int32_t.
* constraints: 16-byte aligned
* @param src source array of float.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 32 greater than zero
*/
void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
} AC3DSPContext;
void ff_ac3dsp_init (AC3DSPContext *c);
void ff_ac3dsp_init_x86(AC3DSPContext *c);
void ff_ac3dsp_init (AC3DSPContext *c, int bit_exact);
void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
#endif /* AVCODEC_AC3DSP_H */

View File

@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
avctx->coded_frame= avcodec_alloc_frame();
dsputil_init(&s->dsp, avctx);
ff_ac3dsp_init(&s->ac3dsp);
ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
return 0;
init_fail:

View File

@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s)
*/
static void scale_coefficients(AC3EncodeContext *s)
{
/* NOTE(review): the element-by-element SCALE_FLOAT loop and the
 * float_to_fixed24() call below both fill fixed_coef_buffer from
 * mdct_coef_buffer over the same element count; presumably the loop is the
 * code being replaced and only the DSP call is meant to remain -- confirm. */
int i;
for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
/* Convert AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels coefficients in one
 * call through the function pointer stored in the DSP context. */
s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
}

View File

@ -22,6 +22,11 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
SECTION .text
;-----------------------------------------------------------------------------
@ -178,3 +183,113 @@ INIT_MMX
AC3_SHIFT r, 32, psrad, mmx
INIT_XMM
AC3_SHIFT r, 32, psrad, sse2
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX
; 3DNow! implementation.
; Each iteration loads eight floats (four 8-byte MMX registers), multiplies
; them by the 2^24 constant held in m0, converts them with pf2id and stores
; eight int32 values. len is decremented by 8 per iteration, so it must be a
; non-zero multiple of 8.
; The routine leaves MMX mode with femms before returning so that x87
; floating-point code in the caller is not run with the FPU tag word still
; marked as in use by MMX.
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq       m0, [pf_1_24]        ; two lanes of 16777216.0f
.loop:
    movq       m1, [srcq   ]
    movq       m2, [srcq+8 ]
    movq       m3, [srcq+16]
    movq       m4, [srcq+24]
    pfmul      m1, m0
    pfmul      m2, m0
    pfmul      m3, m0
    pfmul      m4, m0
    pf2id      m1, m1
    pf2id      m2, m2
    pf2id      m3, m3
    pf2id      m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    femms                           ; clear MMX state before returning to C
    RET
INIT_XMM
; SSE implementation.
; Each iteration loads eight floats into two XMM registers, multiplies them
; by the 2^24 constant in m0 and converts them two at a time with cvtps2pi,
; which writes its results to the MMX registers mm0-mm3. movhlps moves the
; upper pair of each XMM register down so it can be converted as well.
; len is decremented by 8 per iteration, so it must be a non-zero multiple
; of 8.
; Because MMX registers are written, emms is executed before returning so
; that x87 floating-point code in the caller is not run with the FPU tag
; word still marked as in use by MMX.
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]        ; four lanes of 16777216.0f
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1               ; low two floats of m1
    movhlps    m1, m1
    cvtps2pi  mm1, m1               ; high two floats of m1
    cvtps2pi  mm2, m2               ; low two floats of m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2               ; high two floats of m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    emms                            ; clear MMX state before returning to C
    RET
INIT_XMM
; SSE2 implementation.
; Floats are multiplied by the 2^24 constant in m0 and converted four at a
; time with cvtps2dq. When m8 is defined (presumably only in builds with
; sixteen XMM registers, i.e. x86-64 -- confirm against x86inc.asm) eight
; registers are used and 32 values are handled per iteration; otherwise four
; registers are used and 16 values are handled per iteration. len must
; therefore be a non-zero multiple of 32 (or of 16 in the smaller variant).
; Aligned loads and stores (movaps/movdqa) are used, so src and dst must be
; 16-byte aligned.
; The loop counter is updated through the 32-bit alias lend because the C
; prototype declares len as unsigned int; on x86-64 the upper half of the
; argument register is not guaranteed to be zero, so a 64-bit subtract and
; compare could keep the loop running past the end of the buffers.
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]        ; four lanes of 16777216.0f
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET

View File

@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int mm_flags = av_get_cpu_flags();
@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
}
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
}
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;