lavr: add x86-optimized function for mixing 2 to 1 s16p with q8 coeffs

This commit is contained in:
Justin Ruggles 2012-04-23 15:10:35 -04:00
parent c140fb2cbc
commit b75726cb79
2 changed files with 46 additions and 0 deletions

View File

@ -109,3 +109,44 @@ INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;
; Mixes two planar int16 channels down to one, in place in the first channel:
;
;     src[0][i] = sat16((src[0][i] * matrix[0][0] +
;                        src[1][i] * matrix[0][1]) >> 8)
;
; The two coefficients are int16 values read from matrix[0][0..1]; the sum of
; products is shifted right arithmetically by 8 and packed back to int16 with
; signed saturation.
;
; Only src, matrix and len are loaded; out_ch and in_ch are never read.
; Each iteration handles mmsize/2 = 8 samples per channel with aligned
; (mova) loads and stores, and the loop body runs at least once.
; NOTE(review): this presumably relies on the caller providing 16-byte
; aligned planes and a len that is a positive multiple of 8 (the C side
; registers the function with "16, 8") - confirm against the caller.
;
; The samples of both channels are interleaved so that a single pmaddwd
; against { c0, c1 } pairs yields a*c0 + b*c1 directly in each dword lane.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,4, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]     ; src1 = src[1]
    mov        srcq, [srcq]             ; src  = src[0] (input 0 and output)
    sub       src1q, srcq               ; keep src[1] as an offset from src[0]
                                        ; so only srcq has to advance
    mov     matrixq, [matrixq]          ; matrixq = matrix[0]
    movd         m3, [matrixq]          ; low dword = { c0, c1 }
    pshufd       m3, m3, 0              ; m3 = { c0, c1 } in all 4 dword lanes
    ALIGN 16
.loop:
    mova         m0, [srcq      ]       ; m0 = a0..a7 (channel 0)
    mova         m2, [srcq+src1q]       ; m2 = b0..b7 (channel 1)
    punpckhwd    m1, m0, m2             ; m1 = a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd    m0, m2                 ; m0 = a0 b0 a1 b1 a2 b2 a3 b3
    pmaddwd      m0, m3                 ; dword i = a_i*c0 + b_i*c1
    pmaddwd      m1, m3
    psrad        m0, 8                  ; drop the 8 fractional bits
    psrad        m1, 8
    packssdw     m0, m1                 ; back to 8 x int16, saturating
    mova     [srcq], m0                 ; overwrite channel 0 with the mix
    add        srcq, mmsize
    sub        lend, mmsize/2           ; 8 samples consumed per iteration
    jg .loop
    REP_RET

View File

@ -32,6 +32,9 @@ extern void ff_mix_2_to_1_s16p_flt_sse2(int16_t **src, float **matrix, int len,
extern void ff_mix_2_to_1_s16p_flt_sse4(int16_t **src, float **matrix, int len,
int out_ch, int in_ch);
/* SSE2 version of the 2-to-1 planar int16 mix with int16 Q8 coefficients;
 * implemented in x86 assembly (cglobal mix_2_to_1_s16p_q8 under
 * INIT_XMM sse2), which reads only src, matrix and len. */
extern void ff_mix_2_to_1_s16p_q8_sse2(int16_t **src, int16_t **matrix,
int len, int out_ch, int in_ch);
av_cold void ff_audio_mix_init_x86(AudioMix *am)
{
#if HAVE_YASM
@ -44,6 +47,8 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_flt_sse2);
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_Q8,
2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_q8_sse2);
}
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,