lavr: add x86-optimized functions for mixing 2 to 1 s16p with float coeffs

This commit is contained in:
Justin Ruggles 2012-04-23 15:04:09 -04:00
parent 8dfc122719
commit c140fb2cbc
3 changed files with 92 additions and 0 deletions

View File

@ -21,6 +21,7 @@
%include "x86inc.asm"
%include "x86util.asm"
%include "util.asm"
SECTION_TEXT
@ -64,3 +65,47 @@ MIX_2_TO_1_FLTP_FLT
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
; int out_ch, int in_ch);
;
; Mixes two planar int16 channels down to one, in place. For each sample i:
;   src[0][i] = sat_s16(to_int32(src[0][i] * matrix[0][0] +
;                                src[1][i] * matrix[0][1]))
; The float->int32 step is cvtps2dq (rounds per the current MXCSR mode) and
; the int32->int16 step is packssdw (signed saturation).
;
; Only src, matrix and len are loaded (cglobal "3,4,6": 3 arguments, 4 GPRs,
; 6 vector registers); out_ch and in_ch are never read by this code.
;
; The loop body runs before len is tested and consumes mmsize/2 samples per
; pass, and all sample accesses use mova (aligned move).
; NOTE(review): this presumably relies on the caller guaranteeing len > 0,
; len a multiple of 8 and 16-byte aligned channel buffers - confirm against
; the alignment values passed to ff_audio_mix_set_func.
;-----------------------------------------------------------------------------
%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
; src1q = src[1], srcq = src[0]; then turn src1q into the byte offset
; (src[1] - src[0]) so that advancing srcq alone walks both channels.
mov src1q, [srcq+gprsize]
mov srcq, [srcq]
sub src1q, srcq
; matrixq = matrix[0]; splat its first two coefficients across m4 and m5.
mov matrixq, [matrixq ]
VBROADCASTSS m4, [matrixq ]
VBROADCASTSS m5, [matrixq+4]
ALIGN 16
.loop:
; Load one vector of int16 samples from each input channel.
mova m0, [srcq ]
mova m2, [srcq+src1q]
; Widen to int32: m0/m1 = low/high halves of channel 0, m2/m3 = channel 1.
; With sse4 the macro ends in SWAP, so the names m0..m3 refer to different
; physical registers from here to the end of the loop body than they do at
; .loop. That is harmless because no value computed in m0..m3 is needed
; across the back-edge, and m4/m5 are never swapped.
S16_TO_S32_SX 0, 1
S16_TO_S32_SX 2, 3
; int32 -> float
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
; Scale channel 0 by matrix[0][0] and channel 1 by matrix[0][1].
mulps m0, m4
mulps m1, m4
mulps m2, m5
mulps m3, m5
; Sum the two scaled channels.
addps m0, m2
addps m1, m3
; float -> int32, then narrow both halves back into one int16 vector with
; signed saturation.
cvtps2dq m0, m0
cvtps2dq m1, m1
packssdw m0, m1
; Store the mixed samples over channel 0.
mova [srcq], m0
; Advance by one vector (mmsize bytes == mmsize/2 int16 samples) and loop
; while samples remain.
add srcq, mmsize
sub lend, mmsize/2
jg .loop
; x86inc's return macro for a ret that is the direct target/fallthrough of a
; branch.
REP_RET
%endmacro
; Emit the kernel once per instruction set. The C side declares the resulting
; symbols as ff_mix_2_to_1_s16p_flt_sse2 and ff_mix_2_to_1_s16p_flt_sse4.
INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
; The cpuflag-dependent part visible in this kernel is S16_TO_S32_SX, which
; selects a pmovsxwd-based widening when sse4 is enabled.
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

View File

@ -27,6 +27,11 @@ extern void ff_mix_2_to_1_fltp_flt_sse(float **src, float **matrix, int len,
extern void ff_mix_2_to_1_fltp_flt_avx(float **src, float **matrix, int len,
int out_ch, int in_ch);
/* SSE2 and SSE4 builds of the 2-to-1 mixer for planar int16 samples with
 * float coefficients. Both are generated in assembly from the
 * MIX_2_TO_1_S16P_FLT macro; the assembly reads only src, matrix and len. */
extern void ff_mix_2_to_1_s16p_flt_sse2(int16_t **src, float **matrix, int len,
int out_ch, int in_ch);
extern void ff_mix_2_to_1_s16p_flt_sse4(int16_t **src, float **matrix, int len,
int out_ch, int in_ch);
av_cold void ff_audio_mix_init_x86(AudioMix *am)
{
#if HAVE_YASM
@ -36,6 +41,14 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
2, 1, 16, 8, "SSE", ff_mix_2_to_1_fltp_flt_sse);
}
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_flt_sse2);
}
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
2, 1, 16, 8, "SSE4", ff_mix_2_to_1_s16p_flt_sse4);
}
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
2, 1, 32, 16, "AVX", ff_mix_2_to_1_fltp_flt_avx);

View File

@ -0,0 +1,34 @@
;******************************************************************************
;* x86 utility macros for libavresample
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
;------------------------------------------------------------------------------
; S16_TO_S32_SX  src/low dst, high dst
;
; Sign-extends the packed int16 words of m%1 to int32.
;   in:  m%1 = packed int16 source
;   out: m%1 = the lower half of the source words as int32
;        m%2 = the upper half of the source words as int32
; Arguments are register numbers, not register names. The byte shift of 8 in
; the sse4 path assumes 16-byte (XMM) registers.
;
; The sse4 path finishes with SWAP, an assemble-time renaming of m%1 and m%2
; that stays in effect for the code following the macro invocation.
;------------------------------------------------------------------------------
%macro S16_TO_S32_SX 2 ; src/low dst, high dst
%if cpuflag(sse4)
; m%2 = sign-extended lower four words of the source.
pmovsxwd m%2, m%1
; Shift the upper four words down to the bottom of m%1 and sign-extend them.
psrldq m%1, 8
pmovsxwd m%1, m%1
; At this point the low half is in m%2 and the high half in m%1; swap the
; names so the results match the documented outputs.
SWAP %1, %2
%else
; Interleave the high words of m%2 (low 16 bits of each dword) with those of
; m%1 (high 16 bits of each dword). m%2's previous contents are read here but
; do not reach the result: they occupy exactly the bits psrad shifts out.
; This must precede punpcklwd, which overwrites m%1.
punpckhwd m%2, m%1
; Duplicate each low source word into both halves of its dword.
punpcklwd m%1, m%1
; Arithmetic shift right by 16 leaves each source word sign-extended to 32 bits.
psrad m%2, 16
psrad m%1, 16
%endif
%endmacro