lavr: x86: optimized 6-channel fltp to s16 conversion

This commit is contained in:
Justin Ruggles 2012-05-02 16:27:11 -04:00
parent a58a013980
commit d5b4e50c47
2 changed files with 129 additions and 0 deletions

View File

@ -581,6 +581,120 @@ CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
; int channels);
;------------------------------------------------------------------------------
%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
movaps xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
mulps m0, m6, [srcq ]
mulps m1, m6, [srcq+src1q]
mulps m2, m6, [srcq+src2q]
mulps m3, m6, [srcq+src3q]
mulps m4, m6, [srcq+src4q]
mulps m5, m6, [srcq+src5q]
cvtps2dq m0, m0
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
cvtps2dq m5, m5
packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
; unpack words:
movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x
punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
; blend dwords:
shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
; shuffle dwords:
shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
mova [dstq+0*mmsize], m3
mova [dstq+1*mmsize], m1
mova [dstq+2*mmsize], m0
%else ; sse
movlps xmm0, [srcq ]
movlps xmm1, [srcq+src1q]
movlps xmm2, [srcq+src2q]
movlps xmm3, [srcq+src3q]
movlps xmm4, [srcq+src4q]
movlps xmm5, [srcq+src5q]
mulps xmm0, xmm6
mulps xmm1, xmm6
mulps xmm2, xmm6
mulps xmm3, xmm6
mulps xmm4, xmm6
mulps xmm5, xmm6
cvtps2pi mm0, xmm0
cvtps2pi mm1, xmm1
cvtps2pi mm2, xmm2
cvtps2pi mm3, xmm3
cvtps2pi mm4, xmm4
cvtps2pi mm5, xmm5
packssdw mm0, mm3 ; m0 = 0, 6, 3, 9
packssdw mm1, mm4 ; m1 = 1, 7, 4, 10
packssdw mm2, mm5 ; m2 = 2, 8, 5, 11
; unpack words
pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6
punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7
punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11
punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9
; unpack dwords
pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1
punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final)
punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final)
punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final)
mova [dstq+0*mmsize], mm0
mova [dstq+1*mmsize], mm1
mova [dstq+2*mmsize], mm2
%endif
add srcq, mmsize
add dstq, mmsize*3
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif
;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
; int channels);

View File

@ -71,6 +71,13 @@ extern void ff_conv_fltp_to_s16_2ch_sse2 (int16_t *dst, float *const *src,
extern void ff_conv_fltp_to_s16_2ch_ssse3(int16_t *dst, float *const *src,
int len, int channels);
extern void ff_conv_fltp_to_s16_6ch_sse (int16_t *dst, float *const *src,
int len, int channels);
extern void ff_conv_fltp_to_s16_6ch_sse2(int16_t *dst, float *const *src,
int len, int channels);
extern void ff_conv_fltp_to_s16_6ch_avx (int16_t *dst, float *const *src,
int len, int channels);
extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len,
int channels);
extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len,
@ -89,12 +96,18 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
}
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP,
6, 1, 2, "SSE", ff_conv_fltp_to_s16_6ch_sse);
}
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S32,
0, 16, 16, "SSE2", ff_conv_s32_to_s16_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P,
6, 16, 8, "SSE2", ff_conv_s16p_to_s16_6ch_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "SSE2", ff_conv_fltp_to_s16_6ch_sse2);
} else {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P,
6, 1, 4, "SSE2SLOW", ff_conv_s16p_to_s16_6ch_sse2slow);
@ -143,6 +156,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
2, 16, 8, "AVX", ff_conv_s16p_to_flt_2ch_avx);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P,
6, 16, 4, "AVX", ff_conv_s16p_to_flt_6ch_avx);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "AVX", ff_conv_fltp_to_s16_6ch_avx);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
}