x86/swr: add ff_resample_{common, linear}_int16_xop

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
James Almer 2014-06-30 13:06:02 -03:00 committed by Michael Niedermayer
parent 1a69224f44
commit c45b7f0d80
2 changed files with 26 additions and 15 deletions

View File

@ -176,8 +176,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
.inner_loop: .inner_loop:
movu m1, [srcq+min_filter_count_x4q*1] movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16 %ifidn %1, int16
pmaddwd m1, [filterq+min_filter_count_x4q*1] PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1
paddd m0, m1
%else ; float/double %else ; float/double
%if cpuflag(fma4) || cpuflag(fma3) %if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
@ -190,14 +189,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
js .inner_loop js .inner_loop
%ifidn %1, int16 %ifidn %1, int16
%if mmsize == 16 HADDD m0, m1
pshufd m1, m0, q0032
paddd m0, m1
pshufd m1, m0, q0001
%else ; mmsize == 8
pshufw m1, m0, q0032
%endif
paddd m0, m1
psrad m0, 15 psrad m0, 15
add fracd, dst_incr_modd add fracd, dst_incr_modd
packssdw m0, m0 packssdw m0, m0
@ -427,10 +419,15 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
.inner_loop: .inner_loop:
movu m1, [srcq+min_filter_count_x4q*1] movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16 %ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1] pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1] pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3 paddd m2, m3
paddd m0, m1 paddd m0, m1
%endif ; cpuflag
%else ; float/double %else ; float/double
%if cpuflag(fma4) || cpuflag(fma3) %if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2 fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
@ -447,18 +444,21 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
%ifidn %1, int16 %ifidn %1, int16
%if mmsize == 16 %if mmsize == 16
%if cpuflag(xop)
vphadddq m2, m2
vphadddq m0, m0
%endif
pshufd m3, m2, q0032 pshufd m3, m2, q0032
pshufd m1, m0, q0032 pshufd m1, m0, q0032
paddd m2, m3 paddd m2, m3
paddd m0, m1 paddd m0, m1
pshufd m3, m2, q0001
pshufd m1, m0, q0001
%else ; mmsize == 8
pshufw m3, m2, q0032
pshufw m1, m0, q0032
%endif %endif
%if notcpuflag(xop)
PSHUFLW m3, m2, q0032
PSHUFLW m1, m0, q0032
paddd m2, m3 paddd m2, m3
paddd m0, m1 paddd m0, m1
%endif
psubd m2, m0 psubd m2, m0
; This is probably a really bad idea on atom and other machines with a ; This is probably a really bad idea on atom and other machines with a
; long transfer latency between GPRs and XMMs (atom). However, it does ; long transfer latency between GPRs and XMMs (atom). However, it does
@ -591,4 +591,10 @@ RESAMPLE_FNS int16, 2, 1
INIT_XMM sse2 INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1 RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1 RESAMPLE_FNS double, 8, 3, d, pdbl_1

View File

@ -35,6 +35,7 @@ int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
RESAMPLE_FUNCS(int16, mmxext); RESAMPLE_FUNCS(int16, mmxext);
RESAMPLE_FUNCS(int16, sse2); RESAMPLE_FUNCS(int16, sse2);
RESAMPLE_FUNCS(int16, xop);
RESAMPLE_FUNCS(float, sse); RESAMPLE_FUNCS(float, sse);
RESAMPLE_FUNCS(float, avx); RESAMPLE_FUNCS(float, avx);
RESAMPLE_FUNCS(float, fma3); RESAMPLE_FUNCS(float, fma3);
@ -73,4 +74,8 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4; c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4; c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
} }
if (HAVE_XOP_EXTERNAL && mm_flags & AV_CPU_FLAG_XOP) {
c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_xop;
c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_xop;
}
} }