x86/mpegvideoencdsp: improve ff_pix_sum16_sse2

~15% faster.

Also add an mmxext version that takes advantage of the new code, and
build it alongside the mmx version only on x86_32.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2014-09-30 22:21:40 -03:00
parent f2e53808e3
commit acebff8e5d
2 changed files with 42 additions and 18 deletions

View File

@ -29,16 +29,16 @@ cextern pw_1
SECTION .text SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used ; %1 = number of loops
; %2 = number of loops ; %2 = number of GPRs used
; %3 = number of GPRs used %macro PIX_SUM16 3
%macro PIX_SUM16 4 cglobal pix_sum16, 2, %2, 6
cglobal pix_sum16, 2, %3, %1
movsxdifnidn r1, r1d movsxdifnidn r1, r1d
mov r2, %2 mov r2, %1
%if cpuflag(xop) %if mmsize == 16
lea r3, [r1*3] lea r3, [r1*3]
%else %endif
%if notcpuflag(xop)
pxor m5, m5 pxor m5, m5
%endif %endif
pxor m4, m4 pxor m4, m4
@ -52,42 +52,59 @@ cglobal pix_sum16, 2, %3, %1
mova m0, [r0] mova m0, [r0]
%if mmsize == 8 %if mmsize == 8
mova m1, [r0+8] mova m1, [r0+8]
%else %if cpuflag(mmxext)
mova m1, [r0+r1] mova m2, [r0+r1]
mova m3, [r0+r1+8]
%endif %endif
%else ; sse2
mova m1, [r0+r1]
mova m2, [r0+r1*2]
mova m3, [r0+r3]
%endif
%if cpuflag(mmxext)
psadbw m0, m5
psadbw m1, m5
psadbw m2, m5
psadbw m3, m5
%else ; mmx
punpckhbw m2, m0, m5 punpckhbw m2, m0, m5
punpcklbw m0, m5 punpcklbw m0, m5
punpckhbw m3, m1, m5 punpckhbw m3, m1, m5
punpcklbw m1, m5 punpcklbw m1, m5
%endif ; cpuflag(mmxext)
%endif ; cpuflag(xop) %endif ; cpuflag(xop)
paddw m1, m0 paddw m1, m0
paddw m3, m2 paddw m3, m2
paddw m3, m1 paddw m3, m1
paddw m4, m3 paddw m4, m3
%if mmsize == 8 %if cpuflag(mmxext)
add r0, r1 lea r0, [r0+r1*%3]
%else %else
lea r0, [r0+r1*%4] add r0, r1
%endif %endif
dec r2 dec r2
jne .loop jne .loop
%if cpuflag(xop) %if mmsize == 16
pshufd m0, m4, q0032 pshufd m0, m4, q0032
paddd m4, m0 paddd m4, m0
%else %elif notcpuflag(mmxext)
HADDW m4, m5 HADDW m4, m5
%endif %endif
movd eax, m4 movd eax, m4
RET RET
%endmacro %endmacro
%if ARCH_X86_32
INIT_MMX mmx INIT_MMX mmx
PIX_SUM16 0, 16, 3, 0 PIX_SUM16 16, 3, 0
INIT_MMX mmxext
PIX_SUM16 8, 4, 2
%endif
INIT_XMM sse2 INIT_XMM sse2
PIX_SUM16 6, 8, 3, 2 PIX_SUM16 4, 4, 4
%if HAVE_XOP_EXTERNAL %if HAVE_XOP_EXTERNAL
INIT_XMM xop INIT_XMM xop
PIX_SUM16 5, 4, 4, 4 PIX_SUM16 4, 4, 4
%endif %endif
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)

View File

@ -24,6 +24,7 @@
#include "libavcodec/mpegvideoencdsp.h" #include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size); int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size); int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size); int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
{ {
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) { if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx; c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx; c->pix_norm1 = ff_pix_norm1_mmx;
} }
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmxext;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) { if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2; c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2; c->pix_norm1 = ff_pix_norm1_sse2;