# Patch summary (leading text before the first "diff --git" header is skipped
# by patch tools):
#
#   libavcodec/x86/dsputilenc.asm
#     - pix_sum16 and pix_norm1 are wrapped in the macros PIX_SUM16 and
#       PIX_NORM1 (%1 = number of xmm registers used, %2 = number of loops).
#     - Each macro is instantiated twice: under INIT_MMX mmx with (0, 16) and
#       under INIT_XMM sse2 with (6, 8).  When mmsize == 8 an iteration loads
#       [r0] and [r0+8] and advances r0 by r1; otherwise it loads [r0] and
#       [r0+r1] and advances r0 by r1*2.
#     - The hand-written final reductions are replaced by HADDW (pix_sum16)
#       and HADDD (pix_norm1).
#     - A SECTION_RODATA block with "cextern pw_1" is added.
#
#   libavcodec/x86/dsputilenc_mmx.c
#     - Declares ff_pix_sum16_sse2 and ff_pix_norm1_sse2 and assigns them to
#       c->pix_sum and c->pix_norm1 next to the other *_sse2 assignments.
#
#   libavutil/x86/x86util.asm
#     - The shown macro fragment (presumably HADDD; its name lies outside the
#       hunk) keeps PSHUFLW under cpuflag(mmxext) and gains an "%else ; mmx"
#       path that uses mova + psrlq 32.
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy.
# Line breaks were re-derived from the hunk headers (old/new line counts
# match); blank context lines and intra-line spacing are reconstructed and
# should be confirmed against the target files (or applied with
# --ignore-whitespace).
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 6269532a0e..0ff0e605b5 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -23,6 +23,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pw_1
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
@@ -439,73 +443,92 @@ cglobal diff_pixels, 4, 5, 5
     jne .loop
     RET
 
-INIT_MMX mmx
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_SUM16 2
+cglobal pix_sum16, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %2
+    pxor         m5, m5
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%else
+    mova         m1, [r0+r1]
+%endif
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd         eax, m6
+    paddw        m4, m3
+%if mmsize == 8
+    add          r0, r1
+%else
+    lea          r0, [r0+r1*2]
+%endif
+    dec r2
+    jne .loop
+    HADDW        m4, m5
+    movd         eax, m4
     and          eax, 0xffff
     RET
+%endmacro
 
 INIT_MMX mmx
+PIX_SUM16 0, 16
+INIT_XMM sse2
+PIX_SUM16 6, 8
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
     pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd         eax, m1
+    HADDD        m5, m1
+    movd         eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
 ;-----------------------------------------------
 ;int ff_sum_abs_dctelem(int16_t *block)
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 5aae14791b..efe835fac9 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -38,7 +38,9 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                          int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
 int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
@@ -906,6 +908,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->sse[0]          = ff_sse16_sse2;
         c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
         c->diff_pixels     = ff_diff_pixels_sse2;
+        c->pix_sum         = ff_pix_sum16_sse2;
+        c->pix_norm1       = ff_pix_norm1_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 67d7905132..807e87e60e 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -288,7 +288,12 @@
     paddd   %1, %2
 %endif
 %if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
     PSHUFLW %2, %1, q0032
+%else ; mmx
+    mova    %2, %1
+    psrlq   %2, 32
+%endif
     paddd   %1, %2
 %endif
 %undef %1