From a3950a90f64377327197009ce00f1256d81f3408 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer
Date: Sun, 25 May 2014 19:42:18 +0200
Subject: [PATCH] Revert "x86: dsputilenc: convert ff_sse{8, 16}_mmx() to yasm"

This reverts commit ad733089b024e4a3ff8f024d247a032f79a50ac8.

breaks with --disable-yasm
revert requested by: Christophe Gisquet
---
 libavcodec/x86/dsputilenc.asm   |  52 +++---------
 libavcodec/x86/dsputilenc_mmx.c | 141 ++++++++++++++++++++++++++++++--
 2 files changed, 146 insertions(+), 47 deletions(-)

diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index c6f33dcc26..46330fe116 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -274,27 +274,19 @@ INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
 
-; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;               int line_size, int h)
-
-%macro SUM_SQUARED_ERRORS 2
-cglobal sse%1, 5,5,%2, v, pix1, pix2, lsize, h
-%if %1 == mmsize
-    shr       hd, 1
-%endif
+INIT_XMM sse2
+; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   int line_size, int h);
+cglobal sse16, 5, 5, 8
+    shr      r4d, 1
     pxor      m0, m0         ; mm0 = 0
     pxor      m7, m7         ; mm7 holds the sum
 
 .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [pix1q]        ; m1 = pix1[0][0-15], [0-7] for mmx
-    movu      m2, [pix2q]        ; m2 = pix2[0][0-15], [0-7] for mmx
-%if %1 == mmsize
-    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
-    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
-%else ; %1 / 2 == mmsize; mmx only
-    mova      m3, [pix1q+8]      ; m3 = pix1[0][8-15]
-    mova      m4, [pix2q+8]      ; m4 = pix2[0][8-15]
-%endif
+    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
+    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
+    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
+    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
 
     ; todo: mm1-mm2, mm3-mm4
     ; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -323,43 +315,25 @@ cglobal sse%1, 5,5,%2, v, pix1, pix2, lsize, h
     pmaddwd   m1, m1
     pmaddwd   m3, m3
 
+    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
+    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
+
     paddd     m1, m2
     paddd     m3, m4
     paddd     m7, m1
     paddd     m7, m3
 
-%if %1 == mmsize
-    lea       pix1q, [pix1q + 2*lsizeq]
-    lea       pix2q, [pix2q + 2*lsizeq]
-%else
-    add       pix1q, lsizeq
-    add       pix2q, lsizeq
-%endif
-    dec       hd
+    dec       r4
     jnz .next2lines
 
     mova      m1, m7
-%if mmsize == 8
-    psrlq     m7, 32         ; shift hi dword to lo
-%else
     psrldq    m7, 8          ; shift hi qword to lo
     paddd     m7, m1
     mova      m1, m7
     psrldq    m7, 4          ; shift hi dword to lo
-%endif
     paddd     m7, m1
     movd     eax, m7         ; return value
     RET
-%endmacro
-
-INIT_MMX mmx
-SUM_SQUARED_ERRORS 8, 0
-
-INIT_MMX mmx
-SUM_SQUARED_ERRORS 16, 0
-
-INIT_XMM sse2
-SUM_SQUARED_ERRORS 16, 8
 
 INIT_MMX mmx
 ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index a7518eb1c8..e63d510ab9 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -42,13 +42,138 @@ int ff_sum_abs_dctelem_mmx(int16_t *block);
 int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
 int ff_sum_abs_dctelem_ssse3(int16_t *block);
-int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                int line_size, int h);
-int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                 int line_size, int h);
 
 #if HAVE_INLINE_ASM
 
+static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    int line_size, int h)
+{
+    int tmp;
+
+    __asm__ volatile (
+        "movl %4, %%ecx          \n"
+        "shr $1, %%ecx           \n"
+        "pxor %%mm0, %%mm0       \n" /* mm0 = 0 */
+        "pxor %%mm7, %%mm7       \n" /* mm7 holds the sum */
+        "1:                      \n"
+        "movq (%0), %%mm1        \n" /* mm1 = pix1[0][0 - 7] */
+        "movq (%1), %%mm2        \n" /* mm2 = pix2[0][0 - 7] */
+        "movq (%0, %3), %%mm3    \n" /* mm3 = pix1[1][0 - 7] */
+        "movq (%1, %3), %%mm4    \n" /* mm4 = pix2[1][0 - 7] */
+
+        /* todo: mm1-mm2, mm3-mm4 */
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
+        /*       OR the results to get absolute difference */
+        "movq %%mm1, %%mm5       \n"
+        "movq %%mm3, %%mm6       \n"
+        "psubusb %%mm2, %%mm1    \n"
+        "psubusb %%mm4, %%mm3    \n"
+        "psubusb %%mm5, %%mm2    \n"
+        "psubusb %%mm6, %%mm4    \n"
+
+        "por %%mm1, %%mm2        \n"
+        "por %%mm3, %%mm4        \n"
+
+        /* now convert to 16-bit vectors so we can square them */
+        "movq %%mm2, %%mm1       \n"
+        "movq %%mm4, %%mm3       \n"
+
+        "punpckhbw %%mm0, %%mm2  \n"
+        "punpckhbw %%mm0, %%mm4  \n"
+        "punpcklbw %%mm0, %%mm1  \n" /* mm1 now spread over (mm1, mm2) */
+        "punpcklbw %%mm0, %%mm3  \n" /* mm4 now spread over (mm3, mm4) */
+
+        "pmaddwd %%mm2, %%mm2    \n"
+        "pmaddwd %%mm4, %%mm4    \n"
+        "pmaddwd %%mm1, %%mm1    \n"
+        "pmaddwd %%mm3, %%mm3    \n"
+
+        "lea (%0, %3, 2), %0     \n" /* pix1 += 2 * line_size */
+        "lea (%1, %3, 2), %1     \n" /* pix2 += 2 * line_size */
+
+        "paddd %%mm2, %%mm1      \n"
+        "paddd %%mm4, %%mm3      \n"
+        "paddd %%mm1, %%mm7      \n"
+        "paddd %%mm3, %%mm7      \n"
+
+        "decl %%ecx              \n"
+        "jnz 1b                  \n"
+
+        "movq %%mm7, %%mm1       \n"
+        "psrlq $32, %%mm7        \n" /* shift hi dword to lo */
+        "paddd %%mm7, %%mm1      \n"
+        "movd %%mm1, %2          \n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+
+static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     int line_size, int h)
+{
+    int tmp;
+
+    __asm__ volatile (
+        "movl %4, %%ecx\n"
+        "pxor %%mm0, %%mm0\n"      /* mm0 = 0 */
+        "pxor %%mm7, %%mm7\n"      /* mm7 holds the sum */
+        "1:\n"
+        "movq (%0), %%mm1\n"       /* mm1 = pix1[0 - 7] */
+        "movq (%1), %%mm2\n"       /* mm2 = pix2[0 - 7] */
+        "movq 8(%0), %%mm3\n"      /* mm3 = pix1[8 - 15] */
+        "movq 8(%1), %%mm4\n"      /* mm4 = pix2[8 - 15] */
+
+        /* todo: mm1-mm2, mm3-mm4 */
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
+        /*       OR the results to get absolute difference */
+        "movq %%mm1, %%mm5\n"
+        "movq %%mm3, %%mm6\n"
+        "psubusb %%mm2, %%mm1\n"
+        "psubusb %%mm4, %%mm3\n"
+        "psubusb %%mm5, %%mm2\n"
+        "psubusb %%mm6, %%mm4\n"
+
+        "por %%mm1, %%mm2\n"
+        "por %%mm3, %%mm4\n"
+
+        /* now convert to 16-bit vectors so we can square them */
+        "movq %%mm2, %%mm1\n"
+        "movq %%mm4, %%mm3\n"
+
+        "punpckhbw %%mm0, %%mm2\n"
+        "punpckhbw %%mm0, %%mm4\n"
+        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
+        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
+
+        "pmaddwd %%mm2, %%mm2\n"
+        "pmaddwd %%mm4, %%mm4\n"
+        "pmaddwd %%mm1, %%mm1\n"
+        "pmaddwd %%mm3, %%mm3\n"
+
+        "add %3, %0\n"
+        "add %3, %1\n"
+
+        "paddd %%mm2, %%mm1\n"
+        "paddd %%mm4, %%mm3\n"
+        "paddd %%mm1, %%mm7\n"
+        "paddd %%mm3, %%mm7\n"
+
+        "decl %%ecx\n"
+        "jnz 1b\n"
+
+        "movq %%mm7, %%mm1\n"
+        "psrlq $32, %%mm7\n"       /* shift hi dword to lo */
+        "paddd %%mm7, %%mm1\n"
+        "movd %%mm1, %2\n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+
 static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
 {
     int tmp;
@@ -302,7 +427,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
     if (c)
         score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
     else
-        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
+        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
     score2 = hf_noise16_mmx(pix1, line_size, h) -
              hf_noise16_mmx(pix2, line_size, h);
 
@@ -315,7 +440,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
 {
-    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
+    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
     int score2 = hf_noise8_mmx(pix1, line_size, h) -
                  hf_noise8_mmx(pix2, line_size, h);
 
@@ -766,8 +891,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->diff_pixels = ff_diff_pixels_mmx;
         c->pix_sum     = ff_pix_sum16_mmx;
         c->pix_norm1   = ff_pix_norm1_mmx;
-        c->sse[0]      = ff_sse16_mmx;
-        c->sse[1]      = ff_sse8_mmx;
     }
 
     if (EXTERNAL_SSE2(cpu_flags))
@@ -781,6 +904,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
             c->fdct = ff_fdct_mmx;
 
         c->diff_bytes = diff_bytes_mmx;
+        c->sse[0]     = sse16_mmx;
+        c->sse[1]     = sse8_mmx;
         c->vsad[4]    = vsad_intra16_mmx;
 
         c->nsse[0]    = nsse16_mmx;