diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ad06d485ab..10809bbfb1 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -261,11 +261,10 @@ hadamard8_16_wrapper 0, 14
 %endif
 %endmacro
 
-INIT_MMX mmx
-HADAMARD8_DIFF
-
+%if HAVE_ALIGNED_STACK == 0
 INIT_MMX mmxext
 HADAMARD8_DIFF
+%endif
 
 INIT_XMM sse2
 %if ARCH_X86_64
@@ -385,10 +384,6 @@ cglobal sum_abs_dctelem, 1, 1, %1, block
     RET
 %endmacro
 
-INIT_MMX mmx
-SUM_ABS_DCTELEM 0, 4
-INIT_MMX mmxext
-SUM_ABS_DCTELEM 0, 4
 INIT_XMM sse2
 SUM_ABS_DCTELEM 7, 2
 INIT_XMM ssse3
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 9af911bb88..61e9396b8f 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -30,8 +30,6 @@
 #include "libavcodec/me_cmp.h"
 #include "libavcodec/mpegvideo.h"
 
-int ff_sum_abs_dctelem_mmx(int16_t *block);
-int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
 int ff_sum_abs_dctelem_ssse3(int16_t *block);
 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@@ -85,7 +83,6 @@ int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                     uint8_t *src2, ptrdiff_t stride, int h);
 
-hadamard_func(mmx)
 hadamard_func(mmxext)
 hadamard_func(sse2)
 hadamard_func(ssse3)
@@ -126,232 +123,12 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 
 #if HAVE_INLINE_ASM
 
-static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                            ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    av_assert2(((uintptr_t) pix & 7) == 0);
-    av_assert2((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
-    "movq (%0), %%mm2\n" \
-    "movq 8(%0), %%mm3\n" \
-    "add %2,%0\n" \
-    "movq %%mm2, " #out0 "\n" \
-    "movq %%mm3, " #out1 "\n" \
-    "psubusb " #in0 ", %%mm2\n" \
-    "psubusb " #in1 ", %%mm3\n" \
-    "psubusb " #out0 ", " #in0 "\n" \
-    "psubusb " #out1 ", " #in1 "\n" \
-    "por %%mm2, " #in0 "\n" \
-    "por %%mm3, " #in1 "\n" \
-    "movq " #in0 ", %%mm2\n" \
-    "movq " #in1 ", %%mm3\n" \
-    "punpcklbw %%mm7, " #in0 "\n" \
-    "punpcklbw %%mm7, " #in1 "\n" \
-    "punpckhbw %%mm7, %%mm2\n" \
-    "punpckhbw %%mm7, %%mm3\n" \
-    "paddw " #in1 ", " #in0 "\n" \
-    "paddw %%mm3, %%mm2\n" \
-    "paddw %%mm2, " #in0 "\n" \
-    "paddw " #in0 ", %%mm6\n"
-
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddw %%mm6, %%mm0\n"
-        "movq %%mm0, %%mm6\n"
-        "psrlq $16, %%mm0\n"
-        "paddw %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp & 0xFFFF;
-}
-#undef SUM
-
-static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                      ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    av_assert2(((uintptr_t)pix1 & 7) == 0);
-    av_assert2(((uintptr_t)pix2 & 7) == 0);
-    av_assert2((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1) \
-    "movq (%0), %%mm2\n" \
-    "movq (%1), " #out0 "\n" \
-    "movq 8(%0), %%mm3\n" \
-    "movq 8(%1), " #out1 "\n" \
-    "add %3, %0\n" \
-    "add %3, %1\n" \
-    "psubb " #out0 ", %%mm2\n" \
-    "psubb " #out1 ", %%mm3\n" \
-    "pxor %%mm7, %%mm2\n" \
-    "pxor %%mm7, %%mm3\n" \
-    "movq %%mm2, " #out0 "\n" \
-    "movq %%mm3, " #out1 "\n" \
-    "psubusb " #in0 ", %%mm2\n" \
-    "psubusb " #in1 ", %%mm3\n" \
-    "psubusb " #out0 ", " #in0 "\n" \
-    "psubusb " #out1 ", " #in1 "\n" \
-    "por %%mm2, " #in0 "\n" \
-    "por %%mm3, " #in1 "\n" \
-    "movq " #in0 ", %%mm2\n" \
-    "movq " #in1 ", %%mm3\n" \
-    "punpcklbw %%mm7, " #in0 "\n" \
-    "punpcklbw %%mm7, " #in1 "\n" \
-    "punpckhbw %%mm7, %%mm2\n" \
-    "punpckhbw %%mm7, %%mm3\n" \
-    "paddw " #in1 ", " #in0 "\n" \
-    "paddw %%mm3, %%mm2\n" \
-    "paddw %%mm2, " #in0 "\n" \
-    "paddw " #in0 ", %%mm6\n"
-
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddw %%mm6, %%mm0\n"
-        "movq %%mm0, %%mm6\n"
-        "psrlq $16, %%mm0\n"
-        "paddw %%mm6, %%mm0\n"
-        "movd %%mm0, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp & 0x7FFF;
-}
-#undef SUM
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
     0x0002000200020002ULL,
 };
 
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
-                              ptrdiff_t stride, int h)
-{
-    x86_reg len = -stride * h;
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
-        "add %3, %%"FF_REG_a"           \n\t"
-        "psubusb %%mm0, %%mm2           \n\t"
-        "psubusb %%mm4, %%mm0           \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
-        "psubusb %%mm1, %%mm3           \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "por %%mm2, %%mm0               \n\t"
-        "por %%mm1, %%mm3               \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm3, %%mm2              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm2         \n\t"
-        "paddw %%mm1, %%mm0             \n\t"
-        "paddw %%mm3, %%mm2             \n\t"
-        "paddw %%mm2, %%mm0             \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "add %3, %%"FF_REG_a"           \n\t"
-        " js 1b                         \n\t"
-        : "+a" (len)
-        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
-}
-
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
-                              ptrdiff_t stride, int h)
-{
-    x86_reg len = -stride * h;
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddw %%mm0, %%mm1             \n\t"
-        "paddw %%mm2, %%mm3             \n\t"
-        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
-        "paddw %%mm5, %%mm1             \n\t"
-        "paddw %%mm5, %%mm3             \n\t"
-        "psrlw $1, %%mm1                \n\t"
-        "psrlw $1, %%mm3                \n\t"
-        "packuswb %%mm3, %%mm1          \n\t"
-        "psubusb %%mm1, %%mm4           \n\t"
-        "psubusb %%mm2, %%mm1           \n\t"
-        "por %%mm4, %%mm1               \n\t"
-        "movq %%mm1, %%mm0              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "paddw %%mm1, %%mm0             \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "add %4, %%"FF_REG_a"           \n\t"
-        " js 1b                         \n\t"
-        : "+a" (len)
-        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
-          "r" (stride));
-}
-
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
@@ -421,63 +198,7 @@ static inline int sum_mmx(void)
     return ret & 0xFFFF;
 }
 
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
-                                ptrdiff_t stride, int h)
-{
-    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
-}
-
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
-                                ptrdiff_t stride, int h)
-{
-    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
-}
-
-#define PIX_SAD(suf)                                                    \
-static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
-                        uint8_t *blk1, ptrdiff_t stride, int h)         \
-{                                                                       \
-    av_assert2(h == 8);                                                 \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        :);                                                             \
-                                                                        \
-    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
-static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
-                           uint8_t *blk1, ptrdiff_t stride, int h)      \
-{                                                                       \
-    av_assert2(h == 8);                                                 \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        "movq %0, %%mm5        \n\t"                                    \
-        :: "m" (round_tab[1]));                                         \
-                                                                        \
-    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
-static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
-                           uint8_t *blk1, ptrdiff_t stride, int h)      \
-{                                                                       \
-    av_assert2(h == 8);                                                 \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        "movq %0, %%mm5        \n\t"                                    \
-        :: "m" (round_tab[1]));                                         \
-                                                                        \
-    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
+#define PIX_SADXY(suf)                                                  \
 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 {                                                                       \
@@ -492,50 +213,6 @@ static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
     return sum_ ## suf();                                               \
 }                                                                       \
                                                                         \
-static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
-                         uint8_t *blk1, ptrdiff_t stride, int h)        \
-{                                                                       \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        :);                                                             \
-                                                                        \
-    sad8_1_ ## suf(blk1, blk2, stride, h);                              \
-    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
-static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
-                            uint8_t *blk1, ptrdiff_t stride, int h)     \
-{                                                                       \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        "movq %0, %%mm5        \n\t"                                    \
-        :: "m" (round_tab[1]));                                         \
-                                                                        \
-    sad8_x2a_ ## suf(blk1, blk2, stride, h);                            \
-    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
-static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
-                            uint8_t *blk1, ptrdiff_t stride, int h)     \
-{                                                                       \
-    __asm__ volatile (                                                  \
-        "pxor %%mm7, %%mm7     \n\t"                                    \
-        "pxor %%mm6, %%mm6     \n\t"                                    \
-        "movq %0, %%mm5        \n\t"                                    \
-        :: "m" (round_tab[1]));                                         \
-                                                                        \
-    sad8_y2a_ ## suf(blk1, blk2, stride, h);                            \
-    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
-                                                                        \
-    return sum_ ## suf();                                               \
-}                                                                       \
-                                                                        \
 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                              uint8_t *blk1, ptrdiff_t stride, int h)    \
 {                                                                       \
@@ -550,7 +227,7 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
     return sum_ ## suf();                                               \
 }                                                                       \
 
-PIX_SAD(mmx)
+PIX_SADXY(mmx)
 
 #endif /* HAVE_INLINE_ASM */
 
@@ -560,32 +237,13 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
-        c->pix_abs[0][0] = sad16_mmx;
-        c->pix_abs[0][1] = sad16_x2_mmx;
-        c->pix_abs[0][2] = sad16_y2_mmx;
         c->pix_abs[0][3] = sad16_xy2_mmx;
-        c->pix_abs[1][0] = sad8_mmx;
-        c->pix_abs[1][1] = sad8_x2_mmx;
-        c->pix_abs[1][2] = sad8_y2_mmx;
         c->pix_abs[1][3] = sad8_xy2_mmx;
-
-        c->sad[0] = sad16_mmx;
-        c->sad[1] = sad8_mmx;
-
-        c->vsad[4] = vsad_intra16_mmx;
-
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->vsad[0] = vsad16_mmx;
-        }
     }
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
-        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
-
         c->sse[0] = ff_sse16_mmx;
         c->sse[1] = ff_sse8_mmx;
 #if HAVE_X86ASM
         c->nsse[0] = nsse16_mmx;
@@ -594,9 +252,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
+#if !HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
+#endif
 
        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;
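
Note: after this patch the xy2 (half-pel in both x and y) functions are the
only pix_abs entries the MMX inline asm still registers; everything else is
served by the external MMXEXT/SSE2 assembly, and round_tab stays because the
remaining xy2 code still loads its rounding constant from it. As a reference
for what the retained sad16_xy2_mmx computes, below is a scalar C model; the
name sad16_xy2_ref and the standalone framing are ours (a sketch, not part
of the patch), with the "+ 2" rounding term corresponding to round_tab[2]:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model: SAD between pix1 and the four-point average of pix2
     * with its right, lower and lower-right neighbours, rounded with +2
     * before the shift, as the MMX code does via round_tab[2]. */
    static int sad16_xy2_ref(const uint8_t *pix1, const uint8_t *pix2,
                             ptrdiff_t stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++) {
                int avg = (pix2[x] + pix2[x + 1] + pix2[x + stride] +
                           pix2[x + stride + 1] + 2) >> 2;
                sum += abs(pix1[x] - avg);
            }
            pix1 += stride;
            pix2 += stride;
        }
        return sum;
    }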