diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index d449efdcca..401a488cb5 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -63,123 +63,119 @@ void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTEL /***********************************/ /* deblocking */ -static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], - int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, - int mask_mv, int dir, const int d_idx, - const uint64_t mask_dir) -{ - x86_reg b_idx; - mask_mv <<= 3; - for( b_idx=0; b_idx= limit - "psubusb %%mm5, %%mm3 \n" - "packsswb %%mm3, %%mm1 \n" - - "por %%mm1, %%mm0 \n" - "movq %a7(%1,%2,4), %%mm1 \n" - "movq %a8(%1,%2,4), %%mm2 \n" - "movq %%mm1, %%mm3 \n" - "movq %%mm2, %%mm4 \n" - "psubw 48(%1,%2,4), %%mm1 \n" - "psubw 56(%1,%2,4), %%mm2 \n" - "psubw 208(%1,%2,4), %%mm3 \n" - "psubw 216(%1,%2,4), %%mm4 \n" - "packsswb %%mm2, %%mm1 \n" - "packsswb %%mm4, %%mm3 \n" - "paddb %%mm6, %%mm1 \n" - "paddb %%mm6, %%mm3 \n" - "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit - "psubusb %%mm5, %%mm3 \n" - "packsswb %%mm3, %%mm1 \n" - - "pshufw $0x4E, %%mm1, %%mm1 \n" - "por %%mm1, %%mm0 \n" - "pshufw $0x4E, %%mm0, %%mm1 \n" - "pminub %%mm1, %%mm0 \n" - ::"r"(ref), - "r"(mv), - "r"(b_idx), - "i"(d_idx+12), - "i"(d_idx+52), - "i"(d_idx*4+48), - "i"(d_idx*4+56), - "i"(d_idx*4+208), - "i"(d_idx*4+216) - ); - } else { - __asm__ volatile( - "movd 12(%0,%2), %%mm0 \n" - "psubb %a3(%0,%2), %%mm0 \n" // ref[b] != ref[bn] - "movq 48(%1,%2,4), %%mm1 \n" - "movq 56(%1,%2,4), %%mm2 \n" - "psubw %a4(%1,%2,4), %%mm1 \n" - "psubw %a5(%1,%2,4), %%mm2 \n" - "packsswb %%mm2, %%mm1 \n" - "paddb %%mm6, %%mm1 \n" - "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit - "packsswb %%mm1, %%mm1 \n" - "por %%mm1, %%mm0 \n" - ::"r"(ref), - "r"(mv), - "r"(b_idx), - "i"(d_idx+12), - "i"(d_idx*4+48), - "i"(d_idx*4+56) - ); - } - } - __asm__ volatile( - "movd 12(%0,%1), %%mm1 \n" - "por %a2(%0,%1), %%mm1 \n" // nnz[b] || nnz[bn] - ::"r"(nnz), - "r"(b_idx), - "i"(d_idx+12) - ); - __asm__ volatile( - "pminub %%mm7, %%mm1 \n" - "pminub %%mm7, %%mm0 \n" - "psllw $1, %%mm1 \n" - "pxor %%mm2, %%mm2 \n" - "pmaxub %%mm0, %%mm1 \n" - "punpcklbw %%mm2, %%mm1 \n" - "movq %%mm1, %a1(%0,%2) \n" - ::"r"(bS), - "i"(32*dir), - "r"(b_idx) - :"memory" - ); - } -} +#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \ + do { \ + x86_reg b_idx; \ + mask_mv <<= 3; \ + for( b_idx=0; b_idx= limit */ \ + "psubusb %%mm5, %%mm3 \n" \ + "packsswb %%mm3, %%mm1 \n" \ + \ + "por %%mm1, %%mm0 \n" \ + "movq %a7(%1,%2,4), %%mm1 \n" \ + "movq %a8(%1,%2,4), %%mm2 \n" \ + "movq %%mm1, %%mm3 \n" \ + "movq %%mm2, %%mm4 \n" \ + "psubw 48(%1,%2,4), %%mm1 \n" \ + "psubw 56(%1,%2,4), %%mm2 \n" \ + "psubw 208(%1,%2,4), %%mm3 \n" \ + "psubw 216(%1,%2,4), %%mm4 \n" \ + "packsswb %%mm2, %%mm1 \n" \ + "packsswb %%mm4, %%mm3 \n" \ + "paddb %%mm6, %%mm1 \n" \ + "paddb %%mm6, %%mm3 \n" \ + "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \ + "psubusb %%mm5, %%mm3 \n" \ + "packsswb %%mm3, %%mm1 \n" \ + \ + "pshufw $0x4E, %%mm1, %%mm1 \n" \ + "por %%mm1, %%mm0 \n" \ + "pshufw $0x4E, %%mm0, %%mm1 \n" \ + "pminub %%mm1, %%mm0 \n" \ + ::"r"(ref), \ + "r"(mv), \ + "r"(b_idx), \ + "i"(d_idx+12), \ + "i"(d_idx+52), \ + "i"(d_idx*4+48), \ + "i"(d_idx*4+56), \ + "i"(d_idx*4+208), \ + "i"(d_idx*4+216) \ + ); \ + } else { \ + __asm__ volatile( \ + "movd 12(%0,%2), %%mm0 \n" \ + "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \ + "movq 48(%1,%2,4), %%mm1 \n" \ + "movq 56(%1,%2,4), %%mm2 \n" \ + "psubw %a4(%1,%2,4), %%mm1 \n" \ + "psubw %a5(%1,%2,4), %%mm2 \n" \ + "packsswb %%mm2, %%mm1 \n" \ + "paddb %%mm6, %%mm1 \n" \ + "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \ + "packsswb %%mm1, %%mm1 \n" \ + "por %%mm1, %%mm0 \n" \ + ::"r"(ref), \ + "r"(mv), \ + "r"(b_idx), \ + "i"(d_idx+12), \ + "i"(d_idx*4+48), \ + "i"(d_idx*4+56) \ + ); \ + } \ + } \ + __asm__ volatile( \ + "movd 12(%0,%1), %%mm1 \n" \ + "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \ + ::"r"(nnz), \ + "r"(b_idx), \ + "i"(d_idx+12) \ + ); \ + __asm__ volatile( \ + "pminub %%mm7, %%mm1 \n" \ + "pminub %%mm7, %%mm0 \n" \ + "psllw $1, %%mm1 \n" \ + "pxor %%mm2, %%mm2 \n" \ + "pmaxub %%mm0, %%mm1 \n" \ + "punpcklbw %%mm2, %%mm1 \n" \ + "movq %%mm1, %a1(%0,%2) \n" \ + ::"r"(bS), \ + "i"(32*dir), \ + "r"(b_idx) \ + :"memory" \ + ); \ + } \ + } while (0) static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {