From 4b81511cab1d53f7f189bcb09aac4303b20a4ce8 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 29 Sep 2010 13:34:20 +0000 Subject: [PATCH] Unloop the outer loop in h264_loop_filter_strength_mmx2(), which allows inlining various constants within the loop code. 20 cycles faster on cathedral sample. Originally committed as revision 25252 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/x86/h264dsp_mmx.c | 54 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 249675f391..4df3f12154 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -63,29 +63,12 @@ void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTEL /***********************************/ /* deblocking */ -static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { - int dir; - __asm__ volatile( - "movq %0, %%mm7 \n" - "movq %1, %%mm6 \n" - ::"m"(ff_pb_1), "m"(ff_pb_3) - ); - if(field) - __asm__ volatile( - "movq %0, %%mm6 \n" - ::"m"(ff_pb_3_1) - ); - __asm__ volatile( - "movq %%mm6, %%mm5 \n" - "paddb %%mm5, %%mm5 \n" - :); - - // could do a special case for dir==0 && edges==1, but it only reduces the - // average filter time by 1.2% - for( dir=1; dir>=0; dir-- ) { +static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], + int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, + int mask_mv, int dir) +{ const x86_reg d_idx = dir ? -8 : -1; - const int mask_mv = dir ? mask_mv1 : mask_mv0; DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; int b_idx, edge; for( b_idx=12, edge=0; edge