diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 77d79eaedb..10fb43d9d0 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -1086,7 +1086,7 @@ void ff_put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) void ff_avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) { - avg_pixels8_mmx(dst, src, stride, 8); + ff_avg_pixels8_mmx(dst, src, stride, 8); } void ff_put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) @@ -1096,7 +1096,7 @@ void ff_put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) void ff_avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) { - avg_pixels16_mmx(dst, src, stride, 16); + ff_avg_pixels16_mmx(dst, src, stride, 16); } /* VC-1-specific */ @@ -1134,7 +1134,7 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[ #if HAVE_MMX_INLINE DIRAC_PIXOP(put, ff_put, mmx) -DIRAC_PIXOP(avg, avg, mmx) +DIRAC_PIXOP(avg, ff_avg, mmx) #endif #if HAVE_YASM diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h index a0a4cdb6cf..f2943a0b2a 100644 --- a/libavcodec/x86/dsputil_mmx.h +++ b/libavcodec/x86/dsputil_mmx.h @@ -156,6 +156,10 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_s void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); +void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c index faa1b0335f..9d3f36b6c8 100644 --- a/libavcodec/x86/fpel_mmx.c +++ b/libavcodec/x86/fpel_mmx.c @@ -29,6 +29,51 @@ #if HAVE_MMX_INLINE +// in case more speed is 
needed - unrolling would certainly help +void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + "movq 8%0, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) { diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 392f4bc36c..6a92623f42 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -74,8 +74,11 @@ void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +#define avg_pixels8_mmx ff_avg_pixels8_mmx +#define avg_pixels16_mmx ff_avg_pixels16_mmx #define put_pixels8_mmx ff_put_pixels8_mmx #define put_pixels16_mmx ff_put_pixels16_mmx +#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx #define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx #define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c index 776530910e..35d80edfb6 100644 --- a/libavcodec/x86/rnd_template.c +++ b/libavcodec/x86/rnd_template.c @@ -92,51 +92,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff 
} // avg_pixels -#ifndef NO_RND -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} -#endif /* NO_RND */ - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - // this routine is 'slightly' suboptimal but mostly unused static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) {