From 81aa0f4604f98da692f2689c84968f90354a92ea Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Thu, 22 May 2014 23:47:06 +0200
Subject: [PATCH] x86: hpeldsp: implement SSSE3 version of _xy2

Loading pb_1 into a register rather than pw_8192 was benchmarked to be
more efficient: loading both yields no advantage, while loading only
one saves ~11 cycles.

decicycles count:
put8:  3223 (mmx)    -> 2387
avg8:  2863 (mmxext) -> 2125
put16: 4356 (sse2)   -> 3553
avg16: 4481 (sse2)   -> 3513

Signed-off-by: Michael Niedermayer
---
 libavcodec/x86/hpeldsp.asm    | 70 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/hpeldsp_init.c | 22 +++++++++++
 2 files changed, 92 insertions(+)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4af423aee5..76e4632cbc 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -30,6 +30,9 @@ SECTION_RODATA
 cextern pb_1
 cextern pw_2
+pw_8192:         times 8 dw (1<<13)
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
 
 SECTION_TEXT
 
@@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
 INIT_XMM sse2
 SET_PIXELS_XY2 put
 SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; 16-pixel (xmm) version, second arg is the xmm register count
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova      m4, [pb_interleave16]
+%else       ; 8-pixel (mmx) version
+cglobal %1_pixels8_xy2, 4,5
+    mova      m4, [pb_interleave8]
+%endif
+    mova      m5, [pb_1]
+    movu      m0, [r1]          ; first row
+    movu      m1, [r1+1]        ; first row, shifted right by one pixel
+    pmaddubsw m0, m5            ; horizontal pair sums: src[2i] + src[2i+1]
+    pmaddubsw m1, m5
+    xor       r4, r4
+    add       r1, r2
+.loop:
+    movu      m2, [r1+r4]
+    movu      m3, [r1+r4+1]
+    pmaddubsw m2, m5
+    pmaddubsw m3, m5
+    paddusw   m0, m2            ; add the pair sums of the row below
+    paddusw   m1, m3
+    pmulhrsw  m0, [pw_8192]     ; (sum + 2) >> 2
+    pmulhrsw  m1, [pw_8192]
+%ifidn %1, avg
+    mova      m6, [r0+r4]
+    packuswb  m0, m1            ; pack even- and odd-column results
+    pshufb    m0, m4            ; interleave back into pixel order
+    pavgb     m0, m6            ; average with the existing destination
+%else
+    packuswb  m0, m1
+    pshufb    m0, m4
+%endif
+    mova [r0+r4], m0
+    add       r4, r2
+
+    movu      m0, [r1+r4]       ; second output row of the unrolled pair
+    movu      m1, [r1+r4+1]
+    pmaddubsw m0, m5
+    pmaddubsw m1, m5
+    paddusw   m2, m0
+    paddusw   m3, m1
+    pmulhrsw  m2, [pw_8192]
+    pmulhrsw  m3, [pw_8192]
+%ifidn %1, avg
+    mova      m6, [r0+r4]
+    packuswb  m2, m3
+    pshufb    m2, m4
+    pavgb     m2, m6
+%else
+    packuswb  m2, m3
+    pshufb    m2, m4
+%endif
+    mova [r0+r4], m2
+    add       r4, r2
+    sub       r3d, 2            ; two output rows per iteration
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index cda16dc722..42e33416eb 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -95,6 +95,15 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h);
 void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
 #define avg_pixels8_mmx ff_avg_pixels8_mmx
 #define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
 #define avg_pixels16_mmx ff_avg_pixels16_mmx
@@ -307,6 +316,16 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -322,4 +341,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 
     if (EXTERNAL_SSE2(cpu_flags))
         hpeldsp_init_sse2(c, flags, cpu_flags);
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags, cpu_flags);
 }
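
For reference, a minimal C sketch of what the new functions compute,
assuming the standard hpeldsp halfpel semantics: each output pixel is
the rounded average of a 2x2 source neighborhood, and the avg variant
then averages with the existing destination the way pavgb does
(rounding up). Helper names are illustrative, not from the tree.

#include <stddef.h>
#include <stdint.h>

/* reference for the 16-pixel put variant: dst = (a + b + c + d + 2) >> 2 */
static void put_pixels16_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int sum = pixels[x]             + pixels[x + 1] +
                      pixels[x + line_size] + pixels[x + line_size + 1];
            block[x] = (sum + 2) >> 2;
        }
        pixels += line_size;
        block  += line_size;
    }
}

/* reference for the avg variant: pavgb rounds the final average up */
static void avg_pixels16_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int sum = pixels[x]             + pixels[x + 1] +
                      pixels[x + line_size] + pixels[x + line_size + 1];
            block[x] = (block[x] + ((sum + 2) >> 2) + 1) >> 1;
        }
        pixels += line_size;
        block  += line_size;
    }
}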
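
The two constants work together as follows: pmaddubsw against pb_1
reduces two adjacent unsigned source bytes to one 16-bit pair sum, and
pmulhrsw against pw_8192 then performs the rounded downshift, since
(x*8192 + (1<<14)) >> 15 == (x + 2) >> 2 for every sum the loop can
produce. A standalone check of that identity (illustrative, not in
the tree):

#include <assert.h>
#include <stdint.h>

/* scalar model of pmulhrsw on signed words */
static int16_t pmulhrsw_s(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
}

int main(void)
{
    /* four-pixel sums produced by pmaddubsw/paddusw lie in [0, 1020] */
    for (int sum = 0; sum <= 4 * 255; sum++)
        assert(pmulhrsw_s((int16_t)sum, 8192) == (sum + 2) >> 2);
    return 0;
}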
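
Since the even-column results end up in one register and the
odd-column results in the other, packuswb followed by pshufb with
pb_interleave16 (0, 8, 1, 9, ...) restores pixel order. A scalar model
of that final stage, again with illustrative names:

#include <stdint.h>

/* packuswb saturates signed words to unsigned bytes */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* even[i] is the result for column 2*i, odd[i] for column 2*i + 1;
 * interleaving models packuswb m0, m1 + pshufb m0, [pb_interleave16] */
static void pack_interleave16(uint8_t dst[16],
                              const int16_t even[8], const int16_t odd[8])
{
    for (int i = 0; i < 8; i++) {
        dst[2 * i]     = clip_u8(even[i]);
        dst[2 * i + 1] = clip_u8(odd[i]);
    }
}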