x86: hpeldsp: implement SSSE3 version of _xy2

Preloading pb_1 into a register, rather than pw_8192, was benchmarked to be
more efficient. Preloading both yields no advantage; preloading one of them
saves ~11 cycles.

decicycles count:
put8:  3223(mmx)    -> 2387
avg8:  2863(mmxext) -> 2125
put16: 4356(sse2)   -> 3553
avg16: 4481(sse2)   -> 3513

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Christophe Gisquet 2014-05-22 23:47:06 +02:00 committed by Michael Niedermayer
parent 726316240b
commit 81aa0f4604
2 changed files with 92 additions and 0 deletions

View File

@ -30,6 +30,9 @@
SECTION_RODATA
cextern pb_1
cextern pw_2
; Eight words of 1<<13, used as the pmulhrsw multiplier in the SSSE3 xy2 loop.
; pmulhrsw computes (a*b + 0x4000) >> 15 per word, so multiplying by 8192
; turns each word a into (a + 2) >> 2.
pw_8192: times 8 dw (1<<13)
; pshufb masks applied right after packuswb in the SSSE3 xy2 loop. Output byte
; 2i is taken from source byte i and output byte 2i+1 from source byte i+N/2,
; i.e. the two packed halves of the register are interleaved byte by byte.
; N = 16: loaded by the 16-pixel variant.
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
; N = 8: loaded by the 8-pixel variant.
pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
SECTION_TEXT
@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
;-----------------------------------------------------------------------------
; SSSE3_XY2_ROW op, prev_a, prev_b, cur_a, cur_b
;
; Produces one output row of the xy2 filter and advances the row offset.
;   op             put or avg
;   prev_a/prev_b  register numbers holding the pair sums of the previous
;                  source row (read at offsets +0 and +1); overwritten with
;                  the output row
;   cur_a/cur_b    register numbers that receive the pair sums of the current
;                  source row, to be used as "previous" by the next row
; Fixed registers: r0 = dst, r1 = src (already advanced by one line),
; r4 = running row offset, m4 = interleave mask, m5 = pb_1, m6 = scratch (avg).
;-----------------------------------------------------------------------------
%macro SSSE3_XY2_ROW 5
    ; Pair sums of the current source row at byte offsets +0 and +1.
    movu        m%4, [r1+r4]
    movu        m%5, [r1+r4+1]
    pmaddubsw   m%4, m5
    pmaddubsw   m%5, m5
    ; Add them to the previous row: each word is now a sum of four pixels.
    paddusw     m%2, m%4
    paddusw     m%3, m%5
    ; Multiplying by 8192 with pmulhrsw yields (sum + 2) >> 2.
    pmulhrsw    m%2, [pw_8192]
    pmulhrsw    m%3, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]             ; existing destination row
%endif
    ; Pack both word vectors to bytes, then restore pixel order.
    packuswb    m%2, m%3
    pshufb      m%2, m4
%ifidn %1, avg
    pavgb       m%2, m6                 ; average with the destination
%endif
    mova   [r0+r4], m%2
    add         r4, r2                  ; next line
%endmacro

;-----------------------------------------------------------------------------
; SSSE3_PIXELS_XY2 op [, xmm_count]
;
; Emits <op>_pixels8_xy2 when called with one argument and <op>_pixels16_xy2
; when called with two; the second argument is forwarded to cglobal.
; Matching C prototype:
;   void (uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Register roles: r0 = block, r1 = pixels, r2 = line_size, r3d = h,
;                 r4 = row offset, starting at 0.
; Two rows are written per loop iteration and h is decremented by 2 until it
; reaches zero, so the loop only terminates normally for even, non-zero h.
; pb_1 is an external constant (presumably all bytes equal to 1, which makes
; pmaddubsw sum adjacent byte pairs into words -- confirm at its definition).
;-----------------------------------------------------------------------------
%macro SSSE3_PIXELS_XY2 1-2
%if %0 < 2 ; single argument: 8-pixel variant
cglobal %1_pixels8_xy2, 4,5
    mova        m4, [pb_interleave8]
%else ; two arguments: 16-pixel variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova        m4, [pb_interleave16]
%endif
    mova        m5, [pb_1]

    ; Prime m0/m1 with the pair sums of the first source row.
    movu        m0, [r1]
    movu        m1, [r1+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5

    xor         r4, r4
    add         r1, r2                  ; r1 now addresses the second row

.loop:
    ; m0/m1 and m2/m3 swap the previous/current roles on alternate rows.
    SSSE3_XY2_ROW %1, 0, 1, 2, 3
    SSSE3_XY2_ROW %1, 2, 3, 0, 1
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro
; 8-pixel-wide put/avg variants: a single macro argument selects the
; pixels8 branch, assembled under INIT_MMX.
INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
; 16-pixel-wide put/avg variants, assembled under INIT_XMM. The second
; argument is forwarded as the third numeric argument of cglobal; it matches
; the number of vector registers the body touches (m0-m5 for put, plus m6
; for avg).
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7

View File

@ -95,6 +95,15 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/* SSSE3 "xy2" put/avg functions for 8- and 16-pixel-wide blocks.
 * hpeldsp_init_ssse3() installs them in slot [..][3] of put_pixels_tab and
 * avg_pixels_tab.
 * NOTE(review): presumably emitted by the SSSE3_PIXELS_XY2 macro in the
 * accompanying assembly change -- confirm the symbol naming there. */
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
@ -307,6 +316,16 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
#endif /* HAVE_SSE2_EXTERNAL */
}
/* Install the SSSE3 xy2 functions into slot 3 of the pixel tables:
 * row 0 receives the 16-pixel-wide versions, row 1 the 8-pixel-wide ones.
 * flags and cpu_flags are accepted but not read here. When
 * HAVE_SSSE3_EXTERNAL is 0 the body is empty. */
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_SSSE3_EXTERNAL
    /* put: store the filtered result */
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;

    /* avg: average the filtered result with the destination */
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
@ -322,4 +341,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
if (EXTERNAL_SSE2(cpu_flags))
hpeldsp_init_sse2(c, flags, cpu_flags);
if (EXTERNAL_SSSE3(cpu_flags))
hpeldsp_init_ssse3(c, flags, cpu_flags);
}