mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-02-04 13:53:26 +00:00
x86: hpeldsp: implement SSSE3 version of _xy2
Keeping pb_1 in a register rather than pw_8192 was benchmarked to be more efficient. Keeping both constants in registers yields no advantage over keeping just one; keeping one saves ~11 cycles. Decicycle counts: put8: 3223 (mmx) -> 2387; avg8: 2863 (mmxext) -> 2125; put16: 4356 (sse2) -> 3553; avg16: 4481 (sse2) -> 3513. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
726316240b
commit
81aa0f4604
@ -30,6 +30,9 @@
|
||||
SECTION_RODATA
|
||||
cextern pb_1 ; defined elsewhere; loaded into m5 by SSSE3_PIXELS_XY2 as the pmaddubsw multiplier
|
||||
cextern pw_2
|
||||
pw_8192: times 8 dw (1<<13) ; eight words of 8192; pmulhrsw by this value computes (x + 2) >> 2
|
||||
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; pshufb mask: alternate bytes from the low and high 8-byte halves
|
||||
pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 ; pshufb mask: alternate bytes from the low and high 4-byte halves
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
|
||||
INIT_XMM sse2
|
||||
SET_PIXELS_XY2 put
|
||||
SET_PIXELS_XY2 avg
|
||||
|
||||
; SSSE3_PIXELS_XY2 op [, n]
;
; Emits one function that, for every output pixel, sums the 2x2 source
; neighbourhood (x, x+1 on two consecutive rows) and stores (sum + 2) >> 2.
;   %1 - pasted into the function name; when it is "avg" the result is
;        additionally averaged (pavgb) with the bytes already at the destination.
;   %2 - optional; its presence selects the 16-pixel variant and the value is
;        forwarded to cglobal (presumably the vector register count -- see x86inc).
; Register use inside the body: r0 = destination base, r1 = source base,
; r2 = row stride, r3d = row counter, r4 = running row offset.
; NOTE(review): presumably r0..r3 map to (block, pixels, line_size, h) of the
; C prototypes -- confirm against the cglobal argument convention.
; Two rows are produced per loop iteration and the loop exits only when
; r3d reaches exactly zero, so the row count must be a positive even number.
; Each row load reads one byte past the vector width ([r1+..+1]) and one source
; row more than the number of rows written.
%macro SSSE3_PIXELS_XY2 1-2
|
||||
%if %0 == 2 ; two arguments: 16-pixel variant (instantiated under INIT_XMM ssse3, not sse2)
|
||||
cglobal %1_pixels16_xy2, 4,5,%2
|
||||
mova m4, [pb_interleave16] ; m4 = pshufb mask that restores pixel order after packuswb
|
||||
%else ; one argument: 8-pixel variant
|
||||
cglobal %1_pixels8_xy2, 4,5
|
||||
mova m4, [pb_interleave8] ; m4 = pshufb mask that restores pixel order after packuswb
|
||||
%endif
|
||||
mova m5, [pb_1] ; kept in a register for every pmaddubsw below
|
||||
movu m0, [r1] ; first source row
|
||||
movu m1, [r1+1] ; same row, shifted by one byte
|
||||
pmaddubsw m0, m5 ; words p[0]+p[1], p[2]+p[3], ... (assuming pb_1 is 1 in every byte)
|
||||
pmaddubsw m1, m5 ; words p[1]+p[2], p[3]+p[4], ...
|
||||
xor r4, r4 ; r4 = 0: byte offset of the current output row
|
||||
add r1, r2 ; r1 now points one row below, so [r1+r4] is the row under output row r4
|
||||
.loop:
|
||||
movu m2, [r1+r4] ; row below the one held in m0/m1
|
||||
movu m3, [r1+r4+1]
|
||||
pmaddubsw m2, m5 ; horizontal pair sums, even positions
|
||||
pmaddubsw m3, m5 ; horizontal pair sums, odd positions
|
||||
paddusw m0, m2 ; add the two vertically adjacent rows: 2x2 sums, even positions
|
||||
paddusw m1, m3 ; 2x2 sums, odd positions
|
||||
pmulhrsw m0, [pw_8192] ; (x * 8192 + 0x4000) >> 15 == (x + 2) >> 2
|
||||
pmulhrsw m1, [pw_8192]
|
||||
%ifidn %1, avg
|
||||
mova m6, [r0+r4] ; current destination row
|
||||
packuswb m0, m1 ; low half = even-position results, high half = odd-position results
|
||||
pshufb m0, m4 ; interleave the halves back into pixel order
|
||||
pavgb m0, m6 ; rounded byte average with the destination
|
||||
%else
|
||||
packuswb m0, m1 ; low half = even-position results, high half = odd-position results
|
||||
pshufb m0, m4 ; interleave the halves back into pixel order
|
||||
%endif
|
||||
mova [r0+r4], m0 ; store first output row of this iteration
|
||||
add r4, r2 ; advance one row
|
||||
|
|
||||
movu m0, [r1+r4] ; m0/m1 are reloaded here and stay live into the next iteration
|
||||
movu m1, [r1+r4+1]
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m5
|
||||
paddusw m2, m0 ; m2/m3 still hold the previous row's pair sums
|
||||
paddusw m3, m1
|
||||
pmulhrsw m2, [pw_8192] ; (x + 2) >> 2, as above
|
||||
pmulhrsw m3, [pw_8192]
|
||||
%ifidn %1, avg
|
||||
mova m6, [r0+r4] ; current destination row
|
||||
packuswb m2, m3
|
||||
pshufb m2, m4
|
||||
pavgb m2, m6
|
||||
%else
|
||||
packuswb m2, m3
|
||||
pshufb m2, m4
|
||||
%endif
|
||||
mova [r0+r4], m2 ; store second output row of this iteration
|
||||
add r4, r2 ; advance one row
|
||||
sub r3d, 2 ; two rows were written in this iteration
|
||||
jnz .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX ssse3 ; one-argument invocations below take the pixels8 branch of the macro
|
||||
SSSE3_PIXELS_XY2 put
|
||||
SSSE3_PIXELS_XY2 avg
|
||||
INIT_XMM ssse3 ; two-argument invocations below take the pixels16 branch
|
||||
SSSE3_PIXELS_XY2 put, 6 ; put uses m0-m5
|
||||
SSSE3_PIXELS_XY2 avg, 7 ; avg additionally uses m6 for the destination row
|
||||
|
@ -95,6 +95,15 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
|
||||
/* SSSE3 xy2 routines; hpeldsp_init_ssse3() stores them in slot [3] of
 * put_pixels_tab / avg_pixels_tab (row [1] for the 8-pixel functions,
 * row [0] for the 16-pixel ones).
 * NOTE(review): presumably emitted by the SSSE3_PIXELS_XY2 assembly macro --
 * confirm the symbol naming produced by cglobal. */
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
|
||||
#define avg_pixels8_mmx ff_avg_pixels8_mmx
|
||||
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
|
||||
#define avg_pixels16_mmx ff_avg_pixels16_mmx
|
||||
@ -307,6 +316,16 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
#endif /* HAVE_SSE2_EXTERNAL */
|
||||
}
|
||||
|
||||
/**
 * Install the SSSE3 xy2 routines into the function tables of @p c.
 *
 * Slot [3] of put_pixels_tab and avg_pixels_tab is overwritten: row [0]
 * receives the 16-pixel functions and row [1] the 8-pixel ones.
 * When HAVE_SSSE3_EXTERNAL is 0 the body is compiled out and the function
 * does nothing.
 *
 * @param c         context whose tables are updated
 * @param flags     not read by this function
 * @param cpu_flags not read by this function; the caller checks
 *                  EXTERNAL_SSSE3(cpu_flags) before calling
 */
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
{
|
||||
#if HAVE_SSSE3_EXTERNAL
|
||||
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
|
||||
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
|
||||
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
|
||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
|
||||
#endif
|
||||
}
|
||||
|
||||
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -322,4 +341,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags))
|
||||
hpeldsp_init_sse2(c, flags, cpu_flags);
|
||||
|
||||
if (EXTERNAL_SSSE3(cpu_flags))
|
||||
hpeldsp_init_ssse3(c, flags, cpu_flags);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user