mirror of https://git.ffmpeg.org/ffmpeg.git
x86/dsputil: port ff_put_signed_pixels_clamped_mmx to yasm
Also add an SSE2 version.
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
7b05267239
commit
80ee2dfcf6
|
@@ -31,6 +31,8 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
|
||||||
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
||||||
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
|
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
|
||||||
|
|
||||||
|
cextern pb_80
|
||||||
|
|
||||||
SECTION_TEXT
|
SECTION_TEXT
|
||||||
|
|
||||||
%macro SCALARPRODUCT 0
|
%macro SCALARPRODUCT 0
|
||||||
|
@@ -573,3 +575,53 @@ CLEAR_BLOCKS 0
|
||||||
INIT_XMM sse
|
INIT_XMM sse
|
||||||
%define ZERO xorps
|
%define ZERO xorps
|
||||||
CLEAR_BLOCKS 1
|
CLEAR_BLOCKS 1
|
||||||
|
|
||||||
|
;--------------------------------------------------------------------------
|
||||||
|
;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
|
||||||
|
; int line_size)
|
||||||
|
;--------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; Convert four rows of eight signed 16-bit coefficients, starting %1 bytes
; into the block, to bytes and store them to four consecutive rows at pixelsq.
;
; Each row is packed with signed saturation (packsswb) and then has m0 added
; to every byte. m0 must already be loaded by the invoking macro; it is
; presumably the 0x80 bias that maps the signed result to unsigned pixels
; (the value of pb_80 is not visible here - confirm).
;
; Uses blockq, pixelsq, lsizeq and lsize3q. Clobbers m1-m4 when built for
; MMX and m1-m2 otherwise.
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
%if mmsize == 8
    ; MMX: one 8-byte register per output row, two loads per row.
    mova      m1, [blockq+mmsize*0+%1]
    mova      m2, [blockq+mmsize*2+%1]
    mova      m3, [blockq+mmsize*4+%1]
    mova      m4, [blockq+mmsize*6+%1]
    packsswb  m1, [blockq+mmsize*1+%1]
    packsswb  m2, [blockq+mmsize*3+%1]
    packsswb  m3, [blockq+mmsize*5+%1]
    packsswb  m4, [blockq+mmsize*7+%1]
    paddb     m1, m0
    paddb     m2, m0
    paddb     m3, m0
    paddb     m4, m0
    movq      [pixelsq+lsizeq*0], m1
    movq      [pixelsq+lsizeq*1], m2
    movq      [pixelsq+lsizeq*2], m3
    movq      [pixelsq+lsize3q ], m4
%else
    ; 16-byte registers: each register carries two output rows, the low
    ; half is stored with movq and the high half with movhps.
    mova      m1, [blockq+mmsize*0+%1]
    mova      m2, [blockq+mmsize*2+%1]
    packsswb  m1, [blockq+mmsize*1+%1]
    packsswb  m2, [blockq+mmsize*3+%1]
    paddb     m1, m0
    paddb     m2, m0
    movq      [pixelsq+lsizeq*0], m1
    movhps    [pixelsq+lsizeq*1], m1
    movq      [pixelsq+lsizeq*2], m2
    movhps    [pixelsq+lsize3q ], m2
%endif
%endmacro
|
||||||
|
|
||||||
|
; Emit put_signed_pixels_clamped for the instruction set selected by the
; preceding INIT_* directive.
;
; %1 is the XMM register count handed to cglobal.
;
; Arguments (see the prototype comment above): block, pixels, line_size.
; The block is processed as two halves of 64 bytes; each
; PUT_SIGNED_PIXELS_CLAMPED_HALF invocation writes four rows, and pixelsq is
; advanced by four rows in between.
%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    ; line_size is an int in the C prototype, so on x86-64 the upper half of
    ; its argument register is not guaranteed to be a sign extension. Widen
    ; it before using it as a 64-bit address offset. On x86-32 lsizeq and
    ; lsized are the same register and this emits nothing.
    movsxdifnidn lsizeq, lsized
    mova         m0, [pb_80]
    lea     lsize3q, [lsizeq*3]           ; offset of the fourth row
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
    lea     pixelsq, [pixelsq+lsizeq*4]   ; step past the four rows just written
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64     ; second 64 bytes of the block
    RET
%endmacro

; MMX build: no XMM registers are used.
INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
; SSE2 build: m0-m2 are used (bias constant plus two row-pair registers).
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3
|
||||||
|
|
|
@@ -530,7 +530,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||||
{
|
{
|
||||||
#if HAVE_MMX_INLINE
|
#if HAVE_MMX_INLINE
|
||||||
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
|
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
|
||||||
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
|
|
||||||
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
|
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
|
||||||
|
|
||||||
if (!high_bit_depth) {
|
if (!high_bit_depth) {
|
||||||
|
@@ -550,6 +549,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
||||||
c->clear_blocks = ff_clear_blocks_mmx;
|
c->clear_blocks = ff_clear_blocks_mmx;
|
||||||
}
|
}
|
||||||
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
|
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
|
||||||
|
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
|
||||||
#endif /* HAVE_MMX_EXTERNAL */
|
#endif /* HAVE_MMX_EXTERNAL */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
|
||||||
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
|
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
|
||||||
}
|
}
|
||||||
c->bswap_buf = ff_bswap32_buf_sse2;
|
c->bswap_buf = ff_bswap32_buf_sse2;
|
||||||
|
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
|
||||||
#endif /* HAVE_SSE2_EXTERNAL */
|
#endif /* HAVE_SSE2_EXTERNAL */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -94,42 +94,6 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
||||||
: "memory");
|
: "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
#define put_signed_pixels_clamped_mmx_half(off) \
|
|
||||||
"movq "#off"(%2), %%mm1 \n\t" \
|
|
||||||
"movq 16 + "#off"(%2), %%mm2 \n\t" \
|
|
||||||
"movq 32 + "#off"(%2), %%mm3 \n\t" \
|
|
||||||
"movq 48 + "#off"(%2), %%mm4 \n\t" \
|
|
||||||
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
|
|
||||||
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
|
|
||||||
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
|
|
||||||
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
|
|
||||||
"paddb %%mm0, %%mm1 \n\t" \
|
|
||||||
"paddb %%mm0, %%mm2 \n\t" \
|
|
||||||
"paddb %%mm0, %%mm3 \n\t" \
|
|
||||||
"paddb %%mm0, %%mm4 \n\t" \
|
|
||||||
"movq %%mm1, (%0) \n\t" \
|
|
||||||
"movq %%mm2, (%0, %3) \n\t" \
|
|
||||||
"movq %%mm3, (%0, %3, 2) \n\t" \
|
|
||||||
"movq %%mm4, (%0, %1) \n\t"
|
|
||||||
|
|
||||||
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
|
||||||
int line_size)
|
|
||||||
{
|
|
||||||
x86_reg line_skip = line_size;
|
|
||||||
x86_reg line_skip3;
|
|
||||||
|
|
||||||
__asm__ volatile (
|
|
||||||
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
|
|
||||||
"lea (%3, %3, 2), %1 \n\t"
|
|
||||||
put_signed_pixels_clamped_mmx_half(0)
|
|
||||||
"lea (%0, %3, 4), %0 \n\t"
|
|
||||||
put_signed_pixels_clamped_mmx_half(64)
|
|
||||||
: "+&r" (pixels), "=&r" (line_skip3)
|
|
||||||
: "r" (block), "r" (line_skip)
|
|
||||||
NAMED_CONSTRAINTS_ADD(ff_pb_80)
|
|
||||||
: "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
||||||
int line_size)
|
int line_size)
|
||||||
{
|
{
|
||||||
|
|
|
@@ -37,6 +37,8 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
||||||
int line_size);
|
int line_size);
|
||||||
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
||||||
int line_size);
|
int line_size);
|
||||||
|
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
|
||||||
|
int line_size);
|
||||||
|
|
||||||
void ff_clear_block_mmx(int16_t *block);
|
void ff_clear_block_mmx(int16_t *block);
|
||||||
void ff_clear_block_sse(int16_t *block);
|
void ff_clear_block_sse(int16_t *block);
|
||||||
|
|
Loading…
Reference in New Issue