mirror of https://git.ffmpeg.org/ffmpeg.git
x86/dsputil: port clear_block functions to yasm
Signed-off-by: James Almer <jamrial@gmail.com> Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
afaa39b46d
commit
7b05267239
|
@ -513,3 +513,63 @@ BSWAP32_BUF
|
|||
|
||||
INIT_XMM ssse3
|
||||
BSWAP32_BUF
|
||||
|
||||
;----------------------------------------
|
||||
; void ff_clear_block(int16_t *blocks);
|
||||
;----------------------------------------
|
||||
; %1 = number of xmm registers used
|
||||
; %2 = number of inline store loops
|
||||
; Emits clear_block for the current instruction set: m0 is zeroed with ZERO
; and then stored to 8*%2 consecutive mmsize-wide slots starting at blocks.
; With the instantiations below that is 128 bytes in both cases
; (mmx: 16 stores of 8 bytes, sse: 8 stores of 16 bytes).
; mova is an aligned store, so blocks is presumably expected to be
; mmsize-aligned -- confirm against the callers.
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
    ZERO  m0, m0
%assign %%slot 0
%rep 8*%2
    mova  [blocksq+mmsize*%%slot], m0
%assign %%slot %%slot+1
%endrep
    RET
%endmacro

; ZERO selects the register-clearing instruction for each instantiation.
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 2

INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 1
|
||||
|
||||
;-----------------------------------------
|
||||
; void ff_clear_blocks(int16_t *blocks);
|
||||
;-----------------------------------------
|
||||
; %1 = number of xmm registers used
|
||||
; Emits clear_blocks for the current instruction set: zeroes the 768 bytes
; (6 * 128) that start at blocks.
;
; blocksq is advanced to the end of the region and lenq runs from -768 up
; to 0, so a single add both advances the offset and sets the flags that
; drive the loop. Each iteration stores 8 * mmsize bytes.
; mova is an aligned store, so blocks is presumably expected to be
; mmsize-aligned -- confirm against the callers.
%macro CLEAR_BLOCKS 1
cglobal clear_blocks, 1, 2, %1, blocks, len
    add   blocksq, 768
    mov      lenq, -768
    ZERO       m0, m0
.loop:
    mova  [blocksq+lenq+mmsize*0], m0
    mova  [blocksq+lenq+mmsize*1], m0
    mova  [blocksq+lenq+mmsize*2], m0
    mova  [blocksq+lenq+mmsize*3], m0
    mova  [blocksq+lenq+mmsize*4], m0
    mova  [blocksq+lenq+mmsize*5], m0
    mova  [blocksq+lenq+mmsize*6], m0
    mova  [blocksq+lenq+mmsize*7], m0
    add   lenq, mmsize*8
    js .loop                    ; keep going while the offset is still negative
    RET
%endmacro

; ZERO selects the register-clearing instruction for each instantiation.
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
|
||||
|
|
|
@ -534,8 +534,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
|||
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
|
||||
|
||||
if (!high_bit_depth) {
|
||||
c->clear_block = ff_clear_block_mmx;
|
||||
c->clear_blocks = ff_clear_blocks_mmx;
|
||||
c->draw_edges = ff_draw_edges_mmx;
|
||||
}
|
||||
|
||||
|
@ -547,6 +545,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
|
|||
#endif /* HAVE_MMX_INLINE */
|
||||
|
||||
#if HAVE_MMX_EXTERNAL
|
||||
if (!high_bit_depth) {
|
||||
c->clear_block = ff_clear_block_mmx;
|
||||
c->clear_blocks = ff_clear_blocks_mmx;
|
||||
}
|
||||
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
|
||||
#endif /* HAVE_MMX_EXTERNAL */
|
||||
}
|
||||
|
@ -585,7 +587,10 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
|
|||
{
|
||||
#if HAVE_SSE_INLINE
|
||||
c->vector_clipf = ff_vector_clipf_sse;
|
||||
#endif /* HAVE_SSE_INLINE */
|
||||
|
||||
#if HAVE_YASM
|
||||
#if HAVE_SSE_EXTERNAL
|
||||
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
|
||||
if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
|
||||
return;
|
||||
|
@ -594,9 +599,7 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
|
|||
c->clear_block = ff_clear_block_sse;
|
||||
c->clear_blocks = ff_clear_blocks_sse;
|
||||
}
|
||||
#endif /* HAVE_SSE_INLINE */
|
||||
|
||||
#if HAVE_YASM
|
||||
#endif
|
||||
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
|
||||
c->gmc = ff_gmc_sse;
|
||||
#endif
|
||||
|
|
|
@ -172,61 +172,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
|
|||
} while (--i);
|
||||
}
|
||||
|
||||
#define CLEAR_BLOCKS(name, n) \
|
||||
void name(int16_t *blocks) \
|
||||
{ \
|
||||
__asm__ volatile ( \
|
||||
"pxor %%mm7, %%mm7 \n\t" \
|
||||
"mov $-"#n", %%"REG_a" \n\t" \
|
||||
"1: \n\t" \
|
||||
"movq %%mm7, (%0, %%"REG_a") \n\t" \
|
||||
"movq %%mm7, 8(%0, %%"REG_a") \n\t" \
|
||||
"movq %%mm7, 16(%0, %%"REG_a") \n\t" \
|
||||
"movq %%mm7, 24(%0, %%"REG_a") \n\t" \
|
||||
"add $32, %%"REG_a" \n\t" \
|
||||
"js 1b \n\t" \
|
||||
:: "r"(((uint8_t *) blocks) + n) \
|
||||
: "%"REG_a); \
|
||||
}
|
||||
CLEAR_BLOCKS(ff_clear_blocks_mmx, 768)
|
||||
CLEAR_BLOCKS(ff_clear_block_mmx, 128)
|
||||
|
||||
/**
 * Zero 128 bytes starting at block with eight 16-byte SSE stores.
 *
 * movaps is an aligned store, so block must be 16-byte aligned.
 * xmm0 is overwritten. The "memory" clobber tells the compiler that the
 * asm writes through the pointer operand.
 *
 * @param block start of the region to clear
 */
void ff_clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        : /* no outputs */
        : "r" (block)
        : "memory"
    );
}
|
||||
|
||||
void ff_clear_blocks_sse(int16_t *blocks)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"xorps %%xmm0, %%xmm0 \n"
|
||||
"mov $-768, %%"REG_a" \n"
|
||||
"1: \n"
|
||||
"movaps %%xmm0, (%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 16(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 32(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 48(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 64(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 80(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 96(%0, %%"REG_a") \n"
|
||||
"movaps %%xmm0, 112(%0, %%"REG_a") \n"
|
||||
"add $128, %%"REG_a" \n"
|
||||
"js 1b \n"
|
||||
:: "r"(((uint8_t *) blocks) + 128 * 6)
|
||||
: "%"REG_a);
|
||||
}
|
||||
|
||||
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
|
||||
{
|
||||
x86_reg i = 0;
|
||||
|
|
Loading…
Reference in New Issue