diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index c1ea9bf4e2..c91dd8eb69 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -513,3 +513,63 @@ BSWAP32_BUF
 
 INIT_XMM ssse3
 BSWAP32_BUF
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of XMM registers used (forwarded to cglobal)
+; %2 = %rep count: how many times the 8-store sequence is emitted inline
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+    mova  [blocksq+mmsize*(4+%%i)], m0
+    mova  [blocksq+mmsize*(5+%%i)], m0
+    mova  [blocksq+mmsize*(6+%%i)], m0
+    mova  [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of XMM registers used (forwarded to cglobal)
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768              ; bias the pointer by +768 ...
+    mov      lenq, -768             ; ... and index from -768 up towards 0
+    ZERO       m0, m0
+.loop:
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop                        ; keep going while lenq is still negative
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 414da14118..4461ae464f 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -534,8 +534,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
     c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
 
     if (!high_bit_depth) {
-        c->clear_block = ff_clear_block_mmx;
-        c->clear_blocks = ff_clear_blocks_mmx;
         c->draw_edges = ff_draw_edges_mmx;
     }
 
@@ -547,6 +545,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMX_INLINE */
 
 #if HAVE_MMX_EXTERNAL
+    if (!high_bit_depth) {
+        c->clear_block = ff_clear_block_mmx;
+        c->clear_blocks = ff_clear_blocks_mmx;
+    }
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
@@ -585,7 +587,10 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
 {
 #if HAVE_SSE_INLINE
     c->vector_clipf = ff_vector_clipf_sse;
+#endif /* HAVE_SSE_INLINE */
 
+#if HAVE_YASM
+#if HAVE_SSE_EXTERNAL
     /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
     if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
         return;
@@ -594,9 +599,7 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
         c->clear_block = ff_clear_block_sse;
         c->clear_blocks = ff_clear_blocks_sse;
     }
-#endif /* HAVE_SSE_INLINE */
-
-#if HAVE_YASM
+#endif /* HAVE_SSE_EXTERNAL */
 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
     c->gmc = ff_gmc_sse;
 #endif
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 42e25c4ff6..a9c584d88a 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -172,61 +172,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
     } while (--i);
 }
 
-#define CLEAR_BLOCKS(name, n) \
-void name(int16_t *blocks) \
-{ \
-    __asm__ volatile ( \
-        "pxor %%mm7, %%mm7 \n\t" \
-        "mov $-"#n", %%"REG_a" \n\t" \
-        "1: \n\t" \
-        "movq %%mm7, (%0, %%"REG_a") \n\t" \
-        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
-        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
-        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
-        "add $32, %%"REG_a" \n\t" \
-        "js 1b \n\t" \
-        :: "r"(((uint8_t *) blocks) + n) \
-        : "%"REG_a); \
-}
-CLEAR_BLOCKS(ff_clear_blocks_mmx, 768)
-CLEAR_BLOCKS(ff_clear_block_mmx, 128)
-
-void ff_clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps %%xmm0, %%xmm0 \n"
-        "movaps %%xmm0, (%0) \n"
-        "movaps %%xmm0, 16(%0) \n"
-        "movaps %%xmm0, 32(%0) \n"
-        "movaps %%xmm0, 48(%0) \n"
-        "movaps %%xmm0, 64(%0) \n"
-        "movaps %%xmm0, 80(%0) \n"
-        "movaps %%xmm0, 96(%0) \n"
-        "movaps %%xmm0, 112(%0) \n"
-        :: "r" (block)
-        : "memory");
-}
-
-void ff_clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps %%xmm0, %%xmm0 \n"
-        "mov $-768, %%"REG_a" \n"
-        "1: \n"
-        "movaps %%xmm0, (%0, %%"REG_a") \n"
-        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
-        "add $128, %%"REG_a" \n"
-        "js 1b \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6)
-        : "%"REG_a);
-}
-
 void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
 {
     x86_reg i = 0;