From a493f8541de20e76073433f39f66da31f3834bc4 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer
Date: Mon, 20 Jan 2014 03:51:21 +0100
Subject: [PATCH] avcodec/x86/dsp: add_int16_mmx / add_int16_sse2

Signed-off-by: Michael Niedermayer
---
Review note: relative to the original commit this revision accesses the
32-bit arguments (mask, w) through their dword registers and adds comments.

 libavcodec/x86/dsputil.asm    | 70 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil_init.c |  3 ++
 libavcodec/x86/dsputil_x86.h  |  2 ++
 3 files changed, 75 insertions(+)

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e20f8..9450cd8fd6 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -465,6 +465,76 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
 
+
+;-----------------------------------------------------------------------------
+; void ff_add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
+;
+; dst[i] = (dst[i] + src[i]) & mask for each of the w 16-bit elements.
+;-----------------------------------------------------------------------------
+%macro ADD_INT16_LOOP 1 ; %1 = is_aligned
+    movd      m4, maskd               ; mask is a 32-bit argument
+    punpcklwd m4, m4                  ; replicate the low word of mask ...
+    punpcklwd m4, m4
+    punpcklwd m4, m4                  ; ... into every word lane of m4
+    add       wd, wd                  ; elements -> bytes; dword op zero-extends
+                                      ; into wq (assumes w >= 0)
+    test      wq, 2*mmsize - 1
+    jz %%.tomainloop
+%%.wordloop:                          ; peel words off the tail until the byte
+    sub       wq, 2                   ; count is a multiple of 2*mmsize
+    mov       ax, [srcq+wq]
+    add       ax, [dstq+wq]
+    and       ax, maskw
+    mov       [dstq+wq], ax
+    test      wq, 2*mmsize - 1
+    jnz %%.wordloop
+%%.tomainloop:
+    add     srcq, wq                  ; point past the end of the remaining data
+    add     dstq, wq
+    neg       wq                      ; wq counts up from -bytes to 0
+    jz      %%.end
+%%.loop:                              ; two vectors per iteration
+%if %1
+    mova      m0, [srcq+wq]
+    mova      m1, [dstq+wq]
+    mova      m2, [srcq+wq+mmsize]
+    mova      m3, [dstq+wq+mmsize]
+%else
+    movu      m0, [srcq+wq]
+    movu      m1, [dstq+wq]
+    movu      m2, [srcq+wq+mmsize]
+    movu      m3, [dstq+wq+mmsize]
+%endif
+    paddw     m0, m1
+    paddw     m2, m3
+    pand      m0, m4
+    pand      m2, m4
+%if %1
+    mova      [dstq+wq]       , m0
+    mova      [dstq+wq+mmsize], m2
+%else
+    movu      [dstq+wq]       , m0
+    movu      [dstq+wq+mmsize], m2
+%endif
+    add       wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w
+    ADD_INT16_LOOP 1
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w
+    test srcq, mmsize-1               ; aligned loop only if both pointers
+    jnz .unaligned                    ; are mmsize-aligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    ADD_INT16_LOOP 1
+.unaligned:
+    ADD_INT16_LOOP 0
+
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e0b40410a7..08bd29720a 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -542,6 +542,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMX_INLINE */
 
 #if HAVE_MMX_EXTERNAL
+    c->add_int16 = ff_add_int16_mmx;
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
@@ -625,6 +626,8 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
+
+    c->add_int16 = ff_add_int16_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 356b2c142f..e707e55a59 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -116,6 +116,8 @@ void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 
 void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 
 void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                         const uint8_t *diff, int w,