From a493f8541de20e76073433f39f66da31f3834bc4 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Mon, 20 Jan 2014 03:51:21 +0100
Subject: [PATCH] avcodec/x86/dsp: add_int16_mmx / add_int16_sse2

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavcodec/x86/dsputil.asm    | 65 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil_init.c |  3 ++
 libavcodec/x86/dsputil_x86.h  |  2 ++
 3 files changed, 70 insertions(+)

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e20f8..9450cd8fd6 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -465,6 +465,71 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
 
+
+%macro ADD_INT16_LOOP 1 ; %1 = is_aligned; dst[i] = (dst[i] + src[i]) & mask, for w 16-bit words
+    movd      m4, maskd          ; mask is a 32-bit argument, only its low word is used
+    punpcklwd m4, m4             ; replicate the low word across the register ...
+    punpcklwd m4, m4
+    punpcklwd m4, m4             ; ... third unpack is only needed to fill a 128-bit register
+    add     wd, wd               ; words -> bytes; 32-bit op zero-extends (upper half of an int arg is undefined on x86-64)
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop             ; byte count already a multiple of one main-loop iteration
+%%.wordloop:                     ; scalar tail: peel words off the end until wq is a multiple of 2*mmsize
+    sub     wq, 2
+    mov     tmpw, [srcq+wq]      ; tmp is a dedicated scratch reg; ax would alias dst (r0 = eax) on x86-32
+    add     tmpw, [dstq+wq]
+    and     tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+%%.tomainloop:
+    add     srcq, wq             ; point both pointers at the end of the vectorised region
+    add     dstq, wq
+    neg     wq                   ; wq = negative byte offset counting up towards 0
+    jz      %%.end               ; nothing left for the vector loop
+%%.loop:                         ; two registers (2*mmsize bytes) per iteration
+%if %1
+    mova    m0, [srcq+wq]
+    mova    m1, [dstq+wq]
+    mova    m2, [srcq+wq+mmsize]
+    mova    m3, [dstq+wq+mmsize]
+%else
+    movu    m0, [srcq+wq]
+    movu    m1, [dstq+wq]
+    movu    m2, [srcq+wq+mmsize]
+    movu    m3, [dstq+wq+mmsize]
+%endif
+    paddw   m0, m1               ; wrapping 16-bit add
+    paddw   m2, m3
+    pand    m0, m4               ; keep only the bits selected by mask
+    pand    m2, m4
+%if %1
+    mova    [dstq+wq]       , m0
+    mova    [dstq+wq+mmsize], m2
+%else
+    movu    [dstq+wq]       , m0
+    movu    [dstq+wq+mmsize], m2
+%endif
+    add     wq, 2*mmsize
+    jl %%.loop                   ; loop while the offset is still negative
+%%.end:
+    RET
+%endmacro
+
+INIT_MMX mmx                     ; void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp ; 4 args loaded, 5th GPR reserved as scratch for the tail loop
+    ADD_INT16_LOOP 1             ; 64-bit MMX moves have no alignment requirement
+
+INIT_XMM sse2                    ; void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp ; 4 args loaded, 5th GPR reserved as scratch for the tail loop
+    test srcq, mmsize-1          ; aligned path needs both pointers 16-byte aligned
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    ADD_INT16_LOOP 1             ; macro ends in RET, so this does not fall through
+.unaligned:
+    ADD_INT16_LOOP 0
+
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e0b40410a7..08bd29720a 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -542,6 +542,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMX_INLINE */
 
 #if HAVE_MMX_EXTERNAL
+    c->add_int16 = ff_add_int16_mmx;
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
@@ -625,6 +626,8 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
+
+    c->add_int16 = ff_add_int16_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 356b2c142f..e707e55a59 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -116,6 +116,8 @@ void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 
 void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 
 void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                         const uint8_t *diff, int w,