From 357f45d9bc532180bde2204ff6f03adf881d12d5 Mon Sep 17 00:00:00 2001
From: David Conrad <lessen42@gmail.com>
Date: Fri, 17 Oct 2008 03:18:08 +0000
Subject: [PATCH] MMX VP3 Loop Filter

Originally committed as revision 15630 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/i386/dsputil_mmx.c |   7 +++
 libavcodec/i386/dsputil_mmx.h |  18 ++++++
 libavcodec/i386/vp3dsp_mmx.c  | 101 ++++++++++++++++++++++++++++++++++
 libavcodec/i386/vp3dsp_mmx.h  |   3 +
 libavcodec/vp3.c              |   3 +-
 5 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index f15eac987f..b23664d960 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -63,7 +63,9 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
 
@@ -2591,6 +2593,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->h263_v_loop_filter= h263_v_loop_filter_mmx;
             c->h263_h_loop_filter= h263_h_loop_filter_mmx;
         }
+        if ((ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) &&
+            !(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx;
+            c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx;
+        }
         c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
         c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
         c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
diff --git a/libavcodec/i386/dsputil_mmx.h b/libavcodec/i386/dsputil_mmx.h
index 5f81cb88d9..6c056f7313 100644
--- a/libavcodec/i386/dsputil_mmx.h
+++ b/libavcodec/i386/dsputil_mmx.h
@@ -50,7 +50,9 @@ extern const uint64_t ff_pw_255;
 extern const uint64_t ff_pb_1;
 extern const uint64_t ff_pb_3;
 extern const uint64_t ff_pb_7;
+extern const uint64_t ff_pb_1F;
 extern const uint64_t ff_pb_3F;
+extern const uint64_t ff_pb_81;
 extern const uint64_t ff_pb_A1;
 extern const uint64_t ff_pb_FC;
 
@@ -86,6 +88,22 @@ extern const double ff_pd_2[2];
     SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
     SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
 
+// transposes 8 rows of 4 bytes (a-d: low dwords of registers; e,f,g,h can be memory)
+// out: the four 8-byte columns in a,d,t,c (in that order); b is overwritten
+#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
+    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
+    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
+    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
+    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
+    SBUTTERFLY(a, b, t, bw, q)   /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
+                                 /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
+    SBUTTERFLY(c, d, b, bw, q)   /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
+                                 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
+    SBUTTERFLY(a, c, d, wd, q)   /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
+                                 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
+    SBUTTERFLY(t, b, c, wd, q)   /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
+                                 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
+
 #ifdef ARCH_X86_64
 // permutes 01234567 -> 05736421
 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/i386/vp3dsp_mmx.c
index e7571c0b9e..32af301c00 100644
--- a/libavcodec/i386/vp3dsp_mmx.c
+++ b/libavcodec/i386/vp3dsp_mmx.c
@@ -23,11 +23,112 @@
  * MMX-optimized functions cribbed from the original VP3 source code.
  */
 
+#include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
 #include "dsputil_mmx.h"
 
 extern const uint16_t ff_vp3_idct_data[];
 
+// this is off by one or two for some cases when filter_limit is greater than 63
+// in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1; flim: operand with the 8 per-byte limits
+// out: p1 in mm4, p2 in mm3 (clobbers mm0-mm7); NOTE(review): pavgb/pminub need MMX2, not plain MMX
+#define VP3_LOOP_FILTER(flim) \
+    "movq       %%mm6, %%mm7 \n\t" /* mm7 = p0 */ \
+    "pand    "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
+    "psrlw         $3, %%mm7 \n\t" /* word-wide shift; bits leaked across bytes are masked next */ \
+    "pand    "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
+    "movq       %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
+    "pxor       %%mm4, %%mm2 \n\t" /* p2^p1 */ \
+    "pand    "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
+    "movq       %%mm2, %%mm5 \n\t" \
+    "paddb      %%mm2, %%mm2 \n\t" /* 2*((p2^p1)&1) */ \
+    "paddb      %%mm5, %%mm2 \n\t" /* 3*((p2^p1)&1) */ \
+    "paddb      %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
+    "pcmpeqb    %%mm0, %%mm0 \n\t" /* mm0 = all ones */ \
+    "pxor       %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
+    "pavgb      %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
+    "pxor       %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
+    "pavgb      %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
+    "paddb   "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
+    "pavgb      %%mm0, %%mm1 \n\t" /* 128+2+(   p2-p1  - p3) >> 2 */ \
+    "pavgb      %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
+    "paddusb    %%mm1, %%mm7 \n\t" /* d+128+1 */ \
+    "movq    "MANGLE(ff_pb_81)", %%mm6 \n\t" \
+    "psubusb    %%mm7, %%mm6 \n\t" /* mm6 = sat(0x81 - mm7): magnitude where d < 0, else 0 */ \
+    "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" /* mm7 = sat(mm7 - 0x81): magnitude where d > 0, else 0 */ \
+\
+    "movq     "#flim", %%mm5 \n\t" /* mm5 = limit bytes */ \
+    "pminub     %%mm5, %%mm6 \n\t" /* x = min(magnitude, limit), for both signs */ \
+    "pminub     %%mm5, %%mm7 \n\t" \
+    "movq       %%mm6, %%mm0 \n\t" /* keep copies of x */ \
+    "movq       %%mm7, %%mm1 \n\t" \
+    "paddb      %%mm6, %%mm6 \n\t" /* 2*x (wraps modulo 256) */ \
+    "paddb      %%mm7, %%mm7 \n\t" \
+    "pminub     %%mm5, %%mm6 \n\t" /* min(2*x, limit) */ \
+    "pminub     %%mm5, %%mm7 \n\t" \
+    "psubb      %%mm0, %%mm6 \n\t" /* min(2*x, limit) - x */ \
+    "psubb      %%mm1, %%mm7 \n\t" \
+    "paddusb    %%mm7, %%mm4 \n\t" /* p1 += positive part (saturating) */ \
+    "psubusb    %%mm6, %%mm4 \n\t" /* p1 -= negative part (saturating) */ \
+    "psubusb    %%mm7, %%mm3 \n\t" /* p2 -= positive part (saturating) */ \
+    "paddusb    %%mm6, %%mm3 \n\t" /* p2 += negative part (saturating) */
+
+#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) /* uses asm operand %0 as scratch */ \
+    "movd "#mm", %0        \n\t" /* %0 = low dword of mm */ \
+    "movw   %w0, -1"#dst0" \n\t" /* bits 0-15 -> the 2 bytes at dst0 - 1 */ \
+    "psrlq  $32, "#mm"     \n\t" /* move high dword down (mm is modified) */ \
+    "shr    $16, %0        \n\t" \
+    "movw   %w0, -1"#dst1" \n\t" /* original bits 16-31 -> dst1 - 1 */ \
+    "movd "#mm", %0        \n\t" \
+    "movw   %w0, -1"#dst2" \n\t" /* original bits 32-47 -> dst2 - 1 */ \
+    "shr    $16, %0        \n\t" \
+    "movw   %w0, -1"#dst3" \n\t" /* original bits 48-63 -> dst3 - 1 */
+
+void ff_vp3_v_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values)
+{ /* filters 8 adjacent columns over the 4 rows src-2*stride .. src+stride */
+    __asm__ volatile(
+        "movq          %0, %%mm6 \n\t" /* p0: row at src - 2*stride */
+        "movq          %1, %%mm4 \n\t" /* p1: row at src - stride */
+        "movq          %2, %%mm2 \n\t" /* p2: row at src */
+        "movq          %3, %%mm1 \n\t" /* p3: row at src + stride */
+
+        VP3_LOOP_FILTER(%4)
+
+        "movq       %%mm4, %1    \n\t" /* only the two middle rows are written back */
+        "movq       %%mm3, %2    \n\t"
+
+        : "+m" (*(uint64_t*)(src - 2*stride)),
+          "+m" (*(uint64_t*)(src - 1*stride)),
+          "+m" (*(uint64_t*)(src + 0*stride)),
+          "+m" (*(uint64_t*)(src + 1*stride))
+        : "m"(*(uint64_t*)(bounding_values+129)) /* 8 bytes read starting at bounding_values[129] */
+    ); /* NOTE(review): no MMX registers are declared as clobbered */
+}
+
+void ff_vp3_h_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values)
+{ /* 8 rows x 4 bytes starting at src-2: transpose, filter, store the 2 middle bytes of each row */
+    x86_reg tmp; /* scratch general-purpose register (%0) used by STORE_4_WORDS */
+
+    __asm__ volatile(
+        "movd -2(%1),      %%mm6 \n\t" /* row 0: bytes src[-2..1] */
+        "movd -2(%1,%3),   %%mm0 \n\t" /* row 1 */
+        "movd -2(%1,%3,2), %%mm1 \n\t" /* row 2 */
+        "movd -2(%1,%4),   %%mm4 \n\t" /* row 3 */
+
+        TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) /* rows 4-7 from memory */
+        VP3_LOOP_FILTER(%5)
+        SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) /* presumably interleaves p1/p2 bytes: low half in mm4, high half in mm5 */
+
+        STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
+        STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
+
+        : "=&r"(tmp)
+        : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), /* %1..%4 */
+          "m"(*(uint64_t*)(bounding_values+129)) /* %5: 8 bytes read starting at bounding_values[129] */
+        : "memory"
+    );
+}
+
 /* from original comments: The Macro does IDct on 4 1-D Dcts */
 #define BeginIDCT() \
     "movq   "I(3)", %%mm2 \n\t" \
diff --git a/libavcodec/i386/vp3dsp_mmx.h b/libavcodec/i386/vp3dsp_mmx.h
index 2c6fc4fa59..252c5f1548 100644
--- a/libavcodec/i386/vp3dsp_mmx.h
+++ b/libavcodec/i386/vp3dsp_mmx.h
@@ -29,4 +29,7 @@ void ff_vp3_idct_mmx(int16_t *data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 
+void ff_vp3_v_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_h_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values);
+
 #endif /* AVCODEC_I386_VP3DSP_MMX_H */
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 9dd7055732..a5b97adb9e 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -229,7 +229,7 @@ typedef struct Vp3DecodeContext {
     uint16_t huffman_table[80][32][2];
 
     uint8_t filter_limit_values[64];
-    int bounding_values_array[256];
+    DECLARE_ALIGNED_8(int, bounding_values_array[256+2]);
 } Vp3DecodeContext;
 
 /************************************************************************
@@ -533,6 +533,7 @@ static void init_loop_filter(Vp3DecodeContext *s)
         bounding_values[x] = x;
         bounding_values[x + filter_limit] = filter_limit - x;
     }
+    bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202;
 }
 
 /*