From 557c1675cf0e803b2fee43b4c8b58433842c84d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Thu, 10 Nov 2016 11:07:39 +0200
Subject: [PATCH] arm: vp9mc: Minor adjustments from review of the aarch64
 version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is sponsored by, and copyright, Google.

The speedup for the large horizontal filters is surprisingly big
on A7 and A53, while there's a minor slowdown (almost within
measurement noise) on A8 and A9.

                        Cortex       A7       A8       A9      A53
orig:
vp9_put_8tap_smooth_64h_neon:   20270.0  14447.3  19723.9  10910.9
new:
vp9_put_8tap_smooth_64h_neon:   20165.8  14466.5  19730.2  10668.8

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/vp9dsp_init_arm.c |   2 +-
 libavcodec/arm/vp9mc_neon.S      | 133 ++++++++++---------------------
 2 files changed, 44 insertions(+), 91 deletions(-)

diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index 1b00177f85..839037aed3 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -43,7 +43,7 @@ static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint8_t *src, ptrdiff_t src_stride, \
                                         int h, int mx, int my)                    \
 {                                                                                  \
-    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);         \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);            \
     /* We only need h + 7 lines, but the horizontal filter assumes an              \
      * even number of rows, so filter h + 8 lines here. */                         \
     ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                     \
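The hunk above replaces the ternary in the temp buffer size with branchless
arithmetic. The two expressions are equivalent for every block size this
template is instantiated with; a minimal standalone check, not part of the
patch, assuming the VP9 block sizes 4..64:

    #include <assert.h>

    /* Old form: 2*sz rows of temp below size 64, capped at 64 rows,
     * plus 8 extra rows, times sz bytes per row. */
    static int temp_size_old(int sz)
    {
        return ((sz < 64 ? 2 * sz : 64) + 8) * sz;
    }

    /* New form: (sz < 64) evaluates to 1 or 0, giving 2*sz below 64
     * and 1*sz == 64 at sz == 64, without a conditional. */
    static int temp_size_new(int sz)
    {
        return ((1 + (sz < 64)) * sz + 8) * sz;
    }

    int main(void)
    {
        static const int sizes[] = { 4, 8, 16, 32, 64 };
        for (int i = 0; i < 5; i++)
            assert(temp_size_old(sizes[i]) == temp_size_new(sizes[i]));
        return 0;
    }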
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
index cc8f241764..9deb6562a5 100644
--- a/libavcodec/arm/vp9mc_neon.S
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -20,60 +20,6 @@
 
 #include "libavutil/arm/asm.S"
 
-const regular_filter, align=4
-        .short   0,  1,  -5, 126,   8,  -3,  1,  0
-        .short  -1,  3, -10, 122,  18,  -6,  2,  0
-        .short  -1,  4, -13, 118,  27,  -9,  3, -1
-        .short  -1,  4, -16, 112,  37, -11,  4, -1
-        .short  -1,  5, -18, 105,  48, -14,  4, -1
-        .short  -1,  5, -19,  97,  58, -16,  5, -1
-        .short  -1,  6, -19,  88,  68, -18,  5, -1
-        .short  -1,  6, -19,  78,  78, -19,  6, -1
-        .short  -1,  5, -18,  68,  88, -19,  6, -1
-        .short  -1,  5, -16,  58,  97, -19,  5, -1
-        .short  -1,  4, -14,  48, 105, -18,  5, -1
-        .short  -1,  4, -11,  37, 112, -16,  4, -1
-        .short  -1,  3,  -9,  27, 118, -13,  4, -1
-        .short   0,  2,  -6,  18, 122, -10,  3, -1
-        .short   0,  1,  -3,   8, 126,  -5,  1,  0
-endconst
-
-const sharp_filter, align=4
-        .short  -1,  3,  -7, 127,   8,  -3,  1,  0
-        .short  -2,  5, -13, 125,  17,  -6,  3, -1
-        .short  -3,  7, -17, 121,  27, -10,  5, -2
-        .short  -4,  9, -20, 115,  37, -13,  6, -2
-        .short  -4, 10, -23, 108,  48, -16,  8, -3
-        .short  -4, 10, -24, 100,  59, -19,  9, -3
-        .short  -4, 11, -24,  90,  70, -21, 10, -4
-        .short  -4, 11, -23,  80,  80, -23, 11, -4
-        .short  -4, 10, -21,  70,  90, -24, 11, -4
-        .short  -3,  9, -19,  59, 100, -24, 10, -4
-        .short  -3,  8, -16,  48, 108, -23, 10, -4
-        .short  -2,  6, -13,  37, 115, -20,  9, -4
-        .short  -2,  5, -10,  27, 121, -17,  7, -3
-        .short  -1,  3,  -6,  17, 125, -13,  5, -2
-        .short   0,  1,  -3,   8, 127,  -7,  3, -1
-endconst
-
-const smooth_filter, align=4
-        .short  -3, -1,  32,  64,  38,   1, -3,  0
-        .short  -2, -2,  29,  63,  41,   2, -3,  0
-        .short  -2, -2,  26,  63,  43,   4, -4,  0
-        .short  -2, -3,  24,  62,  46,   5, -4,  0
-        .short  -2, -3,  21,  60,  49,   7, -4,  0
-        .short  -1, -4,  18,  59,  51,   9, -4,  0
-        .short  -1, -4,  16,  57,  53,  12, -4, -1
-        .short  -1, -4,  14,  55,  55,  14, -4, -1
-        .short  -1, -4,  12,  53,  57,  16, -4, -1
-        .short   0, -4,   9,  51,  59,  18, -4, -1
-        .short   0, -4,   7,  49,  60,  21, -3, -2
-        .short   0, -4,   5,  46,  62,  24, -3, -2
-        .short   0, -4,   4,  43,  63,  26, -2, -2
-        .short   0, -3,   2,  41,  63,  29, -2, -2
-        .short   0, -3,   1,  38,  64,  32, -1, -3
-endconst
-
 @ All public functions in this file have the following signature:
 @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
 @                             const uint8_t *ref, ptrdiff_t ref_stride,
@@ -156,20 +102,21 @@ function ff_vp9_copy16_neon, export=1
 endfunc
 
 function ff_vp9_avg16_neon, export=1
-        ldr             r12, [sp]
+        push            {lr}
+        ldr             r12, [sp, #4]
+        mov             lr,  r0
 1:
         vld1.8          {q2},  [r2], r3
         vld1.8          {q0},  [r0, :128], r1
         vld1.8          {q3},  [r2], r3
         vrhadd.u8       q0,  q0,  q2
-        vld1.8          {q1},  [r0, :128]
-        sub             r0,  r0,  r1
+        vld1.8          {q1},  [r0, :128], r1
         vrhadd.u8       q1,  q1,  q3
         subs            r12, r12, #2
-        vst1.8          {q0},  [r0, :128], r1
-        vst1.8          {q1},  [r0, :128], r1
+        vst1.8          {q0},  [lr, :128], r1
+        vst1.8          {q1},  [lr, :128], r1
         bne             1b
-        bx              lr
+        pop             {pc}
 endfunc
 
 function ff_vp9_copy8_neon, export=1
@@ -218,7 +165,9 @@ function ff_vp9_copy4_neon, export=1
 endfunc
 
 function ff_vp9_avg4_neon, export=1
-        ldr             r12, [sp]
+        push            {lr}
+        ldr             r12, [sp, #4]
+        mov             lr,  r0
 1:
         vld1.32         {d4[]},   [r2], r3
         vld1.32         {d0[]},   [r0, :32], r1
@@ -231,15 +180,14 @@ function ff_vp9_avg4_neon, export=1
         vld1.32         {d7[]},   [r2], r3
         vrhadd.u8       d2,  d2,  d6
         vld1.32         {d3[]},   [r0, :32], r1
-        sub             r0,  r0,  r1, lsl #2
         subs            r12, r12, #4
-        vst1.32         {d0[0]},  [r0, :32], r1
+        vst1.32         {d0[0]},  [lr, :32], r1
         vrhadd.u8       d3,  d3,  d7
-        vst1.32         {d1[0]},  [r0, :32], r1
-        vst1.32         {d2[0]},  [r0, :32], r1
-        vst1.32         {d3[0]},  [r0, :32], r1
+        vst1.32         {d1[0]},  [lr, :32], r1
+        vst1.32         {d2[0]},  [lr, :32], r1
+        vst1.32         {d3[0]},  [lr, :32], r1
         bne             1b
-        bx              lr
+        pop             {pc}
 endfunc
 
 @ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
@@ -327,7 +275,8 @@ function \type\()_8tap_\size\()h_\idx1\idx2
         sub             r3,  r3,  #8
 .endif
         @ Load the filter vector
-        vld1.16         {q0},  [r12,:128]
+        vld1.8          {d0},  [r12,:64]
+        vmovl.s8        q0,  d0
 1:
 .if \size >= 16
         mov             r12, r5
@@ -397,12 +346,12 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .endif
         @ Store and loop horizontally (for size >= 16)
 .if \size >= 16
+        subs            r12, r12, #16
         vst1.8          {q1},  [r0,:128]!
         vst1.8          {q3},  [r6,:128]!
+        beq             3f
         vmov            q8,  q10
         vmov            q11, q13
-        subs            r12, r12, #16
-        beq             3f
         vld1.8          {q10}, [r2]!
         vld1.8          {q13}, [r7]!
         vmovl.u8        q9,  d20
@@ -444,7 +393,7 @@ do_8tap_h_size 4
 do_8tap_h_size 8
 do_8tap_h_size 16
 
-.macro do_8tap_h_func type, filter, size
+.macro do_8tap_h_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
         push            {r4-r7}
 .if \size >= 16
@@ -455,9 +404,10 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
         ldr             r4,  [sp, #16]
         ldr             r5,  [sp, #20]
 .endif
-        movrel          r12, \filter\()_filter-16
+        movrelx         r12, X(ff_vp9_subpel_filters)
+        add             r12, r12, 120*\offset - 8
         cmp             r5,  #8
-        add             r12, r12, r5, lsl #4
+        add             r12, r12, r5, lsl #3
         mov             r5,  #\size
 .if \size >= 16
         bge             \type\()_8tap_16h_34
@@ -470,12 +420,12 @@ endfunc
 .endm
 
 .macro do_8tap_h_filters size
-do_8tap_h_func put, regular, \size
-do_8tap_h_func avg, regular, \size
-do_8tap_h_func put, sharp, \size
-do_8tap_h_func avg, sharp, \size
-do_8tap_h_func put, smooth, \size
-do_8tap_h_func avg, smooth, \size
+do_8tap_h_func put, regular, 1, \size
+do_8tap_h_func avg, regular, 1, \size
+do_8tap_h_func put, sharp, 2, \size
+do_8tap_h_func avg, sharp, 2, \size
+do_8tap_h_func put, smooth, 0, \size
+do_8tap_h_func avg, smooth, 0, \size
 .endm
 
 do_8tap_h_filters 64
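The movrelx/add sequence above switches from the per-filter tables deleted at
the top of this file to FFmpeg's shared ff_vp9_subpel_filters table. A sketch
of the address arithmetic, not part of the patch, assuming the shared table's
layout of int8_t[3][15][8] (three filter banks of 15 phases, mx/my = 1..15,
with phase 0 not stored, hence the -8 bias); the helper name is hypothetical:

    #include <assert.h>
    #include <stdint.h>

    /* Stand-in with the same shape as ff_vp9_subpel_filters. */
    static const int8_t subpel_filters[3][15][8];

    static const int8_t *filter_ptr(int bank, int mx)
    {
        const uint8_t *base = (const uint8_t *)subpel_filters;
        /* base + 120*bank - 8 + 8*mx, mirroring the add chain above */
        return (const int8_t *)(base + (120 * bank + (mx << 3) - 8));
    }

    int main(void)
    {
        for (int bank = 0; bank < 3; bank++)
            for (int mx = 1; mx < 16; mx++)
                assert(filter_ptr(bank, mx) == subpel_filters[bank][mx - 1]);
        return 0;
    }

The same computation is repeated for the vertical filters in do_8tap_v_func
below.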
@@ -590,7 +540,8 @@ do_8tap_h_filters 4
 function \type\()_8tap_8v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.16         {q0},  [r12, :128]
+        vld1.8          {d0},  [r12, :64]
+        vmovl.s8        q0,  d0
 
 1:
         mov             r12, r4
@@ -660,7 +611,8 @@ do_8tap_8v avg, 4, 3
 function \type\()_8tap_4v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.16         {q0},  [r12, :128]
+        vld1.8          {d0},  [r12, :64]
+        vmovl.s8        q0,  d0
 
         vld1.32         {d2[]}, [r2], r3
         vld1.32         {d3[]}, [r2], r3
@@ -723,14 +675,15 @@ do_8tap_4v put, 4, 3
 do_8tap_4v avg, 3, 4
 do_8tap_4v avg, 4, 3
 
-.macro do_8tap_v_func type, filter, size
+.macro do_8tap_v_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         push            {r4-r5}
         vpush           {q4-q7}
         ldr             r4,  [sp, #72]
         ldr             r5,  [sp, #80]
-        movrel          r12, \filter\()_filter-16
-        add             r12, r12, r5, lsl #4
+        movrelx         r12, X(ff_vp9_subpel_filters)
+        add             r12, r12, 120*\offset - 8
+        add             r12, r12, r5, lsl #3
         cmp             r5,  #8
         mov             r5,  #\size
 .if \size >= 8
@@ -744,12 +697,12 @@ endfunc
 .endm
 
 .macro do_8tap_v_filters size
-do_8tap_v_func put, regular, \size
-do_8tap_v_func avg, regular, \size
-do_8tap_v_func put, sharp, \size
-do_8tap_v_func avg, sharp, \size
-do_8tap_v_func put, smooth, \size
-do_8tap_v_func avg, smooth, \size
+do_8tap_v_func put, regular, 1, \size
+do_8tap_v_func avg, regular, 1, \size
+do_8tap_v_func put, sharp, 2, \size
+do_8tap_v_func avg, sharp, 2, \size
+do_8tap_v_func put, smooth, 0, \size
+do_8tap_v_func avg, smooth, 0, \size
 .endm
 
 do_8tap_v_filters 64
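Throughout the file, the filter loads change from a vld1.16 of the old 16-bit
tables to a vld1.8 plus vmovl.s8 pair, since the shared table stores the
coefficients as int8_t. All VP9 subpel coefficients fit in [-24, 127], so
sign-extending them to the 16-bit lanes used by the multiplies reproduces the
old values exactly. In scalar C terms (a sketch; the helper name is
hypothetical):

    #include <stdint.h>

    /* What vld1.8 {d0} followed by vmovl.s8 q0, d0 amounts to:
     * load eight int8_t taps and sign-extend each to an int16_t lane. */
    static void load_filter_s16(int16_t dst[8], const int8_t src[8])
    {
        for (int i = 0; i < 8; i++)
            dst[i] = src[i]; /* implicit sign extension */
    }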