From 62ea07d797c503bc4b727e56d9c0f914a93c8ef6 Mon Sep 17 00:00:00 2001
From: Janne Grunau <janne-libav@jannau.net>
Date: Tue, 10 Jan 2017 00:15:07 +0200
Subject: [PATCH] aarch64: vp9: use alternative returns in the core loop filter
 function

Since aarch64 has enough free general purpose registers, use them to
branch to the appropriate storage code. 1-2 cycles faster for the
functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
(up to 2 cycles faster/slower) on a cortex-a57.

This is cherrypicked from libav commit
d7595de0b25e7064fd9e06dea5d0425536cef6dc.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
---
 libavcodec/aarch64/vp9lpf_neon.S | 48 ++++++++++++--------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e727a4d0de..78aae61e87 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
 .endif
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        cbz             x5,  7f
+        cbnz            x5,  1f
+        br              x14
+1:
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         orr             x5,  x5,  x6
 .endif
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        cbz             x5,  8f
+        cbnz            x5,  1f
+        br              x15
 
+1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc
 
 function vp9_loop_filter_8
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_8_16b_mix
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_16
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@ endfunc
 
 function vp9_loop_filter_16_16b
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
 .endm
 
 .macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f
         bl              vp9_loop_filter_8
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_8_16b_mix mix
+        // calculate alternative 'return' targets
+        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm
 
 .macro loop_filter_16_16b
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm