From 557c1675cf0e803b2fee43b4c8b58433842c84d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Thu, 10 Nov 2016 11:07:39 +0200
Subject: [PATCH] arm: vp9mc: Minor adjustments from review of the aarch64
 version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is sponsored by, and copyright, Google.

The speedup for the large horizontal filters is surprisingly big
on A7 and A53, while there's a minor slowdown (almost within
measurement noise) on A8 and A9.

                        Cortex       A7       A8       A9      A53
orig:
vp9_put_8tap_smooth_64h_neon:   20270.0  14447.3  19723.9  10910.9
new:
vp9_put_8tap_smooth_64h_neon:   20165.8  14466.5  19730.2  10668.8

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/vp9dsp_init_arm.c |   2 +-
 libavcodec/arm/vp9mc_neon.S      | 133 ++++++++++---------------------
 2 files changed, 44 insertions(+), 91 deletions(-)

diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index 1b00177f85..839037aed3 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -43,7 +43,7 @@ static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint8_t *src, ptrdiff_t src_stride, \
                                         int h, int mx, int my)                    \
 {                                                                                  \
-    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);         \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);            \
     /* We only need h + 7 lines, but the horizontal filter assumes an              \
      * even number of rows, so filter h + 8 lines here. */                         \
     ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                     \
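The hunk above replaces the ternary in the temp buffer size with branchless
arithmetic. The two expressions are equivalent for every block size this
template is instantiated with; a minimal standalone check, not part of the
patch, assuming the VP9 block sizes 4..64:

    #include <assert.h>

    /* Old form: 2*sz rows of temp below size 64, capped at 64 rows,
     * plus 8 extra rows, times sz bytes per row. */
    static int temp_size_old(int sz)
    {
        return ((sz < 64 ? 2 * sz : 64) + 8) * sz;
    }

    /* New form: (sz < 64) evaluates to 1 or 0, giving 2*sz below 64
     * and 1*sz == 64 at sz == 64, without a conditional. */
    static int temp_size_new(int sz)
    {
        return ((1 + (sz < 64)) * sz + 8) * sz;
    }

    int main(void)
    {
        static const int sizes[] = { 4, 8, 16, 32, 64 };
        for (int i = 0; i < 5; i++)
            assert(temp_size_old(sizes[i]) == temp_size_new(sizes[i]));
        return 0;
    }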
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
index cc8f241764..9deb6562a5 100644
--- a/libavcodec/arm/vp9mc_neon.S
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -20,60 +20,6 @@
 
 #include "libavutil/arm/asm.S"
 
-const regular_filter, align=4
-        .short   0,  1,  -5, 126,   8,  -3,  1,  0
-        .short  -1,  3, -10, 122,  18,  -6,  2,  0
-        .short  -1,  4, -13, 118,  27,  -9,  3, -1
-        .short  -1,  4, -16, 112,  37, -11,  4, -1
-        .short  -1,  5, -18, 105,  48, -14,  4, -1
-        .short  -1,  5, -19,  97,  58, -16,  5, -1
-        .short  -1,  6, -19,  88,  68, -18,  5, -1
-        .short  -1,  6, -19,  78,  78, -19,  6, -1
-        .short  -1,  5, -18,  68,  88, -19,  6, -1
-        .short  -1,  5, -16,  58,  97, -19,  5, -1
-        .short  -1,  4, -14,  48, 105, -18,  5, -1
-        .short  -1,  4, -11,  37, 112, -16,  4, -1
-        .short  -1,  3,  -9,  27, 118, -13,  4, -1
-        .short   0,  2,  -6,  18, 122, -10,  3, -1
-        .short   0,  1,  -3,   8, 126,  -5,  1,  0
-endconst
-
-const sharp_filter, align=4
-        .short  -1,  3,  -7, 127,   8,  -3,  1,  0
-        .short  -2,  5, -13, 125,  17,  -6,  3, -1
-        .short  -3,  7, -17, 121,  27, -10,  5, -2
-        .short  -4,  9, -20, 115,  37, -13,  6, -2
-        .short  -4, 10, -23, 108,  48, -16,  8, -3
-        .short  -4, 10, -24, 100,  59, -19,  9, -3
-        .short  -4, 11, -24,  90,  70, -21, 10, -4
-        .short  -4, 11, -23,  80,  80, -23, 11, -4
-        .short  -4, 10, -21,  70,  90, -24, 11, -4
-        .short  -3,  9, -19,  59, 100, -24, 10, -4
-        .short  -3,  8, -16,  48, 108, -23, 10, -4
-        .short  -2,  6, -13,  37, 115, -20,  9, -4
-        .short  -2,  5, -10,  27, 121, -17,  7, -3
-        .short  -1,  3,  -6,  17, 125, -13,  5, -2
-        .short   0,  1,  -3,   8, 127,  -7,  3, -1
-endconst
-
-const smooth_filter, align=4
-        .short  -3, -1,  32,  64,  38,   1, -3,  0
-        .short  -2, -2,  29,  63,  41,   2, -3,  0
-        .short  -2, -2,  26,  63,  43,   4, -4,  0
-        .short  -2, -3,  24,  62,  46,   5, -4,  0
-        .short  -2, -3,  21,  60,  49,   7, -4,  0
-        .short  -1, -4,  18,  59,  51,   9, -4,  0
-        .short  -1, -4,  16,  57,  53,  12, -4, -1
-        .short  -1, -4,  14,  55,  55,  14, -4, -1
-        .short  -1, -4,  12,  53,  57,  16, -4, -1
-        .short   0, -4,   9,  51,  59,  18, -4, -1
-        .short   0, -4,   7,  49,  60,  21, -3, -2
-        .short   0, -4,   5,  46,  62,  24, -3, -2
-        .short   0, -4,   4,  43,  63,  26, -2, -2
-        .short   0, -3,   2,  41,  63,  29, -2, -2
-        .short   0, -3,   1,  38,  64,  32, -1, -3
-endconst
-
 @ All public functions in this file have the following signature:
 @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
 @                             const uint8_t *ref, ptrdiff_t ref_stride,
@@ -156,20 +102,21 @@ function ff_vp9_copy16_neon, export=1
 endfunc
 
 function ff_vp9_avg16_neon, export=1
-        ldr             r12, [sp]
+        push            {lr}
+        ldr             r12, [sp, #4]
+        mov             lr,  r0
 1:
         vld1.8          {q2},  [r2], r3
         vld1.8          {q0},  [r0, :128], r1
         vld1.8          {q3},  [r2], r3
         vrhadd.u8       q0,  q0,  q2
-        vld1.8          {q1},  [r0, :128]
-        sub             r0,  r0,  r1
+        vld1.8          {q1},  [r0, :128], r1
         vrhadd.u8       q1,  q1,  q3
         subs            r12, r12, #2
-        vst1.8          {q0},  [r0, :128], r1
-        vst1.8          {q1},  [r0, :128], r1
+        vst1.8          {q0},  [lr, :128], r1
+        vst1.8          {q1},  [lr, :128], r1
         bne             1b
-        bx              lr
+        pop             {pc}
 endfunc
 
 function ff_vp9_copy8_neon, export=1
@@ -218,7 +165,9 @@ function ff_vp9_copy4_neon, export=1
 endfunc
 
 function ff_vp9_avg4_neon, export=1
-        ldr             r12, [sp]
+        push            {lr}
+        ldr             r12, [sp, #4]
+        mov             lr,  r0
 1:
         vld1.32         {d4[]},   [r2], r3
         vld1.32         {d0[]},   [r0, :32], r1
@@ -231,15 +180,14 @@ function ff_vp9_avg4_neon, export=1
         vld1.32         {d7[]},   [r2], r3
         vrhadd.u8       d2,  d2,  d6
         vld1.32         {d3[]},   [r0, :32], r1
-        sub             r0,  r0,  r1, lsl #2
         subs            r12, r12, #4
-        vst1.32         {d0[0]},  [r0, :32], r1
+        vst1.32         {d0[0]},  [lr, :32], r1
         vrhadd.u8       d3,  d3,  d7
-        vst1.32         {d1[0]},  [r0, :32], r1
-        vst1.32         {d2[0]},  [r0, :32], r1
-        vst1.32         {d3[0]},  [r0, :32], r1
+        vst1.32         {d1[0]},  [lr, :32], r1
+        vst1.32         {d2[0]},  [lr, :32], r1
+        vst1.32         {d3[0]},  [lr, :32], r1
         bne             1b
-        bx              lr
+        pop             {pc}
 endfunc
 
 @ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
@@ -327,7 +275,8 @@ function \type\()_8tap_\size\()h_\idx1\idx2
         sub             r3,  r3,  #8
 .endif
         @ Load the filter vector
-        vld1.16         {q0},  [r12,:128]
+        vld1.8          {d0},  [r12,:64]
+        vmovl.s8        q0,  d0
 1:
 .if \size >= 16
         mov             r12, r5
@@ -397,12 +346,12 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .endif
         @ Store and loop horizontally (for size >= 16)
 .if \size >= 16
+        subs            r12, r12, #16
         vst1.8          {q1},  [r0,:128]!
         vst1.8          {q3},  [r6,:128]!
+        beq             3f
         vmov            q8,  q10
         vmov            q11, q13
-        subs            r12, r12, #16
-        beq             3f
         vld1.8          {q10}, [r2]!
         vld1.8          {q13}, [r7]!
         vmovl.u8        q9,  d20
@@ -444,7 +393,7 @@ do_8tap_h_size 4
 do_8tap_h_size 8
 do_8tap_h_size 16
 
-.macro do_8tap_h_func type, filter, size
+.macro do_8tap_h_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
         push            {r4-r7}
 .if \size >= 16
@@ -455,9 +404,10 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
         ldr             r4,  [sp, #16]
         ldr             r5,  [sp, #20]
 .endif
-        movrel          r12, \filter\()_filter-16
+        movrelx         r12, X(ff_vp9_subpel_filters)
+        add             r12, r12, 120*\offset - 8
         cmp             r5,  #8
-        add             r12, r12, r5, lsl #4
+        add             r12, r12, r5, lsl #3
         mov             r5,  #\size
 .if \size >= 16
         bge             \type\()_8tap_16h_34
@@ -470,12 +420,12 @@ endfunc
 .endm
 
 .macro do_8tap_h_filters size
-do_8tap_h_func put, regular, \size
-do_8tap_h_func avg, regular, \size
-do_8tap_h_func put, sharp, \size
-do_8tap_h_func avg, sharp, \size
-do_8tap_h_func put, smooth, \size
-do_8tap_h_func avg, smooth, \size
+do_8tap_h_func put, regular, 1, \size
+do_8tap_h_func avg, regular, 1, \size
+do_8tap_h_func put, sharp, 2, \size
+do_8tap_h_func avg, sharp, 2, \size
+do_8tap_h_func put, smooth, 0, \size
+do_8tap_h_func avg, smooth, 0, \size
 .endm
 
 do_8tap_h_filters 64
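The movrelx/add sequence above switches from the per-filter tables deleted at
the top of this file to FFmpeg's shared ff_vp9_subpel_filters table. A sketch
of the address arithmetic, not part of the patch, assuming the shared table's
layout of int8_t[3][15][8] (three filter banks of 15 phases, mx/my = 1..15,
with phase 0 not stored, hence the -8 bias); the helper name is hypothetical:

    #include <assert.h>
    #include <stdint.h>

    /* Stand-in with the same shape as ff_vp9_subpel_filters. */
    static const int8_t subpel_filters[3][15][8];

    static const int8_t *filter_ptr(int bank, int mx)
    {
        const uint8_t *base = (const uint8_t *)subpel_filters;
        /* base + 120*bank - 8 + 8*mx, mirroring the add chain above */
        return (const int8_t *)(base + (120 * bank + (mx << 3) - 8));
    }

    int main(void)
    {
        for (int bank = 0; bank < 3; bank++)
            for (int mx = 1; mx < 16; mx++)
                assert(filter_ptr(bank, mx) == subpel_filters[bank][mx - 1]);
        return 0;
    }

The same computation is repeated for the vertical filters in do_8tap_v_func
below.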
@@ -590,7 +540,8 @@ do_8tap_h_filters 4
 function \type\()_8tap_8v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.16         {q0},  [r12, :128]
+        vld1.8          {d0},  [r12, :64]
+        vmovl.s8        q0,  d0
 
 1:
         mov             r12, r4
@@ -660,7 +611,8 @@ do_8tap_8v avg, 4, 3
 function \type\()_8tap_4v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.16         {q0},  [r12, :128]
+        vld1.8          {d0},  [r12, :64]
+        vmovl.s8        q0,  d0
 
         vld1.32         {d2[]}, [r2], r3
         vld1.32         {d3[]}, [r2], r3
@@ -723,14 +675,15 @@ do_8tap_4v put, 4, 3
 do_8tap_4v avg, 3, 4
 do_8tap_4v avg, 4, 3
 
-.macro do_8tap_v_func type, filter, size
+.macro do_8tap_v_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         push            {r4-r5}
         vpush           {q4-q7}
         ldr             r4,  [sp, #72]
         ldr             r5,  [sp, #80]
-        movrel          r12, \filter\()_filter-16
-        add             r12, r12, r5, lsl #4
+        movrelx         r12, X(ff_vp9_subpel_filters)
+        add             r12, r12, 120*\offset - 8
+        add             r12, r12, r5, lsl #3
         cmp             r5,  #8
         mov             r5,  #\size
 .if \size >= 8
@@ -744,12 +697,12 @@ endfunc
 .endm
 
 .macro do_8tap_v_filters size
-do_8tap_v_func put, regular, \size
-do_8tap_v_func avg, regular, \size
-do_8tap_v_func put, sharp, \size
-do_8tap_v_func avg, sharp, \size
-do_8tap_v_func put, smooth, \size
-do_8tap_v_func avg, smooth, \size
+do_8tap_v_func put, regular, 1, \size
+do_8tap_v_func avg, regular, 1, \size
+do_8tap_v_func put, sharp, 2, \size
+do_8tap_v_func avg, sharp, 2, \size
+do_8tap_v_func put, smooth, 0, \size
+do_8tap_v_func avg, smooth, 0, \size
 .endm
 
 do_8tap_v_filters 64
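Throughout the file, the filter loads change from a vld1.16 of the old 16-bit
tables to a vld1.8 plus vmovl.s8 pair, since the shared table stores the
coefficients as int8_t. All VP9 subpel coefficients fit in [-24, 127], so
sign-extending them to the 16-bit lanes used by the multiplies reproduces the
old values exactly. In scalar C terms (a sketch; the helper name is
hypothetical):

    #include <stdint.h>

    /* What vld1.8 {d0} followed by vmovl.s8 q0, d0 amounts to:
     * load eight int8_t taps and sign-extend each to an int16_t lane. */
    static void load_filter_s16(int16_t dst[8], const int8_t src[8])
    {
        for (int i = 0; i < 8; i++)
            dst[i] = src[i]; /* implicit sign extension */
    }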