diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S index 29d95ca228..8350153f73 100644 --- a/libavcodec/arm/vp9itxfm_16bpp_neon.S +++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S @@ -807,7 +807,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a @@ -853,9 +853,10 @@ endfunc vmov d8, d21 @ d8 = t10a butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10] -.endm + bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0}, [r12,:128]! vmovl.s16 q1, d1 @@ -933,7 +934,8 @@ endfunc vmov d16, d2 vmov d30, d4 -.endm + bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it, @@ -941,6 +943,8 @@ endfunc @ r0 = dst (temp buffer) @ r2 = src function \txfm\()16_1d_2x16_pass1_neon + push {lr} + mov r12, #64 vmov.s32 q4, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -948,7 +952,7 @@ function \txfm\()16_1d_2x16_pass1_neon vst1.32 {d8}, [r2,:64], r12 .endr - \txfm\()16 + bl \txfm\()16 @ Do eight 2x2 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight @@ -959,7 +963,7 @@ function \txfm\()16_1d_2x16_pass1_neon .irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31 vst1.32 {d\i}, [r0,:64]! .endr - bx lr + pop {pc} endfunc @ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it, @@ -968,6 +972,8 @@ endfunc @ r1 = dst stride @ r2 = src (temp buffer) function \txfm\()16_1d_2x16_pass2_neon + push {lr} + mov r12, #64 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 @@ -975,7 +981,7 @@ function \txfm\()16_1d_2x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 - \txfm\()16 + bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s32 \coef0, \coef0, #6 @@ -1019,7 +1025,7 @@ function \txfm\()16_1d_2x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store - bx lr + pop {pc} endfunc .endm @@ -1193,7 +1199,7 @@ function idct32x32_dc_add_neon pop {r4-r9,pc} endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs @ Overwrite the idct16 coeffs with the stored ones for idct32 @@ -1262,7 +1268,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a -.endm + bx lr +endfunc @ Do an 32-point IDCT of a 2x32 slice out of a 32x32 matrix. @ We don't have register space to do a single pass IDCT of 2x32 though, @@ -1274,6 +1281,8 @@ endfunc @ r1 = unused @ r2 = src function idct32_1d_2x32_pass1_neon + push {lr} + @ Double stride of the input, since we only read every other line mov r12, #256 vmov.s32 d8, #0 @@ -1284,7 +1293,7 @@ function idct32_1d_2x32_pass1_neon vst1.32 {d8}, [r2,:64], r12 .endr - idct16 + bl idct16 @ Do eight 2x2 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight @@ -1319,7 +1328,7 @@ function idct32_1d_2x32_pass1_neon vst1.16 {d8}, [r2,:64], r12 .endr - idct32_odd + bl idct32_odd transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @@ -1343,7 +1352,7 @@ function idct32_1d_2x32_pass1_neon store_rev 31, 29, 27, 25, 23, 21, 19, 17 store_rev 30, 28, 26, 24, 22, 20, 18, 16 .purgem store_rev - bx lr + pop {pc} endfunc .ltorg @@ -1354,6 +1363,8 @@ endfunc @ r1 = dst stride @ r2 = src (temp buffer) function idct32_1d_2x32_pass2_neon + push {lr} + mov r12, #256 @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1361,7 +1372,7 @@ function idct32_1d_2x32_pass2_neon .endr sub r2, r2, r12, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.32 {d\i}, [r2,:64], r12 @@ -1377,7 +1388,7 @@ function idct32_1d_2x32_pass2_neon sub r2, r2, r12, lsl #4 sub r2, r2, #128 - idct32_odd + bl idct32_odd @ Narrow the ict16 coefficients in q0-q3 into q0-q1, to @ allow clobbering q2-q3 below. @@ -1439,7 +1450,7 @@ function idct32_1d_2x32_pass2_neon vmovl.s16 q3, d3 vmovl.s16 q1, d1 vmovl.s16 q0, d0 - bx lr + pop {pc} endfunc const min_eob_idct_idct_32, align=4