// Patch summary: move the transpose_4x8H macro into the shared neon.S.
//
// - libavcodec/aarch64/neon.S gains `.macro transpose_4x8H` (the `+` lines).
// - libavcodec/aarch64/vp9lpf_16bpp_neon.S loses its own macro of the same name
//   and parameter list (the `-` lines). That file has `#include "neon.S"` as a
//   context line of its hunk, so the shared definition is what it will pick up.
// - The two copies carry the same instruction sequence and operands; the only
//   textual difference is the case of the arrangement specifiers (.8H/.4S in
//   the added copy, .8h/.4s in the removed one).
//
// transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
//   r0-r3: vector registers read as input and overwritten with the result.
//   t4-t7: vector registers used as temporaries; their prior contents are lost.
//
//   Step 1 (trn1/trn2 on .8H): interleave the even/odd 16-bit lanes of the
//   pairs (r0, r1) and (r2, r3) into t4..t7.
//   Step 2 (trn1/trn2 on .4S): interleave the even/odd 32-bit lanes of the
//   pairs (t4, t6) and (t5, t7), writing r0, r2, r1, r3 in that order.
//   Net effect, following TRN1/TRN2 lane semantics: output register rN holds
//   lane N of the four inputs r0..r3 in its lanes 0-3 and lane N+4 of the four
//   inputs in its lanes 4-7, i.e. the low and high 4x4 halfword blocks are each
//   transposed independently.
//
// NOTE(review): this patch text has been collapsed onto a single line, so line
// breaks and blank context lines are not recoverable from here; the hunk
// counts (-109,12 +109,25 and -22,18 +22,6) refer to the original multi-line
// form. Restore the line structure from the upstream commit before applying.
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S index 0fddbecae3..1ad32c359d 100644 --- a/libavcodec/aarch64/neon.S +++ b/libavcodec/aarch64/neon.S @@ -109,12 +109,25 @@ trn2 \r5\().4H, \r0\().4H, \r1\().4H trn1 \r6\().4H, \r2\().4H, \r3\().4H trn2 \r7\().4H, \r2\().4H, \r3\().4H + trn1 \r0\().2S, \r4\().2S, \r6\().2S trn2 \r2\().2S, \r4\().2S, \r6\().2S trn1 \r1\().2S, \r5\().2S, \r7\().2S trn2 \r3\().2S, \r5\().2S, \r7\().2S .endm +.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8H, \r0\().8H, \r1\().8H + trn2 \t5\().8H, \r0\().8H, \r1\().8H + trn1 \t6\().8H, \r2\().8H, \r3\().8H + trn2 \t7\().8H, \r2\().8H, \r3\().8H + + trn1 \r0\().4S, \t4\().4S, \t6\().4S + trn2 \r2\().4S, \t4\().4S, \t6\().4S + trn1 \r1\().4S, \t5\().4S, \t7\().4S + trn2 \r3\().4S, \t5\().4S, \t7\().4S +.endm + .macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 trn1 \r8\().8H, \r0\().8H, \r1\().8H trn2 \r9\().8H, \r0\().8H, \r1\().8H diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S index 9075f3d406..9869614a29 100644 --- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S +++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S @@ -22,18 +22,6 @@ #include "neon.S" -.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 - trn1 \t4\().8h, \r0\().8h, \r1\().8h - trn2 \t5\().8h, \r0\().8h, \r1\().8h - trn1 \t6\().8h, \r2\().8h, \r3\().8h - trn2 \t7\().8h, \r2\().8h, \r3\().8h - - trn1 \r0\().4s, \t4\().4s, \t6\().4s - trn2 \r2\().4s, \t4\().4s, \t6\().4s - trn1 \r1\().4s, \t5\().4s, \t7\().4s - trn2 \r3\().4s, \t5\().4s, \t7\().4s -.endm - // The input to and output from this macro is in the registers v16-v31, // and v0-v7 are used as scratch registers. // p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31