aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

Signed-off-by: Martin Storsjö <martin@martin.st>
2025-04-18 21:17:07 +00:00 · 2016-11-22 15:47:17 +02:00 · 2016-11-22 15:47:17 +02:00 · 2f99117f6f
commit 2f99117f6f
parent 2dbe2aa2c2
1 changed files with 15 additions and 11 deletions
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@ endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-        mov             x9, #32
        movi            v2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
@@ -649,8 +649,8 @@ endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-        mov             x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
 .endr
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
 .endif
+        mov             x9, #32
 .irp i, 0, 8
        add             x0,  sp,  #(\i*32)
@@ -882,13 +883,12 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
        ld1             {v0.8h,v1.8h}, [x10]
-        // Double stride of the input, since we only read every other line
-        mov             x9,  #128
        movi            v4.8h, #0
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
        ld1             {v0.8h,v1.8h}, [x10]
-        mov             x9, #128
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
        idct16
-        mov             x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
        idct32_odd
-        mov             x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
        ld1             {v4.8h},  [x2], x9
        ld1             {v5.8h},  [x2], x9
-.if \neg == 0
        add             v4.8h, v4.8h, v\a\().8h
        ld1             {v6.8h},  [x2], x9
        add             v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
        add             v6.8h, v6.8h, v\c\().8h
        add             v7.8h, v7.8h, v\d\().8h
 .else
+        ld1             {v4.8h},  [x2], x7
+        ld1             {v5.8h},  [x2], x7
        sub             v4.8h, v4.8h, v\a\().8h
-        ld1             {v6.8h},  [x2], x9
+        ld1             {v6.8h},  [x2], x7
        sub             v5.8h, v5.8h, v\b\().8h
-        ld1             {v7.8h},  [x2], x9
+        ld1             {v7.8h},  [x2], x7
        sub             v6.8h, v6.8h, v\c\().8h
        sub             v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16
        sub             x2,  x2,  x9
-        neg             x9,  x9
        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
        mov             x5,  x1
        mov             x6,  x2
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #128
+        neg             x7,  x9
 .irp i, 0, 8, 16, 24
        add             x0,  sp,  #(\i*64)
        add             x2,  x6,  #(\i*2)