lavc/hevcdsp_qpel_neon: use movi.16b instead of movi.2d

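When building for the iOS platform on arm64, the compiler emits a warning: "instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to movi.16b". Write movi.16b with #0 directly instead; both forms clear the entire 128-bit vector register, so behavior is unchanged.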

Signed-off-by: xufuji456 <839789740@qq.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
xufuji456 2023-11-15 11:53:12 +08:00 committed by Martin Storsjö
parent 67ce690bc6
commit cc86343b96
4 changed files with 134 additions and 134 deletions

View File

@ -694,7 +694,7 @@ function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
trn1 v4.2s, v4.2s, v5.2s trn1 v4.2s, v4.2s, v5.2s
trn1 v6.2s, v6.2s, v7.2s trn1 v6.2s, v6.2s, v7.2s
trn1 v4.2d, v4.2d, v6.2d trn1 v4.2d, v4.2d, v6.2d
movi v16.2d, #0 movi v16.16b, #0
usdot v16.4s, v4.16b, v30.16b usdot v16.4s, v4.16b, v30.16b
xtn v16.4h, v16.4s xtn v16.4h, v16.4s
st1 {v16.4h}, [x0], x10 st1 {v16.4h}, [x0], x10
@ -714,8 +714,8 @@ function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
trn2 v17.2s, v4.2s, v5.2s trn2 v17.2s, v4.2s, v5.2s
trn1 v6.2s, v6.2s, v7.2s trn1 v6.2s, v6.2s, v7.2s
trn1 v16.2d, v16.2d, v6.2d trn1 v16.2d, v16.2d, v6.2d
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
usdot v18.4s, v16.16b, v30.16b usdot v18.4s, v16.16b, v30.16b
usdot v19.2s, v17.8b, v30.8b usdot v19.2s, v17.8b, v30.8b
xtn v18.4h, v18.4s xtn v18.4h, v18.4s
@ -736,8 +736,8 @@ function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
ext v7.16b, v4.16b, v4.16b, #3 ext v7.16b, v4.16b, v4.16b, #3
zip1 v20.4s, v4.4s, v6.4s zip1 v20.4s, v4.4s, v6.4s
zip1 v21.4s, v5.4s, v7.4s zip1 v21.4s, v5.4s, v7.4s
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
usdot v16.4s, v20.16b, v30.16b usdot v16.4s, v20.16b, v30.16b
usdot v17.4s, v21.16b, v30.16b usdot v17.4s, v21.16b, v30.16b
xtn v16.4h, v16.4s xtn v16.4h, v16.4s
@ -761,9 +761,9 @@ function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
trn1 v4.4s, v20.4s, v21.4s trn1 v4.4s, v20.4s, v21.4s
trn2 v5.4s, v20.4s, v21.4s trn2 v5.4s, v20.4s, v21.4s
trn1 v6.4s, v22.4s, v23.4s trn1 v6.4s, v22.4s, v23.4s
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
usdot v16.4s, v4.16b, v30.16b usdot v16.4s, v4.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b usdot v18.4s, v6.16b, v30.16b
@ -788,10 +788,10 @@ function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
zip2 v22.4s, v0.4s, v6.4s zip2 v22.4s, v0.4s, v6.4s
zip1 v21.4s, v5.4s, v7.4s zip1 v21.4s, v5.4s, v7.4s
zip2 v23.4s, v5.4s, v7.4s zip2 v23.4s, v5.4s, v7.4s
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
usdot v16.4s, v20.16b, v30.16b usdot v16.4s, v20.16b, v30.16b
usdot v17.4s, v21.16b, v30.16b usdot v17.4s, v21.16b, v30.16b
usdot v18.4s, v22.16b, v30.16b usdot v18.4s, v22.16b, v30.16b
@ -815,14 +815,14 @@ function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
ext v26.16b, v1.16b, v1.16b, #1 ext v26.16b, v1.16b, v1.16b, #1
ext v27.16b, v1.16b, v1.16b, #2 ext v27.16b, v1.16b, v1.16b, #2
ext v28.16b, v1.16b, v1.16b, #3 ext v28.16b, v1.16b, v1.16b, #3
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v16.4s, v0.16b, v30.16b usdot v16.4s, v0.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b usdot v18.4s, v6.16b, v30.16b
@ -861,14 +861,14 @@ function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
ext v26.16b, v1.16b, v2.16b, #1 ext v26.16b, v1.16b, v2.16b, #1
ext v27.16b, v1.16b, v2.16b, #2 ext v27.16b, v1.16b, v2.16b, #2
ext v28.16b, v1.16b, v2.16b, #3 ext v28.16b, v1.16b, v2.16b, #3
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v16.4s, v0.16b, v30.16b usdot v16.4s, v0.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b usdot v18.4s, v6.16b, v30.16b
@ -900,18 +900,18 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
ext v16.16b, v1.16b, v2.16b, #1 ext v16.16b, v1.16b, v2.16b, #1
ext v17.16b, v1.16b, v2.16b, #2 ext v17.16b, v1.16b, v2.16b, #2
ext v18.16b, v1.16b, v2.16b, #3 ext v18.16b, v1.16b, v2.16b, #3
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v20.4s, v0.16b, v30.16b usdot v20.4s, v0.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b usdot v23.4s, v6.16b, v30.16b
movi v24.2d, #0 movi v24.16b, #0
movi v25.2d, #0 movi v25.16b, #0
movi v26.2d, #0 movi v26.16b, #0
movi v27.2d, #0 movi v27.16b, #0
usdot v24.4s, v1.16b, v30.16b usdot v24.4s, v1.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b usdot v26.4s, v17.16b, v30.16b
@ -928,10 +928,10 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
ext v4.16b, v2.16b, v3.16b, #1 ext v4.16b, v2.16b, v3.16b, #1
ext v5.16b, v2.16b, v3.16b, #2 ext v5.16b, v2.16b, v3.16b, #2
ext v6.16b, v2.16b, v3.16b, #3 ext v6.16b, v2.16b, v3.16b, #3
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v20.4s, v2.16b, v30.16b usdot v20.4s, v2.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b usdot v22.4s, v5.16b, v30.16b
@ -957,18 +957,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ext v16.16b, v1.16b, v2.16b, #1 ext v16.16b, v1.16b, v2.16b, #1
ext v17.16b, v1.16b, v2.16b, #2 ext v17.16b, v1.16b, v2.16b, #2
ext v18.16b, v1.16b, v2.16b, #3 ext v18.16b, v1.16b, v2.16b, #3
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v20.4s, v0.16b, v30.16b usdot v20.4s, v0.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b usdot v23.4s, v6.16b, v30.16b
movi v24.2d, #0 movi v24.16b, #0
movi v25.2d, #0 movi v25.16b, #0
movi v26.2d, #0 movi v26.16b, #0
movi v27.2d, #0 movi v27.16b, #0
usdot v24.4s, v1.16b, v30.16b usdot v24.4s, v1.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b usdot v26.4s, v17.16b, v30.16b
@ -989,18 +989,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ext v16.16b, v3.16b, v7.16b, #1 ext v16.16b, v3.16b, v7.16b, #1
ext v17.16b, v3.16b, v7.16b, #2 ext v17.16b, v3.16b, v7.16b, #2
ext v18.16b, v3.16b, v7.16b, #3 ext v18.16b, v3.16b, v7.16b, #3
movi v20.2d, #0 movi v20.16b, #0
movi v21.2d, #0 movi v21.16b, #0
movi v22.2d, #0 movi v22.16b, #0
movi v23.2d, #0 movi v23.16b, #0
usdot v20.4s, v2.16b, v30.16b usdot v20.4s, v2.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b usdot v23.4s, v6.16b, v30.16b
movi v24.2d, #0 movi v24.16b, #0
movi v25.2d, #0 movi v25.16b, #0
movi v26.2d, #0 movi v26.16b, #0
movi v27.2d, #0 movi v27.16b, #0
usdot v24.4s, v3.16b, v30.16b usdot v24.4s, v3.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b usdot v26.4s, v17.16b, v30.16b
@ -1593,7 +1593,7 @@ function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1
trn1 v0.2s, v0.2s, v2.2s trn1 v0.2s, v0.2s, v2.2s
trn1 v1.2s, v1.2s, v3.2s trn1 v1.2s, v1.2s, v3.2s
zip1 v0.4s, v0.4s, v1.4s zip1 v0.4s, v0.4s, v1.4s
movi v16.2d, #0 movi v16.16b, #0
usdot v16.4s, v0.16b, v28.16b usdot v16.4s, v0.16b, v28.16b
mul v16.4s, v16.4s, v30.4s mul v16.4s, v16.4s, v30.4s
sqrshl v16.4s, v16.4s, v31.4s sqrshl v16.4s, v16.4s, v31.4s
@ -1620,8 +1620,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
trn2 v6.2s, v0.2s, v1.2s trn2 v6.2s, v0.2s, v1.2s
trn1 v5.2s, v2.2s, v3.2s trn1 v5.2s, v2.2s, v3.2s
zip1 v4.2d, v4.2d, v5.2d zip1 v4.2d, v4.2d, v5.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
usdot v16.4s, v4.16b, v28.16b usdot v16.4s, v4.16b, v28.16b
usdot v17.2s, v6.8b, v28.8b usdot v17.2s, v6.8b, v28.8b
mul v16.4s, v16.4s, v30.4s mul v16.4s, v16.4s, v30.4s
@ -1640,8 +1640,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
endfunc endfunc
.macro EPEL_UNI_W_H_CALC s0, s1, d0, d1 .macro EPEL_UNI_W_H_CALC s0, s1, d0, d1
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b usdot \d1\().4s, \s1\().16b, v28.16b
mul \d0\().4s, \d0\().4s, v30.4s mul \d0\().4s, \d0\().4s, v30.4s
@ -1687,7 +1687,7 @@ function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1
zip2 v7.4s, v1.4s, v3.4s zip2 v7.4s, v1.4s, v3.4s
zip1 v6.4s, v6.4s, v7.4s zip1 v6.4s, v6.4s, v7.4s
EPEL_UNI_W_H_CALC v4, v5, v16, v17 EPEL_UNI_W_H_CALC v4, v5, v16, v17
movi v18.2d, #0 movi v18.16b, #0
usdot v18.4s, v6.16b, v28.16b usdot v18.4s, v6.16b, v28.16b
mul v18.4s, v18.4s, v30.4s mul v18.4s, v18.4s, v30.4s
sqrshl v18.4s, v18.4s, v31.4s sqrshl v18.4s, v18.4s, v31.4s
@ -2575,7 +2575,7 @@ DISABLE_I8MM
.endm .endm
.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3 .macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
movi \d0\().2d, #0 movi \d0\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b umlsl \d0\().8h, \s0\().8b, v0.8b
umlal \d0\().8h, \s1\().8b, v1.8b umlal \d0\().8h, \s1\().8b, v1.8b
umlal \d0\().8h, \s2\().8b, v2.8b umlal \d0\().8h, \s2\().8b, v2.8b
@ -2626,7 +2626,7 @@ function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
endfunc endfunc
.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1 .macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
movi \d0\().2d, #0 movi \d0\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b umlsl \d0\().8h, \s0\().8b, v0.8b
umlal \d0\().8h, \s1\().8b, v1.8b umlal \d0\().8h, \s1\().8b, v1.8b
umlal \d0\().8h, \s2\().8b, v2.8b umlal \d0\().8h, \s2\().8b, v2.8b
@ -2720,8 +2720,8 @@ function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
endfunc endfunc
.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 .macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b umlsl \d0\().8h, \s0\().8b, v0.8b
umlsl2 \d1\().8h, \s0\().16b, v0.16b umlsl2 \d1\().8h, \s0\().16b, v0.16b
umlal \d0\().8h, \s1\().8b, v1.8b umlal \d0\().8h, \s1\().8b, v1.8b
@ -2793,8 +2793,8 @@ function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
endfunc endfunc
.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 .macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
umlsl \d0\().8h, \s0\().8b, v0.8b umlsl \d0\().8h, \s0\().8b, v0.8b
umlsl2 \d1\().8h, \s0\().16b, v0.16b umlsl2 \d1\().8h, \s0\().16b, v0.16b
umlal \d0\().8h, \s1\().8b, v1.8b umlal \d0\().8h, \s1\().8b, v1.8b

View File

@ -2180,8 +2180,8 @@ function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
ext v3.16b, v0.16b, v0.16b, #3 ext v3.16b, v0.16b, v0.16b, #3
zip1 v0.2d, v0.2d, v1.2d zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d zip1 v2.2d, v2.2d, v3.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
usdot v16.4s, v0.16b, v28.16b usdot v16.4s, v0.16b, v28.16b
usdot v17.4s, v2.16b, v28.16b usdot v17.4s, v2.16b, v28.16b
addp v16.4s, v16.4s, v17.4s addp v16.4s, v16.4s, v17.4s
@ -2210,9 +2210,9 @@ function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
zip1 v0.2d, v0.2d, v1.2d zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d zip1 v4.2d, v4.2d, v5.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
usdot v16.4s, v0.16b, v28.16b usdot v16.4s, v0.16b, v28.16b
usdot v17.4s, v2.16b, v28.16b usdot v17.4s, v2.16b, v28.16b
usdot v18.4s, v4.16b, v28.16b usdot v18.4s, v4.16b, v28.16b
@ -2236,10 +2236,10 @@ endfunc
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 .macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
movi \d2\().2d, #0 movi \d2\().16b, #0
movi \d3\().2d, #0 movi \d3\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b usdot \d1\().4s, \s1\().16b, v28.16b
usdot \d2\().4s, \s2\().16b, v28.16b usdot \d2\().4s, \s2\().16b, v28.16b
@ -2255,8 +2255,8 @@ endfunc
.endm .endm
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1 .macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
usdot \d0\().4s, \s0\().16b, v28.16b usdot \d0\().4s, \s0\().16b, v28.16b
usdot \d1\().4s, \s1\().16b, v28.16b usdot \d1\().4s, \s1\().16b, v28.16b
addp \d0\().4s, \d0\().4s, \d1\().4s addp \d0\().4s, \d0\().4s, \d1\().4s
@ -2606,8 +2606,8 @@ function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
ext v3.16b, v0.16b, v0.16b, #3 ext v3.16b, v0.16b, v0.16b, #3
zip1 v0.2d, v0.2d, v1.2d zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d zip1 v2.2d, v2.2d, v3.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
usdot v16.4s, v0.16b, v31.16b usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b usdot v17.4s, v2.16b, v31.16b
addp v16.4s, v16.4s, v17.4s addp v16.4s, v16.4s, v17.4s
@ -2633,9 +2633,9 @@ function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
zip1 v0.2d, v0.2d, v1.2d zip1 v0.2d, v0.2d, v1.2d
zip1 v2.2d, v2.2d, v3.2d zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d zip1 v4.2d, v4.2d, v5.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
usdot v16.4s, v0.16b, v31.16b usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b usdot v17.4s, v2.16b, v31.16b
usdot v18.4s, v4.16b, v31.16b usdot v18.4s, v4.16b, v31.16b
@ -2668,10 +2668,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
zip1 v2.2d, v2.2d, v3.2d zip1 v2.2d, v2.2d, v3.2d
zip1 v4.2d, v4.2d, v5.2d zip1 v4.2d, v4.2d, v5.2d
zip1 v6.2d, v6.2d, v7.2d zip1 v6.2d, v6.2d, v7.2d
movi v16.2d, #0 movi v16.16b, #0
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
usdot v16.4s, v0.16b, v31.16b usdot v16.4s, v0.16b, v31.16b
usdot v17.4s, v2.16b, v31.16b usdot v17.4s, v2.16b, v31.16b
usdot v18.4s, v4.16b, v31.16b usdot v18.4s, v4.16b, v31.16b
@ -2688,10 +2688,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
endfunc endfunc
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 .macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
movi \d0\().2d, #0 movi \d0\().16b, #0
movi \d1\().2d, #0 movi \d1\().16b, #0
movi \d2\().2d, #0 movi \d2\().16b, #0
movi \d3\().2d, #0 movi \d3\().16b, #0
usdot \d0\().4s, \s0\().16b, v31.16b usdot \d0\().4s, \s0\().16b, v31.16b
usdot \d1\().4s, \s1\().16b, v31.16b usdot \d1\().4s, \s1\().16b, v31.16b
usdot \d2\().4s, \s2\().16b, v31.16b usdot \d2\().4s, \s2\().16b, v31.16b
@ -2716,8 +2716,8 @@ function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23 QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
addp v20.4s, v20.4s, v22.4s addp v20.4s, v20.4s, v22.4s
addp v21.4s, v21.4s, v23.4s addp v21.4s, v21.4s, v23.4s
movi v24.2d, #0 movi v24.16b, #0
movi v25.2d, #0 movi v25.16b, #0
usdot v24.4s, v18.16b, v31.16b usdot v24.4s, v18.16b, v31.16b
usdot v25.4s, v19.16b, v31.16b usdot v25.4s, v19.16b, v31.16b
addp v24.4s, v24.4s, v25.4s addp v24.4s, v24.4s, v25.4s

View File

@ -56,7 +56,7 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
.ifc \type, fixed .ifc \type, fixed
ld1r {v16.2s}, [x2] // dither_state ld1r {v16.2s}, [x2] // dither_state
sxtl v16.2d, v16.2s sxtl v16.2d, v16.2s
movi v29.2d, #0 movi v29.16b, #0
movi v30.2d, #(1<<OUT_SHIFT)-1 movi v30.2d, #(1<<OUT_SHIFT)-1
trn1 v31.2d, v29.2d, v30.2d trn1 v31.2d, v29.2d, v30.2d
trn2 v30.2d, v30.2d, v29.2d trn2 v30.2d, v30.2d, v29.2d
@ -74,9 +74,9 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
add x11, x6, #(32)<<2 // w + 32 add x11, x6, #(32)<<2 // w + 32
add x12, x7, #(32)<<2 // w2 + 32 add x12, x7, #(32)<<2 // w2 + 32
mov x15, #8 mov x15, #8
movi v17.2d, #0 movi v17.16b, #0
movi v18.2d, #0 movi v18.16b, #0
movi v19.2d, #0 movi v19.16b, #0
2: 2:
subs x15, x15, #1 subs x15, x15, #1
ld1 {v0.4s}, [x8], x9 ld1 {v0.4s}, [x8], x9

View File

@ -50,10 +50,10 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2d, #0 // val sum part 1 (for dst[0]) movi v0.16b, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1]) movi v1.16b, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2]) movi v2.16b, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3]) movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0] add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w0, uxtw // srcp + filterPos[1] add x8, x3, w0, uxtw // srcp + filterPos[1]
add x0, x3, w11, uxtw // srcp + filterPos[2] add x0, x3, w11, uxtw // srcp + filterPos[2]
@ -108,10 +108,10 @@ function ff_hscale8to15_X4_neon, export=1
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1] ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3] ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
movi v16.2d, #0 // initialize accumulator for idx + 0 movi v16.16b, #0 // initialize accumulator for idx + 0
movi v17.2d, #0 // initialize accumulator for idx + 1 movi v17.16b, #0 // initialize accumulator for idx + 1
movi v18.2d, #0 // initialize accumulator for idx + 2 movi v18.16b, #0 // initialize accumulator for idx + 2
movi v19.2d, #0 // initialize accumulator for idx + 3 movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter pointer for idx + 0 mov x12, x4 // filter pointer for idx + 0
add x13, x4, x7 // filter pointer for idx + 1 add x13, x4, x7 // filter pointer for idx + 1
@ -253,8 +253,8 @@ function ff_hscale8to15_4_neon, export=1
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
movi v0.2d, #0 // Clear madd accumulator for idx 0..3 movi v0.16b, #0 // Clear madd accumulator for idx 0..3
movi v5.2d, #0 // Clear madd accumulator for idx 4..7 movi v5.16b, #0 // Clear madd accumulator for idx 4..7
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
@ -299,8 +299,8 @@ function ff_hscale8to15_4_neon, export=1
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
movi v0.2d, #0 // Clear madd accumulator for idx 0..3 movi v0.16b, #0 // Clear madd accumulator for idx 0..3
movi v5.2d, #0 // Clear madd accumulator for idx 4..7 movi v5.16b, #0 // Clear madd accumulator for idx 4..7
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
@ -499,10 +499,10 @@ function ff_hscale8to19_X8_neon, export=1
ldr w11, [x5], #4 // filterPos[idx + 2] ldr w11, [x5], #4 // filterPos[idx + 2]
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
ldr w9, [x5], #4 // filterPos[idx + 3] ldr w9, [x5], #4 // filterPos[idx + 3]
movi v0.2d, #0 // val sum part 1 (for dst[0]) movi v0.16b, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1]) movi v1.16b, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2]) movi v2.16b, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3]) movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0] add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w0, uxtw // srcp + filterPos[1] add x8, x3, w0, uxtw // srcp + filterPos[1]
add x0, x3, w11, uxtw // srcp + filterPos[2] add x0, x3, w11, uxtw // srcp + filterPos[2]
@ -560,10 +560,10 @@ function ff_hscale8to19_X4_neon, export=1
ldp w8, w9, [x5] ldp w8, w9, [x5]
ldp w10, w11, [x5, #8] ldp w10, w11, [x5, #8]
movi v16.2d, #0 // initialize accumulator for idx + 0 movi v16.16b, #0 // initialize accumulator for idx + 0
movi v17.2d, #0 // initialize accumulator for idx + 1 movi v17.16b, #0 // initialize accumulator for idx + 1
movi v18.2d, #0 // initialize accumulator for idx + 2 movi v18.16b, #0 // initialize accumulator for idx + 2
movi v19.2d, #0 // initialize accumulator for idx + 3 movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0 mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1 add x13, x4, x7 // filter + 1
@ -865,10 +865,10 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2d, #0 // val sum part 1 (for dst[0]) movi v0.16b, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1]) movi v1.16b, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2]) movi v2.16b, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3]) movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0] add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w10, uxtw // srcp + filterPos[1] add x8, x3, w10, uxtw // srcp + filterPos[1]
add x10, x3, w11, uxtw // srcp + filterPos[2] add x10, x3, w11, uxtw // srcp + filterPos[2]
@ -945,10 +945,10 @@ function ff_hscale16to15_X4_neon_asm, export=1
ldp w8, w9, [x5] ldp w8, w9, [x5]
ldp w10, w11, [x5, #8] ldp w10, w11, [x5, #8]
movi v16.2d, #0 // initialize accumulator for idx + 0 movi v16.16b, #0 // initialize accumulator for idx + 0
movi v17.2d, #0 // initialize accumulator for idx + 1 movi v17.16b, #0 // initialize accumulator for idx + 1
movi v18.2d, #0 // initialize accumulator for idx + 2 movi v18.16b, #0 // initialize accumulator for idx + 2
movi v19.2d, #0 // initialize accumulator for idx + 3 movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0 mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1 add x13, x4, x7 // filter + 1
@ -1270,10 +1270,10 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1 lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2d, #0 // val sum part 1 (for dst[0]) movi v0.16b, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1]) movi v1.16b, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2]) movi v2.16b, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3]) movi v3.16b, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, uxtw // srcp + filterPos[0] add x17, x3, w8, uxtw // srcp + filterPos[0]
add x8, x3, w10, uxtw // srcp + filterPos[1] add x8, x3, w10, uxtw // srcp + filterPos[1]
add x10, x3, w11, uxtw // srcp + filterPos[2] add x10, x3, w11, uxtw // srcp + filterPos[2]
@ -1348,10 +1348,10 @@ function ff_hscale16to19_X4_neon_asm, export=1
ldp w8, w9, [x5] ldp w8, w9, [x5]
ldp w10, w11, [x5, #8] ldp w10, w11, [x5, #8]
movi v16.2d, #0 // initialize accumulator for idx + 0 movi v16.16b, #0 // initialize accumulator for idx + 0
movi v17.2d, #0 // initialize accumulator for idx + 1 movi v17.16b, #0 // initialize accumulator for idx + 1
movi v18.2d, #0 // initialize accumulator for idx + 2 movi v18.16b, #0 // initialize accumulator for idx + 2
movi v19.2d, #0 // initialize accumulator for idx + 3 movi v19.16b, #0 // initialize accumulator for idx + 3
mov x12, x4 // filter + 0 mov x12, x4 // filter + 0
add x13, x4, x7 // filter + 1 add x13, x4, x7 // filter + 1