mirror of https://git.ffmpeg.org/ffmpeg.git
lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
Building iOS platform with arm64, the compiler has a warning: "instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to movi.16b" Signed-off-by: xufuji456 <839789740@qq.com> Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
67ce690bc6
commit
cc86343b96
|
@ -694,7 +694,7 @@ function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
|
||||||
trn1 v4.2s, v4.2s, v5.2s
|
trn1 v4.2s, v4.2s, v5.2s
|
||||||
trn1 v6.2s, v6.2s, v7.2s
|
trn1 v6.2s, v6.2s, v7.2s
|
||||||
trn1 v4.2d, v4.2d, v6.2d
|
trn1 v4.2d, v4.2d, v6.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
usdot v16.4s, v4.16b, v30.16b
|
usdot v16.4s, v4.16b, v30.16b
|
||||||
xtn v16.4h, v16.4s
|
xtn v16.4h, v16.4s
|
||||||
st1 {v16.4h}, [x0], x10
|
st1 {v16.4h}, [x0], x10
|
||||||
|
@ -714,8 +714,8 @@ function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
|
||||||
trn2 v17.2s, v4.2s, v5.2s
|
trn2 v17.2s, v4.2s, v5.2s
|
||||||
trn1 v6.2s, v6.2s, v7.2s
|
trn1 v6.2s, v6.2s, v7.2s
|
||||||
trn1 v16.2d, v16.2d, v6.2d
|
trn1 v16.2d, v16.2d, v6.2d
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
usdot v18.4s, v16.16b, v30.16b
|
usdot v18.4s, v16.16b, v30.16b
|
||||||
usdot v19.2s, v17.8b, v30.8b
|
usdot v19.2s, v17.8b, v30.8b
|
||||||
xtn v18.4h, v18.4s
|
xtn v18.4h, v18.4s
|
||||||
|
@ -736,8 +736,8 @@ function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
|
||||||
ext v7.16b, v4.16b, v4.16b, #3
|
ext v7.16b, v4.16b, v4.16b, #3
|
||||||
zip1 v20.4s, v4.4s, v6.4s
|
zip1 v20.4s, v4.4s, v6.4s
|
||||||
zip1 v21.4s, v5.4s, v7.4s
|
zip1 v21.4s, v5.4s, v7.4s
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
usdot v16.4s, v20.16b, v30.16b
|
usdot v16.4s, v20.16b, v30.16b
|
||||||
usdot v17.4s, v21.16b, v30.16b
|
usdot v17.4s, v21.16b, v30.16b
|
||||||
xtn v16.4h, v16.4s
|
xtn v16.4h, v16.4s
|
||||||
|
@ -761,9 +761,9 @@ function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
|
||||||
trn1 v4.4s, v20.4s, v21.4s
|
trn1 v4.4s, v20.4s, v21.4s
|
||||||
trn2 v5.4s, v20.4s, v21.4s
|
trn2 v5.4s, v20.4s, v21.4s
|
||||||
trn1 v6.4s, v22.4s, v23.4s
|
trn1 v6.4s, v22.4s, v23.4s
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
usdot v16.4s, v4.16b, v30.16b
|
usdot v16.4s, v4.16b, v30.16b
|
||||||
usdot v17.4s, v5.16b, v30.16b
|
usdot v17.4s, v5.16b, v30.16b
|
||||||
usdot v18.4s, v6.16b, v30.16b
|
usdot v18.4s, v6.16b, v30.16b
|
||||||
|
@ -788,10 +788,10 @@ function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
|
||||||
zip2 v22.4s, v0.4s, v6.4s
|
zip2 v22.4s, v0.4s, v6.4s
|
||||||
zip1 v21.4s, v5.4s, v7.4s
|
zip1 v21.4s, v5.4s, v7.4s
|
||||||
zip2 v23.4s, v5.4s, v7.4s
|
zip2 v23.4s, v5.4s, v7.4s
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
usdot v16.4s, v20.16b, v30.16b
|
usdot v16.4s, v20.16b, v30.16b
|
||||||
usdot v17.4s, v21.16b, v30.16b
|
usdot v17.4s, v21.16b, v30.16b
|
||||||
usdot v18.4s, v22.16b, v30.16b
|
usdot v18.4s, v22.16b, v30.16b
|
||||||
|
@ -815,14 +815,14 @@ function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
|
||||||
ext v26.16b, v1.16b, v1.16b, #1
|
ext v26.16b, v1.16b, v1.16b, #1
|
||||||
ext v27.16b, v1.16b, v1.16b, #2
|
ext v27.16b, v1.16b, v1.16b, #2
|
||||||
ext v28.16b, v1.16b, v1.16b, #3
|
ext v28.16b, v1.16b, v1.16b, #3
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v16.4s, v0.16b, v30.16b
|
usdot v16.4s, v0.16b, v30.16b
|
||||||
usdot v17.4s, v5.16b, v30.16b
|
usdot v17.4s, v5.16b, v30.16b
|
||||||
usdot v18.4s, v6.16b, v30.16b
|
usdot v18.4s, v6.16b, v30.16b
|
||||||
|
@ -861,14 +861,14 @@ function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
|
||||||
ext v26.16b, v1.16b, v2.16b, #1
|
ext v26.16b, v1.16b, v2.16b, #1
|
||||||
ext v27.16b, v1.16b, v2.16b, #2
|
ext v27.16b, v1.16b, v2.16b, #2
|
||||||
ext v28.16b, v1.16b, v2.16b, #3
|
ext v28.16b, v1.16b, v2.16b, #3
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v16.4s, v0.16b, v30.16b
|
usdot v16.4s, v0.16b, v30.16b
|
||||||
usdot v17.4s, v5.16b, v30.16b
|
usdot v17.4s, v5.16b, v30.16b
|
||||||
usdot v18.4s, v6.16b, v30.16b
|
usdot v18.4s, v6.16b, v30.16b
|
||||||
|
@ -900,18 +900,18 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
|
||||||
ext v16.16b, v1.16b, v2.16b, #1
|
ext v16.16b, v1.16b, v2.16b, #1
|
||||||
ext v17.16b, v1.16b, v2.16b, #2
|
ext v17.16b, v1.16b, v2.16b, #2
|
||||||
ext v18.16b, v1.16b, v2.16b, #3
|
ext v18.16b, v1.16b, v2.16b, #3
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v20.4s, v0.16b, v30.16b
|
usdot v20.4s, v0.16b, v30.16b
|
||||||
usdot v21.4s, v4.16b, v30.16b
|
usdot v21.4s, v4.16b, v30.16b
|
||||||
usdot v22.4s, v5.16b, v30.16b
|
usdot v22.4s, v5.16b, v30.16b
|
||||||
usdot v23.4s, v6.16b, v30.16b
|
usdot v23.4s, v6.16b, v30.16b
|
||||||
movi v24.2d, #0
|
movi v24.16b, #0
|
||||||
movi v25.2d, #0
|
movi v25.16b, #0
|
||||||
movi v26.2d, #0
|
movi v26.16b, #0
|
||||||
movi v27.2d, #0
|
movi v27.16b, #0
|
||||||
usdot v24.4s, v1.16b, v30.16b
|
usdot v24.4s, v1.16b, v30.16b
|
||||||
usdot v25.4s, v16.16b, v30.16b
|
usdot v25.4s, v16.16b, v30.16b
|
||||||
usdot v26.4s, v17.16b, v30.16b
|
usdot v26.4s, v17.16b, v30.16b
|
||||||
|
@ -928,10 +928,10 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
|
||||||
ext v4.16b, v2.16b, v3.16b, #1
|
ext v4.16b, v2.16b, v3.16b, #1
|
||||||
ext v5.16b, v2.16b, v3.16b, #2
|
ext v5.16b, v2.16b, v3.16b, #2
|
||||||
ext v6.16b, v2.16b, v3.16b, #3
|
ext v6.16b, v2.16b, v3.16b, #3
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v20.4s, v2.16b, v30.16b
|
usdot v20.4s, v2.16b, v30.16b
|
||||||
usdot v21.4s, v4.16b, v30.16b
|
usdot v21.4s, v4.16b, v30.16b
|
||||||
usdot v22.4s, v5.16b, v30.16b
|
usdot v22.4s, v5.16b, v30.16b
|
||||||
|
@ -957,18 +957,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
|
||||||
ext v16.16b, v1.16b, v2.16b, #1
|
ext v16.16b, v1.16b, v2.16b, #1
|
||||||
ext v17.16b, v1.16b, v2.16b, #2
|
ext v17.16b, v1.16b, v2.16b, #2
|
||||||
ext v18.16b, v1.16b, v2.16b, #3
|
ext v18.16b, v1.16b, v2.16b, #3
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v20.4s, v0.16b, v30.16b
|
usdot v20.4s, v0.16b, v30.16b
|
||||||
usdot v21.4s, v4.16b, v30.16b
|
usdot v21.4s, v4.16b, v30.16b
|
||||||
usdot v22.4s, v5.16b, v30.16b
|
usdot v22.4s, v5.16b, v30.16b
|
||||||
usdot v23.4s, v6.16b, v30.16b
|
usdot v23.4s, v6.16b, v30.16b
|
||||||
movi v24.2d, #0
|
movi v24.16b, #0
|
||||||
movi v25.2d, #0
|
movi v25.16b, #0
|
||||||
movi v26.2d, #0
|
movi v26.16b, #0
|
||||||
movi v27.2d, #0
|
movi v27.16b, #0
|
||||||
usdot v24.4s, v1.16b, v30.16b
|
usdot v24.4s, v1.16b, v30.16b
|
||||||
usdot v25.4s, v16.16b, v30.16b
|
usdot v25.4s, v16.16b, v30.16b
|
||||||
usdot v26.4s, v17.16b, v30.16b
|
usdot v26.4s, v17.16b, v30.16b
|
||||||
|
@ -989,18 +989,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
|
||||||
ext v16.16b, v3.16b, v7.16b, #1
|
ext v16.16b, v3.16b, v7.16b, #1
|
||||||
ext v17.16b, v3.16b, v7.16b, #2
|
ext v17.16b, v3.16b, v7.16b, #2
|
||||||
ext v18.16b, v3.16b, v7.16b, #3
|
ext v18.16b, v3.16b, v7.16b, #3
|
||||||
movi v20.2d, #0
|
movi v20.16b, #0
|
||||||
movi v21.2d, #0
|
movi v21.16b, #0
|
||||||
movi v22.2d, #0
|
movi v22.16b, #0
|
||||||
movi v23.2d, #0
|
movi v23.16b, #0
|
||||||
usdot v20.4s, v2.16b, v30.16b
|
usdot v20.4s, v2.16b, v30.16b
|
||||||
usdot v21.4s, v4.16b, v30.16b
|
usdot v21.4s, v4.16b, v30.16b
|
||||||
usdot v22.4s, v5.16b, v30.16b
|
usdot v22.4s, v5.16b, v30.16b
|
||||||
usdot v23.4s, v6.16b, v30.16b
|
usdot v23.4s, v6.16b, v30.16b
|
||||||
movi v24.2d, #0
|
movi v24.16b, #0
|
||||||
movi v25.2d, #0
|
movi v25.16b, #0
|
||||||
movi v26.2d, #0
|
movi v26.16b, #0
|
||||||
movi v27.2d, #0
|
movi v27.16b, #0
|
||||||
usdot v24.4s, v3.16b, v30.16b
|
usdot v24.4s, v3.16b, v30.16b
|
||||||
usdot v25.4s, v16.16b, v30.16b
|
usdot v25.4s, v16.16b, v30.16b
|
||||||
usdot v26.4s, v17.16b, v30.16b
|
usdot v26.4s, v17.16b, v30.16b
|
||||||
|
@ -1593,7 +1593,7 @@ function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1
|
||||||
trn1 v0.2s, v0.2s, v2.2s
|
trn1 v0.2s, v0.2s, v2.2s
|
||||||
trn1 v1.2s, v1.2s, v3.2s
|
trn1 v1.2s, v1.2s, v3.2s
|
||||||
zip1 v0.4s, v0.4s, v1.4s
|
zip1 v0.4s, v0.4s, v1.4s
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
usdot v16.4s, v0.16b, v28.16b
|
usdot v16.4s, v0.16b, v28.16b
|
||||||
mul v16.4s, v16.4s, v30.4s
|
mul v16.4s, v16.4s, v30.4s
|
||||||
sqrshl v16.4s, v16.4s, v31.4s
|
sqrshl v16.4s, v16.4s, v31.4s
|
||||||
|
@ -1620,8 +1620,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
|
||||||
trn2 v6.2s, v0.2s, v1.2s
|
trn2 v6.2s, v0.2s, v1.2s
|
||||||
trn1 v5.2s, v2.2s, v3.2s
|
trn1 v5.2s, v2.2s, v3.2s
|
||||||
zip1 v4.2d, v4.2d, v5.2d
|
zip1 v4.2d, v4.2d, v5.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
usdot v16.4s, v4.16b, v28.16b
|
usdot v16.4s, v4.16b, v28.16b
|
||||||
usdot v17.2s, v6.8b, v28.8b
|
usdot v17.2s, v6.8b, v28.8b
|
||||||
mul v16.4s, v16.4s, v30.4s
|
mul v16.4s, v16.4s, v30.4s
|
||||||
|
@ -1640,8 +1640,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro EPEL_UNI_W_H_CALC s0, s1, d0, d1
|
.macro EPEL_UNI_W_H_CALC s0, s1, d0, d1
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
usdot \d0\().4s, \s0\().16b, v28.16b
|
usdot \d0\().4s, \s0\().16b, v28.16b
|
||||||
usdot \d1\().4s, \s1\().16b, v28.16b
|
usdot \d1\().4s, \s1\().16b, v28.16b
|
||||||
mul \d0\().4s, \d0\().4s, v30.4s
|
mul \d0\().4s, \d0\().4s, v30.4s
|
||||||
|
@ -1687,7 +1687,7 @@ function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1
|
||||||
zip2 v7.4s, v1.4s, v3.4s
|
zip2 v7.4s, v1.4s, v3.4s
|
||||||
zip1 v6.4s, v6.4s, v7.4s
|
zip1 v6.4s, v6.4s, v7.4s
|
||||||
EPEL_UNI_W_H_CALC v4, v5, v16, v17
|
EPEL_UNI_W_H_CALC v4, v5, v16, v17
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
usdot v18.4s, v6.16b, v28.16b
|
usdot v18.4s, v6.16b, v28.16b
|
||||||
mul v18.4s, v18.4s, v30.4s
|
mul v18.4s, v18.4s, v30.4s
|
||||||
sqrshl v18.4s, v18.4s, v31.4s
|
sqrshl v18.4s, v18.4s, v31.4s
|
||||||
|
@ -2575,7 +2575,7 @@ DISABLE_I8MM
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
|
.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
umlsl \d0\().8h, \s0\().8b, v0.8b
|
umlsl \d0\().8h, \s0\().8b, v0.8b
|
||||||
umlal \d0\().8h, \s1\().8b, v1.8b
|
umlal \d0\().8h, \s1\().8b, v1.8b
|
||||||
umlal \d0\().8h, \s2\().8b, v2.8b
|
umlal \d0\().8h, \s2\().8b, v2.8b
|
||||||
|
@ -2626,7 +2626,7 @@ function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
|
.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
umlsl \d0\().8h, \s0\().8b, v0.8b
|
umlsl \d0\().8h, \s0\().8b, v0.8b
|
||||||
umlal \d0\().8h, \s1\().8b, v1.8b
|
umlal \d0\().8h, \s1\().8b, v1.8b
|
||||||
umlal \d0\().8h, \s2\().8b, v2.8b
|
umlal \d0\().8h, \s2\().8b, v2.8b
|
||||||
|
@ -2720,8 +2720,8 @@ function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
|
.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
umlsl \d0\().8h, \s0\().8b, v0.8b
|
umlsl \d0\().8h, \s0\().8b, v0.8b
|
||||||
umlsl2 \d1\().8h, \s0\().16b, v0.16b
|
umlsl2 \d1\().8h, \s0\().16b, v0.16b
|
||||||
umlal \d0\().8h, \s1\().8b, v1.8b
|
umlal \d0\().8h, \s1\().8b, v1.8b
|
||||||
|
@ -2793,8 +2793,8 @@ function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
|
.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
umlsl \d0\().8h, \s0\().8b, v0.8b
|
umlsl \d0\().8h, \s0\().8b, v0.8b
|
||||||
umlsl2 \d1\().8h, \s0\().16b, v0.16b
|
umlsl2 \d1\().8h, \s0\().16b, v0.16b
|
||||||
umlal \d0\().8h, \s1\().8b, v1.8b
|
umlal \d0\().8h, \s1\().8b, v1.8b
|
||||||
|
|
|
@ -2180,8 +2180,8 @@ function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
|
||||||
ext v3.16b, v0.16b, v0.16b, #3
|
ext v3.16b, v0.16b, v0.16b, #3
|
||||||
zip1 v0.2d, v0.2d, v1.2d
|
zip1 v0.2d, v0.2d, v1.2d
|
||||||
zip1 v2.2d, v2.2d, v3.2d
|
zip1 v2.2d, v2.2d, v3.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
usdot v16.4s, v0.16b, v28.16b
|
usdot v16.4s, v0.16b, v28.16b
|
||||||
usdot v17.4s, v2.16b, v28.16b
|
usdot v17.4s, v2.16b, v28.16b
|
||||||
addp v16.4s, v16.4s, v17.4s
|
addp v16.4s, v16.4s, v17.4s
|
||||||
|
@ -2210,9 +2210,9 @@ function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
|
||||||
zip1 v0.2d, v0.2d, v1.2d
|
zip1 v0.2d, v0.2d, v1.2d
|
||||||
zip1 v2.2d, v2.2d, v3.2d
|
zip1 v2.2d, v2.2d, v3.2d
|
||||||
zip1 v4.2d, v4.2d, v5.2d
|
zip1 v4.2d, v4.2d, v5.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
usdot v16.4s, v0.16b, v28.16b
|
usdot v16.4s, v0.16b, v28.16b
|
||||||
usdot v17.4s, v2.16b, v28.16b
|
usdot v17.4s, v2.16b, v28.16b
|
||||||
usdot v18.4s, v4.16b, v28.16b
|
usdot v18.4s, v4.16b, v28.16b
|
||||||
|
@ -2236,10 +2236,10 @@ endfunc
|
||||||
|
|
||||||
|
|
||||||
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
|
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
movi \d2\().2d, #0
|
movi \d2\().16b, #0
|
||||||
movi \d3\().2d, #0
|
movi \d3\().16b, #0
|
||||||
usdot \d0\().4s, \s0\().16b, v28.16b
|
usdot \d0\().4s, \s0\().16b, v28.16b
|
||||||
usdot \d1\().4s, \s1\().16b, v28.16b
|
usdot \d1\().4s, \s1\().16b, v28.16b
|
||||||
usdot \d2\().4s, \s2\().16b, v28.16b
|
usdot \d2\().4s, \s2\().16b, v28.16b
|
||||||
|
@ -2255,8 +2255,8 @@ endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
|
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
usdot \d0\().4s, \s0\().16b, v28.16b
|
usdot \d0\().4s, \s0\().16b, v28.16b
|
||||||
usdot \d1\().4s, \s1\().16b, v28.16b
|
usdot \d1\().4s, \s1\().16b, v28.16b
|
||||||
addp \d0\().4s, \d0\().4s, \d1\().4s
|
addp \d0\().4s, \d0\().4s, \d1\().4s
|
||||||
|
@ -2606,8 +2606,8 @@ function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
|
||||||
ext v3.16b, v0.16b, v0.16b, #3
|
ext v3.16b, v0.16b, v0.16b, #3
|
||||||
zip1 v0.2d, v0.2d, v1.2d
|
zip1 v0.2d, v0.2d, v1.2d
|
||||||
zip1 v2.2d, v2.2d, v3.2d
|
zip1 v2.2d, v2.2d, v3.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
usdot v16.4s, v0.16b, v31.16b
|
usdot v16.4s, v0.16b, v31.16b
|
||||||
usdot v17.4s, v2.16b, v31.16b
|
usdot v17.4s, v2.16b, v31.16b
|
||||||
addp v16.4s, v16.4s, v17.4s
|
addp v16.4s, v16.4s, v17.4s
|
||||||
|
@ -2633,9 +2633,9 @@ function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
|
||||||
zip1 v0.2d, v0.2d, v1.2d
|
zip1 v0.2d, v0.2d, v1.2d
|
||||||
zip1 v2.2d, v2.2d, v3.2d
|
zip1 v2.2d, v2.2d, v3.2d
|
||||||
zip1 v4.2d, v4.2d, v5.2d
|
zip1 v4.2d, v4.2d, v5.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
usdot v16.4s, v0.16b, v31.16b
|
usdot v16.4s, v0.16b, v31.16b
|
||||||
usdot v17.4s, v2.16b, v31.16b
|
usdot v17.4s, v2.16b, v31.16b
|
||||||
usdot v18.4s, v4.16b, v31.16b
|
usdot v18.4s, v4.16b, v31.16b
|
||||||
|
@ -2668,10 +2668,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
|
||||||
zip1 v2.2d, v2.2d, v3.2d
|
zip1 v2.2d, v2.2d, v3.2d
|
||||||
zip1 v4.2d, v4.2d, v5.2d
|
zip1 v4.2d, v4.2d, v5.2d
|
||||||
zip1 v6.2d, v6.2d, v7.2d
|
zip1 v6.2d, v6.2d, v7.2d
|
||||||
movi v16.2d, #0
|
movi v16.16b, #0
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
usdot v16.4s, v0.16b, v31.16b
|
usdot v16.4s, v0.16b, v31.16b
|
||||||
usdot v17.4s, v2.16b, v31.16b
|
usdot v17.4s, v2.16b, v31.16b
|
||||||
usdot v18.4s, v4.16b, v31.16b
|
usdot v18.4s, v4.16b, v31.16b
|
||||||
|
@ -2688,10 +2688,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
|
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
|
||||||
movi \d0\().2d, #0
|
movi \d0\().16b, #0
|
||||||
movi \d1\().2d, #0
|
movi \d1\().16b, #0
|
||||||
movi \d2\().2d, #0
|
movi \d2\().16b, #0
|
||||||
movi \d3\().2d, #0
|
movi \d3\().16b, #0
|
||||||
usdot \d0\().4s, \s0\().16b, v31.16b
|
usdot \d0\().4s, \s0\().16b, v31.16b
|
||||||
usdot \d1\().4s, \s1\().16b, v31.16b
|
usdot \d1\().4s, \s1\().16b, v31.16b
|
||||||
usdot \d2\().4s, \s2\().16b, v31.16b
|
usdot \d2\().4s, \s2\().16b, v31.16b
|
||||||
|
@ -2716,8 +2716,8 @@ function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
|
||||||
QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
|
QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
|
||||||
addp v20.4s, v20.4s, v22.4s
|
addp v20.4s, v20.4s, v22.4s
|
||||||
addp v21.4s, v21.4s, v23.4s
|
addp v21.4s, v21.4s, v23.4s
|
||||||
movi v24.2d, #0
|
movi v24.16b, #0
|
||||||
movi v25.2d, #0
|
movi v25.16b, #0
|
||||||
usdot v24.4s, v18.16b, v31.16b
|
usdot v24.4s, v18.16b, v31.16b
|
||||||
usdot v25.4s, v19.16b, v31.16b
|
usdot v25.4s, v19.16b, v31.16b
|
||||||
addp v24.4s, v24.4s, v25.4s
|
addp v24.4s, v24.4s, v25.4s
|
||||||
|
|
|
@ -56,7 +56,7 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
|
||||||
.ifc \type, fixed
|
.ifc \type, fixed
|
||||||
ld1r {v16.2s}, [x2] // dither_state
|
ld1r {v16.2s}, [x2] // dither_state
|
||||||
sxtl v16.2d, v16.2s
|
sxtl v16.2d, v16.2s
|
||||||
movi v29.2d, #0
|
movi v29.16b, #0
|
||||||
movi v30.2d, #(1<<OUT_SHIFT)-1
|
movi v30.2d, #(1<<OUT_SHIFT)-1
|
||||||
trn1 v31.2d, v29.2d, v30.2d
|
trn1 v31.2d, v29.2d, v30.2d
|
||||||
trn2 v30.2d, v30.2d, v29.2d
|
trn2 v30.2d, v30.2d, v29.2d
|
||||||
|
@ -74,9 +74,9 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1
|
||||||
add x11, x6, #(32)<<2 // w + 32
|
add x11, x6, #(32)<<2 // w + 32
|
||||||
add x12, x7, #(32)<<2 // w2 + 32
|
add x12, x7, #(32)<<2 // w2 + 32
|
||||||
mov x15, #8
|
mov x15, #8
|
||||||
movi v17.2d, #0
|
movi v17.16b, #0
|
||||||
movi v18.2d, #0
|
movi v18.16b, #0
|
||||||
movi v19.2d, #0
|
movi v19.16b, #0
|
||||||
2:
|
2:
|
||||||
subs x15, x15, #1
|
subs x15, x15, #1
|
||||||
ld1 {v0.4s}, [x8], x9
|
ld1 {v0.4s}, [x8], x9
|
||||||
|
|
|
@ -50,10 +50,10 @@ function ff_hscale8to15_X8_neon, export=1
|
||||||
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
||||||
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
||||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
movi v0.16b, #0 // val sum part 1 (for dst[0])
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.16b, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.16b, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.16b, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
||||||
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
|
@ -108,10 +108,10 @@ function ff_hscale8to15_X4_neon, export=1
|
||||||
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
|
ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
|
||||||
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
|
ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
|
||||||
|
|
||||||
movi v16.2d, #0 // initialize accumulator for idx + 0
|
movi v16.16b, #0 // initialize accumulator for idx + 0
|
||||||
movi v17.2d, #0 // initialize accumulator for idx + 1
|
movi v17.16b, #0 // initialize accumulator for idx + 1
|
||||||
movi v18.2d, #0 // initialize accumulator for idx + 2
|
movi v18.16b, #0 // initialize accumulator for idx + 2
|
||||||
movi v19.2d, #0 // initialize accumulator for idx + 3
|
movi v19.16b, #0 // initialize accumulator for idx + 3
|
||||||
|
|
||||||
mov x12, x4 // filter pointer for idx + 0
|
mov x12, x4 // filter pointer for idx + 0
|
||||||
add x13, x4, x7 // filter pointer for idx + 1
|
add x13, x4, x7 // filter pointer for idx + 1
|
||||||
|
@ -253,8 +253,8 @@ function ff_hscale8to15_4_neon, export=1
|
||||||
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
|
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
|
||||||
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
|
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
|
||||||
|
|
||||||
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
|
movi v0.16b, #0 // Clear madd accumulator for idx 0..3
|
||||||
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
|
movi v5.16b, #0 // Clear madd accumulator for idx 4..7
|
||||||
|
|
||||||
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
||||||
|
|
||||||
|
@ -299,8 +299,8 @@ function ff_hscale8to15_4_neon, export=1
|
||||||
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
|
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
|
||||||
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
|
||||||
|
|
||||||
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
|
movi v0.16b, #0 // Clear madd accumulator for idx 0..3
|
||||||
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
|
movi v5.16b, #0 // Clear madd accumulator for idx 4..7
|
||||||
|
|
||||||
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
|
uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
|
||||||
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
|
uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
|
||||||
|
@ -499,10 +499,10 @@ function ff_hscale8to19_X8_neon, export=1
|
||||||
ldr w11, [x5], #4 // filterPos[idx + 2]
|
ldr w11, [x5], #4 // filterPos[idx + 2]
|
||||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||||
ldr w9, [x5], #4 // filterPos[idx + 3]
|
ldr w9, [x5], #4 // filterPos[idx + 3]
|
||||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
movi v0.16b, #0 // val sum part 1 (for dst[0])
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.16b, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.16b, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.16b, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
||||||
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
|
@ -560,10 +560,10 @@ function ff_hscale8to19_X4_neon, export=1
|
||||||
ldp w8, w9, [x5]
|
ldp w8, w9, [x5]
|
||||||
ldp w10, w11, [x5, #8]
|
ldp w10, w11, [x5, #8]
|
||||||
|
|
||||||
movi v16.2d, #0 // initialize accumulator for idx + 0
|
movi v16.16b, #0 // initialize accumulator for idx + 0
|
||||||
movi v17.2d, #0 // initialize accumulator for idx + 1
|
movi v17.16b, #0 // initialize accumulator for idx + 1
|
||||||
movi v18.2d, #0 // initialize accumulator for idx + 2
|
movi v18.16b, #0 // initialize accumulator for idx + 2
|
||||||
movi v19.2d, #0 // initialize accumulator for idx + 3
|
movi v19.16b, #0 // initialize accumulator for idx + 3
|
||||||
|
|
||||||
mov x12, x4 // filter + 0
|
mov x12, x4 // filter + 0
|
||||||
add x13, x4, x7 // filter + 1
|
add x13, x4, x7 // filter + 1
|
||||||
|
@ -865,10 +865,10 @@ function ff_hscale16to15_X8_neon_asm, export=1
|
||||||
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
add x12, x16, x7 // filter1 = filter0 + filterSize*2
|
||||||
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
||||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
movi v0.16b, #0 // val sum part 1 (for dst[0])
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.16b, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.16b, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.16b, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
||||||
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
|
@ -945,10 +945,10 @@ function ff_hscale16to15_X4_neon_asm, export=1
|
||||||
ldp w8, w9, [x5]
|
ldp w8, w9, [x5]
|
||||||
ldp w10, w11, [x5, #8]
|
ldp w10, w11, [x5, #8]
|
||||||
|
|
||||||
movi v16.2d, #0 // initialize accumulator for idx + 0
|
movi v16.16b, #0 // initialize accumulator for idx + 0
|
||||||
movi v17.2d, #0 // initialize accumulator for idx + 1
|
movi v17.16b, #0 // initialize accumulator for idx + 1
|
||||||
movi v18.2d, #0 // initialize accumulator for idx + 2
|
movi v18.16b, #0 // initialize accumulator for idx + 2
|
||||||
movi v19.2d, #0 // initialize accumulator for idx + 3
|
movi v19.16b, #0 // initialize accumulator for idx + 3
|
||||||
|
|
||||||
mov x12, x4 // filter + 0
|
mov x12, x4 // filter + 0
|
||||||
add x13, x4, x7 // filter + 1
|
add x13, x4, x7 // filter + 1
|
||||||
|
@ -1270,10 +1270,10 @@ function ff_hscale16to19_X8_neon_asm, export=1
|
||||||
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
add x13, x12, x7 // filter2 = filter1 + filterSize*2
|
||||||
lsl w10, w10, #1
|
lsl w10, w10, #1
|
||||||
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
add x4, x13, x7 // filter3 = filter2 + filterSize*2
|
||||||
movi v0.2d, #0 // val sum part 1 (for dst[0])
|
movi v0.16b, #0 // val sum part 1 (for dst[0])
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.16b, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.16b, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.16b, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
||||||
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
|
@ -1348,10 +1348,10 @@ function ff_hscale16to19_X4_neon_asm, export=1
|
||||||
ldp w8, w9, [x5]
|
ldp w8, w9, [x5]
|
||||||
ldp w10, w11, [x5, #8]
|
ldp w10, w11, [x5, #8]
|
||||||
|
|
||||||
movi v16.2d, #0 // initialize accumulator for idx + 0
|
movi v16.16b, #0 // initialize accumulator for idx + 0
|
||||||
movi v17.2d, #0 // initialize accumulator for idx + 1
|
movi v17.16b, #0 // initialize accumulator for idx + 1
|
||||||
movi v18.2d, #0 // initialize accumulator for idx + 2
|
movi v18.16b, #0 // initialize accumulator for idx + 2
|
||||||
movi v19.2d, #0 // initialize accumulator for idx + 3
|
movi v19.16b, #0 // initialize accumulator for idx + 3
|
||||||
|
|
||||||
mov x12, x4 // filter + 0
|
mov x12, x4 // filter + 0
|
||||||
add x13, x4, x7 // filter + 1
|
add x13, x4, x7 // filter + 1
|
||||||
|
|
Loading…
Reference in New Issue