aarch64: h264dsp: Fix indentation of some functions to match the rest

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2021-08-05 01:18:50 +03:00
parent e86ec831b0
commit c60b76d0c8
1 changed file with 236 additions and 236 deletions

View File

@ -182,198 +182,198 @@ endfunc
// Common prologue of the intra (bS == 4) loop filters.
//
// In:     w2 = alpha, w3 = beta
// Out:    v30 = alpha broadcast to all 16 byte lanes
//         v31 = beta  broadcast to all 16 byte lanes
// Clobb:  w4
//
// If alpha | beta is zero the macro executes `ret`, i.e. it returns from the
// function that expanded it without touching any pixel.
.macro h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm
// Intra luma edge filter on 16 pixel positions at once (one per byte lane).
//
// In:     v4 = p3, v5 = p2, v6 = p1, v7 = p0
//         v0 = q0, v1 = q1, v2 = q2, v3 = q3
//         v30 = alpha, v31 = beta (one copy per byte lane)
// Out:    v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in the
//         lanes selected by the masks below; v3 and v4 are only read.
// Clobb:  x4, v16-v31 (v30/v31 no longer hold alpha/beta afterwards)
// Branch: jumps to local label 9 (forward) of the expanding function when no
//         lane satisfies if_1, so that label must exist after the macro.
//
// Per-lane conditions, as named in the comments below:
//   if_1: abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
//   if_2: abs(p0 - q0) < (alpha >> 2) + 2
//   if_3: abs(p2 - p0) < beta
//   if_4: abs(q2 - q0) < beta
// The "_1" results are the weak filter (only p0/q0 change), the "_2" results
// are the strong filter (p0..p2 / q0..q2 change).
.macro h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        // v19 = if_1.  Narrow the 16-byte mask to 64 bits (4 bits per lane)
        // so a single scalar test can skip the whole filter.
        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b
        shrn            v20.8b,  v19.8h,  #4
        mov             x4,  v20.d[0]
        cbz             x4,  9f

        // v20:v21 = 2*p1 + p0 + q1,  v22:v23 = 2*q1 + q0 + p1  (16-bit lanes)
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        // v26:v27 = p2 + p0 + q0; reuses v20:v21 = 2*p1 + p0 + q1 from above.
        // v19 (if_1) is dead from here on and is recycled for p2'_2.
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        // Mirror image of the p side: v26:v27 = q2 + q0 + p0.
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        // Merge: the weak and strong masks for one side are mutually
        // exclusive, so the order of the two inserts into v7 / v0 is free.
        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm
// Intra luma filter across a horizontal edge ("v" = filtering runs
// vertically), 16 pixels wide.
//
// In:     x0 = address of the q0 row (first row below the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Writes: rows p2, p1, p0, q0, q1, q2 (16 bytes each); p3/q3 are only read.
// Clobb:  x0, x4, v0-v7, v16-v31
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v1.16b},  [x0], x1             // q1
        ld1             {v2.16b},  [x0], x1             // q2
        ld1             {v3.16b},  [x0], x1             // q3
        sub             x0,  x0,  x1, lsl #3            // q0 + 4 rows -> p3
        ld1             {v4.16b},  [x0], x1             // p3
        ld1             {v5.16b},  [x0], x1             // p2
        ld1             {v6.16b},  [x0], x1             // p1
        ld1             {v7.16b},  [x0]                 // p0

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1            // p0 row -> p2 row
        st1             {v5.16b},  [x0], x1             // p2
        st1             {v6.16b},  [x0], x1             // p1
        st1             {v7.16b},  [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v1.16b},  [x0], x1             // q1
        st1             {v2.16b},  [x0]                 // q2
9:
        ret
endfunc
// Intra luma filter across a vertical edge ("h" = filtering runs
// horizontally), 16 rows tall.
//
// In:     x0 = address of q0 in the first row (first pixel right of the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Writes: 8 bytes (p3..q3) in each of the 16 rows; p3 and q3 are stored back
//         unchanged because the filter macro never modifies v4 / v3.
// Clobb:  x0, x4, v0-v7, v16-v31
//
// Rows 0-7 are loaded into the low halves and rows 8-15 into the high halves
// of v4..v7, v0..v3; the transpose then turns them into the column layout
// (p3..p0, q0..q3) that h264_loop_filter_luma_intra expects.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4                    // back up to p3
        ld1             {v4.8b},  [x0], x1
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v5.d}[1],  [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v7.d}[1],  [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v1.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v3.d}[1],  [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows read above
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1],  [x0], x1
        st1             {v5.d}[1],  [x0], x1
        st1             {v6.d}[1],  [x0], x1
        st1             {v7.d}[1],  [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v1.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
        st1             {v3.d}[1],  [x0], x1
9:
        ret
endfunc
.macro h264_loop_filter_chroma
@ -474,113 +474,113 @@ function ff_h264_h_loop_filter_chroma422_neon, export=1
endfunc
// Intra chroma edge filter on 8 pixel positions at once (one per byte lane).
//
// In:     v18 = p1, v16 = p0, v17 = q0, v19 = q1
//         v30 = alpha, v31 = beta (one copy per byte lane)
// Out:    v16 = p0', v17 = q0' in the lanes where
//           abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
//         with p0' = (2*p1 + p0 + q1 + 2) >> 2
//              q0' = (2*q1 + q0 + p1 + 2) >> 2
// Clobb:  x2, v4, v6, v20, v22, v24-v28
// Branch: jumps to local label 9 (forward) of the expanding function when no
//         lane passes the test, so that label must exist after the macro.
.macro h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b
        mov             x2,  v26.d[0]

        // The two shifts are issued ahead of the branch that consumes x2.
        ushll           v4.8h,   v18.8b,  #1            // 2 * p1
        ushll           v6.8h,   v19.8b,  #1            // 2 * q1
        cbz             x2,  9f
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        uqrshrn         v24.8b,  v20.8h,  #2            // p0'
        uqrshrn         v25.8b,  v22.8h,  #2            // q0'
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm
// Intra chroma filter across a horizontal edge, 8 pixels wide.
//
// In:     x0 = address of the q0 row (first row below the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Writes: rows p0 and q0 (8 bytes each); p1/q1 are only read.
// Clobb:  x0, x2, x4, v4, v6, v16-v20, v22, v24-v28, v30, v31
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1            // q0 row -> p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1            // q1 row -> p0 row
        st1             {v16.8b}, [x0], x1              // p0
        st1             {v17.8b}, [x0], x1              // q0

9:
        ret
endfunc
// Intra chroma filter across a vertical edge, 4 rows tall (MBAFF variant).
//
// In:     x0 = address of q0 in the first row (first pixel right of the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Writes: 2 bytes (p0, q0) in each of 4 rows.
// Clobb:  x0, x2, x4, v4, v6, v16-v20, v22, v24-v31
//
// x4 is the read cursor (starts at p1 = x0 - 2), x0 the write cursor (starts
// at p0 = x0 - 1).  After the transpose v18/v16/v17/v19 hold the p1/p0/q0/q1
// columns that h264_loop_filter_chroma_intra expects.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves lane n of v16 (p0) and v17 (q0) into one row.
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc
// Intra chroma filter across a vertical edge, 8 rows tall.
//
// In:     x0 = address of q0 in the first row (first pixel right of the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Writes: 2 bytes (p0, q0) in each of 8 rows.
// Clobb:  x0, x2, x4, v4, v6, v16-v20, v22, v24-v31
//
// The body starting at h_loop_filter_chroma420_intra is also an entry point
// for ff_h264_h_loop_filter_chroma422_intra_neon.  On entry there it expects
//   x4 = read cursor  (p1 of the first row to filter)
//   x0 = write cursor (p0 of the first row to filter)
//   v30/v31 = alpha/beta
// and it ends in `ret`, so x30 must hold the desired return address.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
        // Rows 0-3 fill the registers, rows 4-7 then replace the upper
        // 32-bit lane, giving two 4-pixel rows per register.
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves lane n of v16 (p0) and v17 (q0) into one row.
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc
// Intra chroma filter across a vertical edge, 16 rows tall (4:2:2), done as
// two passes over the 8-row body h_loop_filter_chroma420_intra.
//
// In:     x0 = address of q0 in the first row (first pixel right of the edge)
//         x1 = row stride in bytes
//         w2 = alpha, w3 = beta
// Clobb:  x0, x2, x4, x5, x7, plus the vector registers used by the shared body
//
// x7 keeps the caller's return address across the `bl`; the shared body does
// not touch x7.  x5 remembers the address of row 8 so the write cursor can be
// re-derived for the second pass.  The read cursor x4 is not reset: the eight
// post-incremented loads of the first pass leave it on row 8 already.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2                    // read cursor:  p1, row 0
        add             x5,  x0,  x1, lsl #3            // q0, row 8
        sub             x0,  x0,  #1                    // write cursor: p0, row 0
        mov             x7,  x30
        bl              h_loop_filter_chroma420_intra   // rows 0-7

        sub             x0,  x5,  #1                    // write cursor: p0, row 8
        mov             x30, x7
        b               h_loop_filter_chroma420_intra   // rows 8-15, returns to our caller
endfunc
.macro biweight_16 macs, macd