diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 997167ca88..997082498f 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -182,198 +182,198 @@ endfunc .macro h264_loop_filter_start_intra - orr w4, w2, w3 - cbnz w4, 1f - ret + orr w4, w2, w3 + cbnz w4, 1f + ret 1: - dup v30.16b, w2 // alpha - dup v31.16b, w3 // beta + dup v30.16b, w2 // alpha + dup v31.16b, w3 // beta .endm .macro h264_loop_filter_luma_intra - uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) - uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) - uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) - cmhi v19.16b, v30.16b, v16.16b // < alpha - cmhi v17.16b, v31.16b, v17.16b // < beta - cmhi v18.16b, v31.16b, v18.16b // < beta + uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) + uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) + uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) + cmhi v19.16b, v30.16b, v16.16b // < alpha + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta - movi v29.16b, #2 - ushr v30.16b, v30.16b, #2 // alpha >> 2 - add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 - cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + movi v29.16b, #2 + ushr v30.16b, v30.16b, #2 // alpha >> 2 + add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 + cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 - and v19.16b, v19.16b, v17.16b - and v19.16b, v19.16b, v18.16b - shrn v20.8b, v19.8h, #4 - mov x4, v20.d[0] - cbz x4, 9f + and v19.16b, v19.16b, v17.16b + and v19.16b, v19.16b, v18.16b + shrn v20.8b, v19.8h, #4 + mov x4, v20.d[0] + cbz x4, 9f - ushll v20.8h, v6.8b, #1 - ushll v22.8h, v1.8b, #1 - ushll2 v21.8h, v6.16b, #1 - ushll2 v23.8h, v1.16b, #1 - uaddw v20.8h, v20.8h, v7.8b - uaddw v22.8h, v22.8h, v0.8b - uaddw2 v21.8h, v21.8h, v7.16b - uaddw2 v23.8h, v23.8h, v0.16b - uaddw v20.8h, v20.8h, v1.8b - uaddw v22.8h, v22.8h, v6.8b - uaddw2 v21.8h, v21.8h, v1.16b - uaddw2 v23.8h, v23.8h, v6.16b + ushll v20.8h, v6.8b, #1 + ushll v22.8h, v1.8b, #1 + ushll2 v21.8h, v6.16b, #1 + ushll2 v23.8h, v1.16b, #1 + uaddw v20.8h, v20.8h, v7.8b + uaddw v22.8h, v22.8h, v0.8b + uaddw2 v21.8h, v21.8h, v7.16b + uaddw2 v23.8h, v23.8h, v0.16b + uaddw v20.8h, v20.8h, v1.8b + uaddw v22.8h, v22.8h, v6.8b + uaddw2 v21.8h, v21.8h, v1.16b + uaddw2 v23.8h, v23.8h, v6.16b - rshrn v24.8b, v20.8h, #2 // p0'_1 - rshrn v25.8b, v22.8h, #2 // q0'_1 - rshrn2 v24.16b, v21.8h, #2 // p0'_1 - rshrn2 v25.16b, v23.8h, #2 // q0'_1 + rshrn v24.8b, v20.8h, #2 // p0'_1 + rshrn v25.8b, v22.8h, #2 // q0'_1 + rshrn2 v24.16b, v21.8h, #2 // p0'_1 + rshrn2 v25.16b, v23.8h, #2 // q0'_1 - uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) - uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) - cmhi v17.16b, v31.16b, v17.16b // < beta - cmhi v18.16b, v31.16b, v18.16b // < beta + uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) + uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta - and v17.16b, v16.16b, v17.16b // if_2 && if_3 - and v18.16b, v16.16b, v18.16b // if_2 && if_4 + and v17.16b, v16.16b, v17.16b // if_2 && if_3 + and v18.16b, v16.16b, v18.16b // if_2 && if_4 - not v30.16b, v17.16b - not v31.16b, v18.16b + not v30.16b, v17.16b + not v31.16b, v18.16b - and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) - and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) + and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) + and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) - and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 - and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 + and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 + and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 - //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 - uaddl v26.8h, v5.8b, v7.8b - uaddl2 v27.8h, v5.16b, v7.16b - uaddw v26.8h, v26.8h, v0.8b - uaddw2 v27.8h, v27.8h, v0.16b - add v20.8h, v20.8h, v26.8h - add v21.8h, v21.8h, v27.8h - uaddw v20.8h, v20.8h, v0.8b - uaddw2 v21.8h, v21.8h, v0.16b - rshrn v20.8b, v20.8h, #3 // p0'_2 - rshrn2 v20.16b, v21.8h, #3 // p0'_2 - uaddw v26.8h, v26.8h, v6.8b - uaddw2 v27.8h, v27.8h, v6.16b - rshrn v21.8b, v26.8h, #2 // p1'_2 - rshrn2 v21.16b, v27.8h, #2 // p1'_2 - uaddl v28.8h, v4.8b, v5.8b - uaddl2 v29.8h, v4.16b, v5.16b - shl v28.8h, v28.8h, #1 - shl v29.8h, v29.8h, #1 - add v28.8h, v28.8h, v26.8h - add v29.8h, v29.8h, v27.8h - rshrn v19.8b, v28.8h, #3 // p2'_2 - rshrn2 v19.16b, v29.8h, #3 // p2'_2 + //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 + uaddl v26.8h, v5.8b, v7.8b + uaddl2 v27.8h, v5.16b, v7.16b + uaddw v26.8h, v26.8h, v0.8b + uaddw2 v27.8h, v27.8h, v0.16b + add v20.8h, v20.8h, v26.8h + add v21.8h, v21.8h, v27.8h + uaddw v20.8h, v20.8h, v0.8b + uaddw2 v21.8h, v21.8h, v0.16b + rshrn v20.8b, v20.8h, #3 // p0'_2 + rshrn2 v20.16b, v21.8h, #3 // p0'_2 + uaddw v26.8h, v26.8h, v6.8b + uaddw2 v27.8h, v27.8h, v6.16b + rshrn v21.8b, v26.8h, #2 // p1'_2 + rshrn2 v21.16b, v27.8h, #2 // p1'_2 + uaddl v28.8h, v4.8b, v5.8b + uaddl2 v29.8h, v4.16b, v5.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v19.8b, v28.8h, #3 // p2'_2 + rshrn2 v19.16b, v29.8h, #3 // p2'_2 - //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 - uaddl v26.8h, v2.8b, v0.8b - uaddl2 v27.8h, v2.16b, v0.16b - uaddw v26.8h, v26.8h, v7.8b - uaddw2 v27.8h, v27.8h, v7.16b - add v22.8h, v22.8h, v26.8h - add v23.8h, v23.8h, v27.8h - uaddw v22.8h, v22.8h, v7.8b - uaddw2 v23.8h, v23.8h, v7.16b - rshrn v22.8b, v22.8h, #3 // q0'_2 - rshrn2 v22.16b, v23.8h, #3 // q0'_2 - uaddw v26.8h, v26.8h, v1.8b - uaddw2 v27.8h, v27.8h, v1.16b - rshrn v23.8b, v26.8h, #2 // q1'_2 - rshrn2 v23.16b, v27.8h, #2 // q1'_2 - uaddl v28.8h, v2.8b, v3.8b - uaddl2 v29.8h, v2.16b, v3.16b - shl v28.8h, v28.8h, #1 - shl v29.8h, v29.8h, #1 - add v28.8h, v28.8h, v26.8h - add v29.8h, v29.8h, v27.8h - rshrn v26.8b, v28.8h, #3 // q2'_2 - rshrn2 v26.16b, v29.8h, #3 // q2'_2 + //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 + uaddl v26.8h, v2.8b, v0.8b + uaddl2 v27.8h, v2.16b, v0.16b + uaddw v26.8h, v26.8h, v7.8b + uaddw2 v27.8h, v27.8h, v7.16b + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + uaddw v22.8h, v22.8h, v7.8b + uaddw2 v23.8h, v23.8h, v7.16b + rshrn v22.8b, v22.8h, #3 // q0'_2 + rshrn2 v22.16b, v23.8h, #3 // q0'_2 + uaddw v26.8h, v26.8h, v1.8b + uaddw2 v27.8h, v27.8h, v1.16b + rshrn v23.8b, v26.8h, #2 // q1'_2 + rshrn2 v23.16b, v27.8h, #2 // q1'_2 + uaddl v28.8h, v2.8b, v3.8b + uaddl2 v29.8h, v2.16b, v3.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v26.8b, v28.8h, #3 // q2'_2 + rshrn2 v26.16b, v29.8h, #3 // q2'_2 - bit v7.16b, v24.16b, v30.16b // p0'_1 - bit v0.16b, v25.16b, v31.16b // q0'_1 - bit v7.16b, v20.16b, v17.16b // p0'_2 - bit v6.16b, v21.16b, v17.16b // p1'_2 - bit v5.16b, v19.16b, v17.16b // p2'_2 - bit v0.16b, v22.16b, v18.16b // q0'_2 - bit v1.16b, v23.16b, v18.16b // q1'_2 - bit v2.16b, v26.16b, v18.16b // q2'_2 + bit v7.16b, v24.16b, v30.16b // p0'_1 + bit v0.16b, v25.16b, v31.16b // q0'_1 + bit v7.16b, v20.16b, v17.16b // p0'_2 + bit v6.16b, v21.16b, v17.16b // p1'_2 + bit v5.16b, v19.16b, v17.16b // p2'_2 + bit v0.16b, v22.16b, v18.16b // q0'_2 + bit v1.16b, v23.16b, v18.16b // q1'_2 + bit v2.16b, v26.16b, v18.16b // q2'_2 .endm function ff_h264_v_loop_filter_luma_intra_neon, export=1 - h264_loop_filter_start_intra + h264_loop_filter_start_intra - ld1 {v0.16b}, [x0], x1 // q0 - ld1 {v1.16b}, [x0], x1 // q1 - ld1 {v2.16b}, [x0], x1 // q2 - ld1 {v3.16b}, [x0], x1 // q3 - sub x0, x0, x1, lsl #3 - ld1 {v4.16b}, [x0], x1 // p3 - ld1 {v5.16b}, [x0], x1 // p2 - ld1 {v6.16b}, [x0], x1 // p1 - ld1 {v7.16b}, [x0] // p0 + ld1 {v0.16b}, [x0], x1 // q0 + ld1 {v1.16b}, [x0], x1 // q1 + ld1 {v2.16b}, [x0], x1 // q2 + ld1 {v3.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #3 + ld1 {v4.16b}, [x0], x1 // p3 + ld1 {v5.16b}, [x0], x1 // p2 + ld1 {v6.16b}, [x0], x1 // p1 + ld1 {v7.16b}, [x0] // p0 - h264_loop_filter_luma_intra + h264_loop_filter_luma_intra - sub x0, x0, x1, lsl #1 - st1 {v5.16b}, [x0], x1 // p2 - st1 {v6.16b}, [x0], x1 // p1 - st1 {v7.16b}, [x0], x1 // p0 - st1 {v0.16b}, [x0], x1 // q0 - st1 {v1.16b}, [x0], x1 // q1 - st1 {v2.16b}, [x0] // q2 + sub x0, x0, x1, lsl #1 + st1 {v5.16b}, [x0], x1 // p2 + st1 {v6.16b}, [x0], x1 // p1 + st1 {v7.16b}, [x0], x1 // p0 + st1 {v0.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x0] // q2 9: - ret + ret endfunc function ff_h264_h_loop_filter_luma_intra_neon, export=1 - h264_loop_filter_start_intra + h264_loop_filter_start_intra - sub x0, x0, #4 - ld1 {v4.8b}, [x0], x1 - ld1 {v5.8b}, [x0], x1 - ld1 {v6.8b}, [x0], x1 - ld1 {v7.8b}, [x0], x1 - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x0], x1 - ld1 {v2.8b}, [x0], x1 - ld1 {v3.8b}, [x0], x1 - ld1 {v4.d}[1], [x0], x1 - ld1 {v5.d}[1], [x0], x1 - ld1 {v6.d}[1], [x0], x1 - ld1 {v7.d}[1], [x0], x1 - ld1 {v0.d}[1], [x0], x1 - ld1 {v1.d}[1], [x0], x1 - ld1 {v2.d}[1], [x0], x1 - ld1 {v3.d}[1], [x0], x1 + sub x0, x0, #4 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 - transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 - h264_loop_filter_luma_intra + h264_loop_filter_luma_intra - transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 - sub x0, x0, x1, lsl #4 - st1 {v4.8b}, [x0], x1 - st1 {v5.8b}, [x0], x1 - st1 {v6.8b}, [x0], x1 - st1 {v7.8b}, [x0], x1 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 - st1 {v2.8b}, [x0], x1 - st1 {v3.8b}, [x0], x1 - st1 {v4.d}[1], [x0], x1 - st1 {v5.d}[1], [x0], x1 - st1 {v6.d}[1], [x0], x1 - st1 {v7.d}[1], [x0], x1 - st1 {v0.d}[1], [x0], x1 - st1 {v1.d}[1], [x0], x1 - st1 {v2.d}[1], [x0], x1 - st1 {v3.d}[1], [x0], x1 + sub x0, x0, x1, lsl #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 9: - ret + ret endfunc .macro h264_loop_filter_chroma @@ -474,113 +474,113 @@ function ff_h264_h_loop_filter_chroma422_neon, export=1 endfunc .macro h264_loop_filter_chroma_intra - uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) - uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0) - uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0) - cmhi v26.8b, v30.8b, v26.8b // < alpha - cmhi v27.8b, v31.8b, v27.8b // < beta - cmhi v28.8b, v31.8b, v28.8b // < beta - and v26.8b, v26.8b, v27.8b - and v26.8b, v26.8b, v28.8b - mov x2, v26.d[0] + uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) + uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0) + uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0) + cmhi v26.8b, v30.8b, v26.8b // < alpha + cmhi v27.8b, v31.8b, v27.8b // < beta + cmhi v28.8b, v31.8b, v28.8b // < beta + and v26.8b, v26.8b, v27.8b + and v26.8b, v26.8b, v28.8b + mov x2, v26.d[0] - ushll v4.8h, v18.8b, #1 - ushll v6.8h, v19.8b, #1 - cbz x2, 9f - uaddl v20.8h, v16.8b, v19.8b - uaddl v22.8h, v17.8b, v18.8b - add v20.8h, v20.8h, v4.8h - add v22.8h, v22.8h, v6.8h - uqrshrn v24.8b, v20.8h, #2 - uqrshrn v25.8b, v22.8h, #2 - bit v16.8b, v24.8b, v26.8b - bit v17.8b, v25.8b, v26.8b + ushll v4.8h, v18.8b, #1 + ushll v6.8h, v19.8b, #1 + cbz x2, 9f + uaddl v20.8h, v16.8b, v19.8b + uaddl v22.8h, v17.8b, v18.8b + add v20.8h, v20.8h, v4.8h + add v22.8h, v22.8h, v6.8h + uqrshrn v24.8b, v20.8h, #2 + uqrshrn v25.8b, v22.8h, #2 + bit v16.8b, v24.8b, v26.8b + bit v17.8b, v25.8b, v26.8b .endm function ff_h264_v_loop_filter_chroma_intra_neon, export=1 - h264_loop_filter_start_intra + h264_loop_filter_start_intra - sub x0, x0, x1, lsl #1 - ld1 {v18.8b}, [x0], x1 - ld1 {v16.8b}, [x0], x1 - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x0] + sub x0, x0, x1, lsl #1 + ld1 {v18.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x0] - h264_loop_filter_chroma_intra + h264_loop_filter_chroma_intra - sub x0, x0, x1, lsl #1 - st1 {v16.8b}, [x0], x1 - st1 {v17.8b}, [x0], x1 + sub x0, x0, x1, lsl #1 + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 9: - ret + ret endfunc function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1 - h264_loop_filter_start_intra + h264_loop_filter_start_intra - sub x4, x0, #2 - sub x0, x0, #1 - ld1 {v18.8b}, [x4], x1 - ld1 {v16.8b}, [x4], x1 - ld1 {v17.8b}, [x4], x1 - ld1 {v19.8b}, [x4], x1 + sub x4, x0, #2 + sub x0, x0, #1 + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 - transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 + transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 - h264_loop_filter_chroma_intra + h264_loop_filter_chroma_intra - st2 {v16.b,v17.b}[0], [x0], x1 - st2 {v16.b,v17.b}[1], [x0], x1 - st2 {v16.b,v17.b}[2], [x0], x1 - st2 {v16.b,v17.b}[3], [x0], x1 + st2 {v16.b,v17.b}[0], [x0], x1 + st2 {v16.b,v17.b}[1], [x0], x1 + st2 {v16.b,v17.b}[2], [x0], x1 + st2 {v16.b,v17.b}[3], [x0], x1 9: - ret + ret endfunc function ff_h264_h_loop_filter_chroma_intra_neon, export=1 - h264_loop_filter_start_intra + h264_loop_filter_start_intra - sub x4, x0, #2 - sub x0, x0, #1 + sub x4, x0, #2 + sub x0, x0, #1 h_loop_filter_chroma420_intra: - ld1 {v18.8b}, [x4], x1 - ld1 {v16.8b}, [x4], x1 - ld1 {v17.8b}, [x4], x1 - ld1 {v19.8b}, [x4], x1 - ld1 {v18.s}[1], [x4], x1 - ld1 {v16.s}[1], [x4], x1 - ld1 {v17.s}[1], [x4], x1 - ld1 {v19.s}[1], [x4], x1 + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 + ld1 {v18.s}[1], [x4], x1 + ld1 {v16.s}[1], [x4], x1 + ld1 {v17.s}[1], [x4], x1 + ld1 {v19.s}[1], [x4], x1 - transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 + transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 - h264_loop_filter_chroma_intra + h264_loop_filter_chroma_intra - st2 {v16.b,v17.b}[0], [x0], x1 - st2 {v16.b,v17.b}[1], [x0], x1 - st2 {v16.b,v17.b}[2], [x0], x1 - st2 {v16.b,v17.b}[3], [x0], x1 - st2 {v16.b,v17.b}[4], [x0], x1 - st2 {v16.b,v17.b}[5], [x0], x1 - st2 {v16.b,v17.b}[6], [x0], x1 - st2 {v16.b,v17.b}[7], [x0], x1 + st2 {v16.b,v17.b}[0], [x0], x1 + st2 {v16.b,v17.b}[1], [x0], x1 + st2 {v16.b,v17.b}[2], [x0], x1 + st2 {v16.b,v17.b}[3], [x0], x1 + st2 {v16.b,v17.b}[4], [x0], x1 + st2 {v16.b,v17.b}[5], [x0], x1 + st2 {v16.b,v17.b}[6], [x0], x1 + st2 {v16.b,v17.b}[7], [x0], x1 9: - ret + ret endfunc function ff_h264_h_loop_filter_chroma422_intra_neon, export=1 - h264_loop_filter_start_intra - sub x4, x0, #2 - add x5, x0, x1, lsl #3 - sub x0, x0, #1 - mov x7, x30 - bl h_loop_filter_chroma420_intra - sub x0, x5, #1 - mov x30, x7 - b h_loop_filter_chroma420_intra + h264_loop_filter_start_intra + sub x4, x0, #2 + add x5, x0, x1, lsl #3 + sub x0, x0, #1 + mov x7, x30 + bl h_loop_filter_chroma420_intra + sub x0, x5, #1 + mov x30, x7 + b h_loop_filter_chroma420_intra endfunc .macro biweight_16 macs, macd