/*
 * File: ffmpeg/libavcodec/aarch64/vvc/alf.S
 *
 * Commit (2024-07-15 16:19:15 +00:00):
 *   avcodec/vvc: Add aarch64 neon optimization for ALF
 *
 * Figures listed in the commit message, regrouped per block size and bit
 * depth. Each cell is "<name>_c / <name>_neon".
 *
 *   test                              8-bit             10-bit            12-bit
 *   vvc_alf_filter_chroma_4x4           3.0 /    1.0      2.7 /   1.0      2.7 /   1.0
 *   vvc_alf_filter_chroma_8x8          10.2 /    3.0     10.0 /   2.5     10.0 /   2.5
 *   vvc_alf_filter_chroma_16x16        41.7 /   11.2     39.0 /  10.0     40.2 /  10.2
 *   vvc_alf_filter_chroma_32x32       162.0 /   45.0    155.5 /  39.5    155.5 /  40.0
 *   vvc_alf_filter_chroma_64x64       646.0 /  175.5    708.2 / 166.7    619.2 / 157.2
 *   vvc_alf_filter_chroma_128x128    2611.5 /  698.2   2470.0 / 616.0   2531.5 / 620.2
 *   vvc_alf_filter_luma_8x8            25.2 /    4.2     18.5 /   4.0     19.0 /   4.0
 *   vvc_alf_filter_luma_16x16         106.5 /   16.2     75.2 /  14.7     79.7 /  14.7
 *   vvc_alf_filter_luma_32x32         400.5 /   63.2    299.2 /  57.7    299.2 /  57.7
 *   vvc_alf_filter_luma_64x64        1602.5 /  251.7   1197.0 / 235.5   1220.2 / 235.7
 *   vvc_alf_filter_luma_128x128      6570.2 / 1007.7   4822.7 / 936.2   4791.2 / 938.5
 *
 * Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
 */
/*
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
/*
 * alf_luma_filter_pixel index, pix_size, addr1, addr2, offset1, offset2
 *
 * Accumulates one luma filter tap for 8 pixels at once.
 *
 *   index     tap number; selects the lane of the clip / -clip / filter
 *             vectors (lane = index for index < 8, index - 8 otherwise)
 *   pix_size  1: samples are bytes and get widened to 16 bits,
 *             otherwise samples are read as 16-bit values
 *   addr1/2   base registers of the two sample rows used by this tap
 *   offset1/2 displacement from addr1/addr2, counted in samples
 *
 * Expected on entry:
 *   v5        current samples, 8 x 16 bit
 *   v0,  v16, v1   clip, -clip, filter applied to lanes 0-3
 *   v22, v23, v24  clip, -clip, filter applied to lanes 4-7
 *   v20, v21  32-bit accumulators for lanes 0-3 and 4-7
 *
 * Per lane: sum += filter * (clamp(a - curr, -clip, clip) +
 *                            clamp(b - curr, -clip, clip))
 *
 * Scratch: v3, v4, v6, v7, v17-v19, v26-v28
 */
.macro alf_luma_filter_pixel index, pix_size, addr1, addr2, offset1, offset2
    .if \pix_size == 1
        // byte samples: load 8 per row and zero-extend to halfwords
        ldur            d3, [\addr1, #\offset1]
        ldur            d4, [\addr2, #\offset2]
        uxtl            v6.8h, v3.8b
        uxtl            v7.8h, v4.8b
    .else
        // halfword samples: sample offsets are doubled to get byte offsets
        ldur            q6, [\addr1, #(2*\offset1)]
        ldur            q7, [\addr2, #(2*\offset2)]
    .endif

        // Broadcast the parameters of this tap; the 4h halves built from the
        // second coefficient set are merged into the upper halves below.
    .if \index < 8
        dup             v17.4h, v0.h[\index]            // clip,   lanes 0-3
        dup             v26.4h, v22.h[\index]           // clip,   lanes 4-7
        dup             v18.4h, v16.h[\index]           // -clip,  lanes 0-3
        dup             v27.4h, v23.h[\index]           // -clip,  lanes 4-7
        dup             v19.4h, v1.h[\index]            // filter, lanes 0-3
        dup             v28.4h, v24.h[\index]           // filter, lanes 4-7
    .else
        dup             v17.4h, v0.h[\index - 8]        // clip,   lanes 0-3
        dup             v26.4h, v22.h[\index - 8]       // clip,   lanes 4-7
        dup             v18.4h, v16.h[\index - 8]       // -clip,  lanes 0-3
        dup             v27.4h, v23.h[\index - 8]       // -clip,  lanes 4-7
        dup             v19.4h, v1.h[\index - 8]        // filter, lanes 0-3
        dup             v28.4h, v24.h[\index - 8]       // filter, lanes 4-7
    .endif
        mov             v17.d[1], v26.d[0]              // v17: clip   x 8
        mov             v18.d[1], v27.d[0]              // v18: -clip  x 8
        mov             v19.d[1], v28.d[0]              // v19: filter x 8

        // difference to the current sample, clamped to [-clip, clip]
        sub             v6.8h, v6.8h, v5.8h
        sub             v7.8h, v7.8h, v5.8h
        smin            v6.8h, v6.8h, v17.8h
        smin            v7.8h, v7.8h, v17.8h
        smax            v6.8h, v6.8h, v18.8h
        smax            v7.8h, v7.8h, v18.8h

        // both clamped differences share one coefficient
        add             v6.8h, v6.8h, v7.8h
        smlal           v20.4s, v19.4h, v6.4h           // sum, lanes 0-3
        smlal2          v21.4s, v19.8h, v6.8h           // sum, lanes 4-7
.endm
/*
 * alf_filter_luma_kernel pix_size
 *
 * Complete body (including the final ret) of a luma ALF kernel that filters
 * 8 horizontally adjacent pixels.
 *
 *   pix_size  1: samples are bytes; any other value: samples are 16 bit
 *
 * Register arguments on entry:
 *   x0: dst         8 output samples are stored here
 *   x1: pp          array of row pointers p0..p6, p0 being the current row
 *   x2: filter      16-bit coefficients: 12 for pixels 0-3 at byte offset 0,
 *                   12 for pixels 4-7 at byte offset 24
 *   x3: clip        16-bit clip values, same layout as filter
 *   w4: is_near_vb  non-zero selects rounding offset 512 and shift 10
 *                   instead of rounding offset 64 and shift 7
 *   w5: pix_max     upper bound of the result, only read when pix_size > 1
 *
 * Result per pixel: sat_unsigned(curr + ((offset + weighted sum) >> shift)),
 * additionally limited to pix_max when pix_size > 1.
 *
 * Clobbers: x5-x9, v0-v7, v16-v28
 */
.macro alf_filter_luma_kernel, pix_size
        dst         .req x0
        pp          .req x1
        filter      .req x2
        clip        .req x3
        is_near_vb  .req w4
        pix_max     .req w5
    .if \pix_size > 1
        // must happen before x5 is reused as the p0 pointer below
        dup             v25.8h, pix_max                 // pix_max
    .endif
        ldr             q0, [clip]                      // clip   0..7, pixels 0-3
        ldr             q1, [filter]                    // filter 0..7, pixels 0-3
        ldur            q22, [clip, #24]                // clip   0..7, pixels 4-7
        ldur            q24, [filter, #24]              // filter 0..7, pixels 4-7
        ldr             x5, [pp]                        // x5: p0
        ldr             x6, [pp, #(5*8)]                // x6: p5
        ldr             x7, [pp, #(6*8)]                // x7: p6
        neg             v16.8h, v0.8h                   // -clip, pixels 0-3
        neg             v23.8h, v22.8h                  // -clip, pixels 4-7
    .if \pix_size == 1
        ldr             d2, [x5]                        // curr
    .else
        ldr             q5, [x5]                        // curr
    .endif
        movi            v20.4s, #64                     // rounding offset for >> 7
        cbz             is_near_vb, 1f
        shl             v20.4s, v20.4s, #3              // 512: rounding offset for >> 10
1:
    .if \pix_size == 1
        uxtl            v5.8h, v2.8b                    // curr widened to 16 bit
    .endif
        mov             v21.16b, v20.16b                // same offset for pixels 4-7
        ldr             x8, [pp, #(3*8)]                // p3
        ldr             x9, [pp, #(4*8)]                // p4
        alf_luma_filter_pixel 0, \pix_size, x6, x7, 0, 0
        ldr             x6, [pp, #(1*8)]                // p1
        ldr             x7, [pp, #(2*8)]                // p2
        alf_luma_filter_pixel 1, \pix_size, x8, x9, 1, -1
        alf_luma_filter_pixel 2, \pix_size, x8, x9, 0, 0
        alf_luma_filter_pixel 3, \pix_size, x8, x9, -1, 1
        alf_luma_filter_pixel 4, \pix_size, x6, x7, 2, -2
        alf_luma_filter_pixel 5, \pix_size, x6, x7, 1, -1
        alf_luma_filter_pixel 6, \pix_size, x6, x7, 0, 0
        alf_luma_filter_pixel 7, \pix_size, x6, x7, -1, 1

        // Taps 8..11: reload the remaining four coefficients of each set so
        // that the pixel macro finds them in lanes 0..3.
        ldr             d0, [clip, #16]                 // clip   8..11, pixels 0-3
        ldr             d1, [filter, #16]               // filter 8..11, pixels 0-3
        neg             v16.4h, v0.4h                   // -clip
        ldr             d22, [clip, #40]                // clip   8..11, pixels 4-7
        ldr             d24, [filter, #40]              // filter 8..11, pixels 4-7
        neg             v23.4h, v22.4h                  // -clip
        alf_luma_filter_pixel 8, \pix_size, x6, x7, -2, 2
        alf_luma_filter_pixel 9, \pix_size, x5, x5, 3, -3
        alf_luma_filter_pixel 10, \pix_size, x5, x5, 2, -2
        alf_luma_filter_pixel 11, \pix_size, x5, x5, 1, -1

        cbz             is_near_vb, 2f
        sshr            v20.4s, v20.4s, #10
        sshr            v21.4s, v21.4s, #10
        b               3f
2:
        sshr            v20.4s, v20.4s, #7
        sshr            v21.4s, v21.4s, #7
3:
        // v22/v23 are free again: widen curr and add the filtered delta
        uxtl            v22.4s, v5.4h
        uxtl2           v23.4s, v5.8h
        add             v20.4s, v20.4s, v22.4s
        add             v21.4s, v21.4s, v23.4s
        sqxtun          v20.4h, v20.4s                  // saturate to unsigned 16 bit
        sqxtun2         v20.8h, v21.4s
    .if \pix_size == 1
        sqxtun          v20.8b, v20.8h                  // saturate to unsigned 8 bit
        str             d20, [dst]
    .else
        smin            v20.8h, v20.8h, v25.8h          // limit to pix_max
        str             q20, [dst]
    .endif
        ret

        .unreq          dst
        .unreq          pp
        .unreq          filter
        .unreq          clip
        .unreq          is_near_vb
        .unreq          pix_max
.endm
/*
 * alf_chroma_filter_pixel index, pix_size, addr1, addr2, offset1, offset2
 *
 * Accumulates one chroma filter tap for 4 pixels at once.
 *
 *   index     tap number; selects the lane of the clip / -clip / filter
 *             vectors (lane = index for index < 8, index - 8 otherwise)
 *   pix_size  1: samples are bytes and get widened to 16 bits,
 *             otherwise samples are read as 16-bit values
 *   addr1/2   base registers of the two sample rows used by this tap
 *   offset1/2 displacement from addr1/addr2, counted in samples
 *
 * Expected on entry:
 *   v5            current samples, 16 bit, in the low 4 lanes
 *   v0, v16, v1   clip, -clip, filter
 *   v20           32-bit accumulator
 *
 * Per lane: sum += filter * (clamp(a - curr, -clip, clip) +
 *                            clamp(b - curr, -clip, clip))
 *
 * Scratch: v3, v4, v6, v7, v17-v19
 */
.macro alf_chroma_filter_pixel index, pix_size, addr1, addr2, offset1, offset2
    .if \pix_size == 1
        // byte samples: load 4 per row and zero-extend to halfwords
        ldur            s3, [\addr1, #\offset1]
        ldur            s4, [\addr2, #\offset2]
        uxtl            v6.8h, v3.8b
        uxtl            v7.8h, v4.8b
    .else
        // halfword samples: sample offsets are doubled to get byte offsets
        ldur            d6, [\addr1, #(2*\offset1)]
        ldur            d7, [\addr2, #(2*\offset2)]
    .endif

        // broadcast the parameters that belong to this tap
    .if \index < 8
        dup             v19.4h, v1.h[\index]            // v19: filter[index]
        dup             v17.4h, v0.h[\index]            // v17: clip[index]
        dup             v18.4h, v16.h[\index]           // v18: -clip[index]
    .else
        dup             v19.4h, v1.h[\index - 8]        // v19: filter, lane index - 8
        dup             v17.4h, v0.h[\index - 8]        // v17: clip,   lane index - 8
        dup             v18.4h, v16.h[\index - 8]       // v18: -clip,  lane index - 8
    .endif

        // difference to the current sample, clamped to [-clip, clip]
        sub             v6.4h, v6.4h, v5.4h
        sub             v7.4h, v7.4h, v5.4h
        smin            v6.4h, v6.4h, v17.4h
        smin            v7.4h, v7.4h, v17.4h
        smax            v6.4h, v6.4h, v18.4h
        smax            v7.4h, v7.4h, v18.4h

        // both clamped differences share one coefficient
        add             v6.4h, v6.4h, v7.4h
        smlal           v20.4s, v19.4h, v6.4h           // v20: sum
.endm
/*
 * alf_filter_chroma_kernel pix_size
 *
 * Complete body (including the final ret) of a chroma ALF kernel that
 * filters 4 horizontally adjacent pixels with 6 taps.
 *
 *   pix_size  1: samples are bytes; any other value: samples are 16 bit
 *
 * Register arguments on entry:
 *   x0: dst         4 output samples are stored here
 *   x1: pp          array of row pointers p0..p4, p0 being the current row
 *   x2: filter      16-bit coefficients; 16 bytes are loaded, taps 0..5 used
 *   x3: clip        16-bit clip values; 16 bytes are loaded, taps 0..5 used
 *   w4: is_near_vb  non-zero selects rounding offset 512 and shift 10
 *                   instead of rounding offset 64 and shift 7
 *   w5: pix_max     upper bound of the result, only read when pix_size > 1
 *
 * Result per pixel: sat_unsigned(curr + ((offset + weighted sum) >> shift)),
 * additionally limited to pix_max when pix_size > 1.
 *
 * Clobbers: x5-x9, v0-v7, v16-v20, v22, v25
 */
.macro alf_filter_chroma_kernel, pix_size
        dst         .req x0
        pp          .req x1
        filter      .req x2
        clip        .req x3
        is_near_vb  .req w4
        pix_max     .req w5
    .if \pix_size > 1
        // must happen before x5 is reused as the p0 pointer below
        dup             v25.4h, pix_max                 // pix_max
    .endif
        ldr             q0, [clip]                      // clip
        ldr             q1, [filter]                    // filter
        ldr             x5, [pp]                        // p0
        ldr             x6, [pp, #(3*8)]                // p3
        ldr             x7, [pp, #(4*8)]                // p4
        neg             v16.8h, v0.8h                   // -clip
    .if \pix_size == 1
        ldr             s2, [x5]                        // curr
    .else
        ldr             d5, [x5]                        // curr
    .endif
        movi            v20.4s, #64                     // rounding offset for >> 7
        cbz             is_near_vb, 1f
        shl             v20.4s, v20.4s, #3              // 512: rounding offset for >> 10
1:
    .if \pix_size == 1
        uxtl            v5.8h, v2.8b                    // curr widened to 16 bit
    .endif
        ldr             x8, [pp, #(1*8)]                // p1
        ldr             x9, [pp, #(2*8)]                // p2
        alf_chroma_filter_pixel 0, \pix_size, x6, x7, 0, 0
        alf_chroma_filter_pixel 1, \pix_size, x8, x9, 1, -1
        alf_chroma_filter_pixel 2, \pix_size, x8, x9, 0, 0
        alf_chroma_filter_pixel 3, \pix_size, x8, x9, -1, 1
        alf_chroma_filter_pixel 4, \pix_size, x5, x5, 2, -2
        alf_chroma_filter_pixel 5, \pix_size, x5, x5, 1, -1

        uxtl            v22.4s, v5.4h                   // curr widened to 32 bit
        cbz             is_near_vb, 2f
        sshr            v20.4s, v20.4s, #10
        b               3f
2:
        sshr            v20.4s, v20.4s, #7
3:
        add             v20.4s, v20.4s, v22.4s          // curr + filtered delta
        sqxtun          v20.4h, v20.4s                  // saturate to unsigned 16 bit
    .if \pix_size == 1
        sqxtun          v20.8b, v20.8h                  // saturate to unsigned 8 bit
        str             s20, [dst]
    .else
        smin            v20.4h, v20.4h, v25.4h          // limit to pix_max
        str             d20, [dst]
    .endif
        ret

        .unreq          dst
        .unreq          pp
        .unreq          filter
        .unreq          clip
        .unreq          is_near_vb
        .unreq          pix_max
.endm
// Luma ALF kernel for byte samples (pix_size 1). The macro expansion ends
// with ret; pix_max (w5) is not read in this variant.
function ff_alf_filter_luma_kernel_8_neon, export=1
        alf_filter_luma_kernel 1
endfunc
// Luma ALF kernel entry with pix_max forced to 4095. Any value passed in w5
// is overwritten. The filter code itself is shared: control continues at
// local label 1 inside ff_alf_filter_luma_kernel_10_neon, which must
// therefore directly follow this function.
function ff_alf_filter_luma_kernel_12_neon, export=1
        mov             w5, #4095                       // pix_max
        b               1f
endfunc
// Luma ALF kernel entry with pix_max forced to 1023. Any value passed in w5
// is overwritten. Label 1 is also the branch target of
// ff_alf_filter_luma_kernel_12_neon, which arrives with w5 already set.
function ff_alf_filter_luma_kernel_10_neon, export=1
        mov             w5, #1023                       // pix_max
1:
        alf_filter_luma_kernel 2                        // 16-bit samples, ends with ret
endfunc
// Chroma ALF kernel for byte samples (pix_size 1). The macro expansion ends
// with ret; pix_max (w5) is not read in this variant.
function ff_alf_filter_chroma_kernel_8_neon, export=1
        alf_filter_chroma_kernel 1
endfunc
// 12-bit chroma ALF kernel entry point.
//
// Loads w5 with 4095 ((1 << 12) - 1) and branches forward to the next
// numeric label `1:`, which is located inside
// ff_alf_filter_chroma_kernel_10_neon. Both bit depths therefore execute the
// same alf_filter_chroma_kernel macro expansion and differ only in w5.
//
// NOTE(review): w5 is presumably the maximum sample value used by the macro
// for clipping; the macro body is not visible in this part of the file, so
// confirm against its definition.
function ff_alf_filter_chroma_kernel_12_neon, export=1
        mov             w5, #4095
        b               1f                      // shared body lives in the 10-bit variant
endfunc
// 10-bit chroma ALF kernel entry point.
//
// Loads w5 with 1023 ((1 << 10) - 1) and falls through to label `1:`.
// ff_alf_filter_chroma_kernel_12_neon branches to the same label with
// w5 = 4095, so the macro expansion below is shared by both bit depths.
//
// NOTE(review): w5 is presumably the maximum sample value used by the macro
// for clipping; the macro body is not visible in this part of the file, so
// confirm against its definition.
function ff_alf_filter_chroma_kernel_10_neon, export=1
        mov             w5, #1023
1:                                              // also entered from the 12-bit variant via `b 1f`
        // NOTE(review): the argument 2 presumably selects 16-bit (2-byte)
        // samples - confirm against the macro definition.
        alf_filter_chroma_kernel 2
endfunc