From 3f84d1d1fb75e4192c35664b4dd4741e8381b667 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao@tencent.com>
Date: Wed, 11 Sep 2024 21:40:59 +0800
Subject: [PATCH] aarch64/vvc: Add avg

avg_8_2x2_c:                                             0.2 ( 1.00x)
avg_8_2x2_neon:                                          0.2 ( 1.00x)
avg_8_4x4_c:                                             0.2 ( 1.00x)
avg_8_4x4_neon:                                          0.2 ( 1.00x)
avg_8_8x8_c:                                             0.9 ( 1.00x)
avg_8_8x8_neon:                                          0.2 ( 5.29x)
avg_8_16x16_c:                                           3.7 ( 1.00x)
avg_8_16x16_neon:                                        0.7 ( 5.44x)
avg_8_32x32_c:                                          14.9 ( 1.00x)
avg_8_32x32_neon:                                        1.7 ( 8.91x)
avg_8_64x64_c:                                          59.7 ( 1.00x)
avg_8_64x64_neon:                                        6.9 ( 8.62x)
avg_8_128x128_c:                                       254.7 ( 1.00x)
avg_8_128x128_neon:                                     26.9 ( 9.46x)
avg_10_2x2_c:                                            0.2 ( 1.00x)
avg_10_2x2_neon:                                         0.2 ( 1.00x)
avg_10_4x4_c:                                            0.2 ( 1.00x)
avg_10_4x4_neon:                                         0.2 ( 1.00x)
avg_10_8x8_c:                                            0.9 ( 1.00x)
avg_10_8x8_neon:                                         0.2 ( 5.29x)
avg_10_16x16_c:                                          3.4 ( 1.00x)
avg_10_16x16_neon:                                       0.4 ( 8.06x)
avg_10_32x32_c:                                         13.9 ( 1.00x)
avg_10_32x32_neon:                                       1.9 ( 7.23x)
avg_10_64x64_c:                                         54.2 ( 1.00x)
avg_10_64x64_neon:                                       8.4 ( 6.43x)
avg_10_128x128_c:                                      232.4 ( 1.00x)
avg_10_128x128_neon:                                    30.9 ( 7.52x)
avg_12_2x2_c:                                            0.0 ( 0.00x)
avg_12_2x2_neon:                                         0.2 ( 0.00x)
avg_12_4x4_c:                                            0.4 ( 1.00x)
avg_12_4x4_neon:                                         0.2 ( 2.43x)
avg_12_8x8_c:                                            0.7 ( 1.00x)
avg_12_8x8_neon:                                         0.2 ( 3.86x)
avg_12_16x16_c:                                          3.7 ( 1.00x)
avg_12_16x16_neon:                                       0.4 ( 8.65x)
avg_12_32x32_c:                                         13.7 ( 1.00x)
avg_12_32x32_neon:                                       2.2 ( 6.29x)
avg_12_64x64_c:                                         53.9 ( 1.00x)
avg_12_64x64_neon:                                       7.7 ( 7.03x)
avg_12_128x128_c:                                      270.9 ( 1.00x)
avg_12_128x128_neon:                                    30.4 ( 8.90x)
---
 libavcodec/aarch64/vvc/Makefile   |   1 +
 libavcodec/aarch64/vvc/dsp_init.c |  16 +++
 libavcodec/aarch64/vvc/inter.S    | 163 ++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/inter.S

diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index 7ba13a2165..ed80338969 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -3,6 +3,7 @@ clean::
 
 OBJS-$(CONFIG_VVC_DECODER)              += aarch64/vvc/dsp_init.o
 NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
+                                           aarch64/vvc/inter.o \
                                            aarch64/vvc/sad.o \
                                            aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 4867491620..ad767d17e2 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -42,6 +42,16 @@
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
                     const int block_w, const int block_h);
 
+void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride,  /* NEON kernel from vvc/inter.S: dst = clip((src0 + src1 + 64) >> 7), one byte per pixel */
+                       const int16_t *src0, const int16_t *src1, int width,  /* source rows are VVC_MAX_PB_SIZE int16 elements apart */
+                       int height);  /* dst advances by dst_stride bytes per row; the kernel loops until height reaches 0, so height must be >= 1 */
+void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride,  /* same operation with shift 5 and offset 16; result clipped to 0..1023 */
+                       const int16_t *src0, const int16_t *src1, int width,
+                       int height);  /* pixels are stored to dst as 16-bit values */
+void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,  /* same operation with shift 3 and offset 4; result clipped to 0..4095 */
+                        const int16_t *src0, const int16_t *src1, int width,
+                        int height);  /* pixels are stored to dst as 16-bit values */
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -112,6 +122,8 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
 
+        c->inter.avg = ff_vvc_avg_8_neon;
+
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
         c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;
@@ -150,9 +162,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
     } else if (bd == 10) {
+        c->inter.avg = ff_vvc_avg_10_neon;
+
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
+        c->inter.avg = ff_vvc_avg_12_neon;
+
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
     }
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
new file mode 100644
index 0000000000..2f69274b86
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128 /* src0/src1 row pitch in int16 elements; the kernel below steps by twice this many bytes per row */
+
+.macro vvc_avg, bit_depth
+// Emits ff_vvc_avg_<bit_depth>_neon plus a 2/4-sample row helper: dst = clip((src0 + src1 + round) >> (15 - bit_depth))
+.macro vvc_avg_\bit_depth\()_2_4, tap
+.if \tap == 2                                           // tap = samples per row handled by this helper (2 or 4)
+        ldr             s0, [src0]                      // 2 x int16 from src0; the rest of v0 is zeroed by the load
+        ldr             s2, [src1]                      // 2 x int16 from src1
+.else
+        ldr             d0, [src0]                      // 4 x int16 from src0
+        ldr             d2, [src1]                      // 4 x int16 from src1
+.endif
+        saddl           v4.4s, v0.4h, v2.4h             // widen to 32 bit so the sum cannot overflow
+        add             v4.4s, v4.4s, v16.4s            // add the rounding offset prepared in v16 by the function body
+        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)    // shift right and narrow to 16 bit with signed saturation
+.if \bit_depth == 8
+        sqxtun          v4.8b, v4.8h                    // saturate to unsigned 8 bit, i.e. clip to 0..255
+.if \tap == 2
+        str             h4, [dst]                       // store 2 pixels (2 bytes)
+.else   // tap == 4
+        str             s4, [dst]                       // store 4 pixels (4 bytes)
+.endif
+
+.else   // bit_depth > 8
+        smin            v4.4h, v4.4h, v17.4h            // clip against the upper bound held in v17
+        smax            v4.4h, v4.4h, v18.4h            // clip against the lower bound (zero) held in v18
+.if \tap == 2
+        str             s4, [dst]                       // store 2 pixels of 16 bit each
+.else
+        str             d4, [dst]                       // store 4 pixels of 16 bit each
+.endif
+.endif
+        add             src0, src0, x10                 // step both sources to the next row (x10 = row pitch in bytes)
+        add             src1, src1, x10
+        add             dst, dst, dst_stride            // step the destination to the next row
+.endm
+// void ff_vvc_avg_<bit_depth>_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
+function ff_vvc_avg_\bit_depth\()_neon, export=1
+        dst             .req x0
+        dst_stride      .req x1
+        src0            .req x2
+        src1            .req x3
+        width           .req w4
+        height          .req w5
+// NOTE(review): these aliases are never released with .unreq, so they stay defined for any code added later in this file
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // source row pitch in bytes (2 bytes per int16 element)
+        cmp             width, #8                       // consumed by b.eq/b.hi below; the moves in between do not alter the flags
+.if \bit_depth == 8
+        movi            v16.4s, #64                     // rounding offset for shift 7
+.else
+.if \bit_depth == 10
+        mov             w6, #1023                       // largest 10-bit pixel value
+        movi            v16.4s, #16                     // rounding offset for shift 5
+.else
+        mov             w6, #4095                       // largest 12-bit pixel value
+        movi            v16.4s, #4                      // rounding offset for shift 3
+.endif
+        movi            v18.8h, #0                      // lower clip bound
+        dup             v17.8h, w6                      // upper clip bound in every lane
+.endif
+        b.eq            8f                              // width == 8
+        b.hi            16f                             // width > 8 (unsigned compare)
+        cmp             width, #4
+        b.eq            4f
+2:      // width == 2 (any width below 8 other than 4 falls through to here; presumably callers only pass 2)
+        subs            height, height, #1              // assumes height >= 1; the row helper leaves the flags intact for b.ne
+        vvc_avg_\bit_depth\()_2_4 2
+        b.ne            2b
+        b               32f
+4:      // width == 4
+        subs            height, height, #1              // row counter; flags survive the helper below
+        vvc_avg_\bit_depth\()_2_4 4
+        b.ne            4b
+        b               32f
+8:      // width == 8
+        ld1             {v0.8h}, [src0], x10            // load 8 samples and post-increment to the next row
+        ld1             {v2.8h}, [src1], x10
+        saddl           v4.4s, v0.4h, v2.4h             // sums of the low 4 lanes, widened to 32 bit
+        saddl2          v5.4s, v0.8h, v2.8h             // sums of the high 4 lanes
+        add             v4.4s, v4.4s, v16.4s            // rounding offset
+        add             v5.4s, v5.4s, v16.4s
+        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)    // narrow low half with saturation
+        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)    // narrow high half into the upper lanes of v4
+        subs            height, height, #1              // row counter; the vector ops and stores below keep the flags
+.if \bit_depth == 8
+        sqxtun          v4.8b, v4.8h                    // clip to 0..255
+        st1             {v4.8b}, [dst], dst_stride      // store 8 bytes and step to the next row
+.else
+        smin            v4.8h, v4.8h, v17.8h            // clip to the upper bound
+        smax            v4.8h, v4.8h, v18.8h            // clip to zero
+        st1             {v4.8h}, [dst], dst_stride      // store 8 pixels of 16 bit and step to the next row
+.endif
+        b.ne            8b
+        b               32f
+16:     // width > 8: each row is processed 16 samples at a time
+        mov             w6, width                       // samples left in this row (w6 is free again once v17 is built)
+        mov             x7, src0                        // x7/x8/x9 walk along the row; src0/src1/dst keep the row start
+        mov             x8, src1
+        mov             x9, dst
+17:
+        ldp             q0, q1, [x7], #32               // 16 samples from src0
+        ldp             q2, q3, [x8], #32               // 16 samples from src1
+        saddl           v4.4s, v0.4h, v2.4h             // four groups of 4 widened sums
+        saddl2          v5.4s, v0.8h, v2.8h
+        saddl           v6.4s, v1.4h, v3.4h
+        saddl2          v7.4s, v1.8h, v3.8h
+        add             v4.4s, v4.4s, v16.4s            // rounding offset
+        add             v5.4s, v5.4s, v16.4s
+        add             v6.4s, v6.4s, v16.4s
+        add             v7.4s, v7.4s, v16.4s
+        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)    // samples 0..7 end up in v4
+        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)    // samples 8..15 end up in v6
+        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+        subs            w6, w6, #16                     // NOTE(review): the loop exits only when w6 reaches exactly 0, so width is presumably a multiple of 16 here
+.if \bit_depth == 8
+        sqxtun          v4.8b, v4.8h                    // clip to 0..255 and pack 16 pixels into v4
+        sqxtun2         v4.16b, v6.8h
+        str             q4, [x9], #16                   // store 16 bytes
+.else
+        smin            v4.8h, v4.8h, v17.8h            // clip to the upper bound
+        smin            v6.8h, v6.8h, v17.8h
+        smax            v4.8h, v4.8h, v18.8h            // clip to zero
+        smax            v6.8h, v6.8h, v18.8h
+        stp             q4, q6, [x9], #32               // store 16 pixels of 16 bit
+.endif
+        b.ne            17b
+
+        subs            height, height, #1              // row counter; the adds below do not set flags
+        add             src0, src0, x10                 // next source rows
+        add             src1, src1, x10
+        add             dst, dst, dst_stride            // next destination row
+        b.ne            16b
+32:
+        ret
+endfunc
+.endm
+// One kernel per bit depth wired up in dsp_init.c
+vvc_avg 8
+vvc_avg 10
+vvc_avg 12