lavc/aarch64: new optimization for 8-bit hevc_qpel_h and hevc_qpel_uni_w_hv

Signed-off-by: Martin Storsjö <martin@martin.st>
2024-12-27 01:42:20 +00:00 · 2023-05-28 09:56:51 +08:00 · 2023-05-28 09:56:51 +08:00 · e79686be96
commit e79686be96
parent 15972cce8c
2 changed files with 1102 additions and 0 deletions
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,13 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \

+#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) /* Declare five void prototypes ff_hevc_put_hevc_<fn><N>_8_neon<ext> for the size suffixes N = 4, 8, 16, 32, 64; args is the parenthesized parameter list, ext an optional name suffix (may be empty). */ \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; /* the trailing backslash continues onto the following empty line, which ends the macro */ \
+

 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
@ -156,11 +163,20 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width),);

+/* Prototypes for the qpel_h functions carrying the _i8mm suffix; they take an int16_t destination and no destination stride. NOTE(review): the full set of sizes comes from NEON8_FNPROTO, whose definition starts above this hunk - confirm there. */
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width), _i8mm);

+NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm); /* declares ff_hevc_put_hevc_qpel_uni_w_hv{4,8,16,32,64}_8_neon_i8mm; same parameter list as the qpel_uni_w_h prototypes */

 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
@@ -181,6 +197,12 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;

+#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) /* Store the five ff_hevc_put_hevc_<fn><N>_8_neon<ext> functions into member[i][v][h] for i = 1, 3, 5, 7, 9 only; every other first-index slot is left as it was. */ \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  /* slot 1 <- size-4 function */ \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  /* slot 3 <- size-8 function */ \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; /* slot 5 <- size-16 function */ \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; /* slot 7 <- size-32 function */ \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; /* slot 9 <- size-64 function */

 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -247,6 +269,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)

        if (have_i8mm(cpu_flags)) {
            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
        }

    }
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S