lavc/aarch64: Add neon implementation for pix_median_abs8

Provide an optimized implementation of the pix_median_abs8 function.

Performance comparison results are shown below.
 - median_sad_1_c: 277.0
 - median_sad_1_neon: 82.0

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
2025-03-22 19:07:57 +00:00 · 2022-09-20 13:01:58 +02:00 · 2022-09-20 13:01:58 +02:00 · b2732115dd
commit b2732115dd
parent e9a6170213
2 changed files with 65 additions and 0 deletions
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@ -57,6 +57,8 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h);
 /* Prototype only: the definition is not visible in this diff
  * (presumably NEON assembly in me_cmp_neon.S -- confirm). */
 int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t stride, int h);
 /* Prototype for the NEON routine pix_median_abs8_neon exported from
  * me_cmp_neon.S; it is installed below as c->median_sad[1].
  * The assembly does not read the first argument (x0 is marked unused there). */
 int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t stride, int h);
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@ -85,6 +87,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
        c->nsse[0] = nsse16_neon_wrapper;
        c->median_sad[0] = pix_median_abs16_neon;
        c->median_sad[1] = pix_median_abs8_neon;
    }
 }
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@ -1089,3 +1089,65 @@ function vsad_intra8_neon, export=1
        ret
 endfunc
 function pix_median_abs8_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        // result       w0
        //
        // Let V = pix1 - pix2, widened to signed 16 bit. The result is the sum of
        //   first row : |V(0)| + sum over columns 1..7 of |V(i) - V(i-1)|
        //   other rows: |V(0) - above(0)| + sum over columns 1..7 of
        //               |V(i) - median(left, above, left + above - aboveleft)|
        //
        // Registers live across loop iterations:
        //   v28  V of the previous row, columns 0..7
        //   v26  V of the previous row rotated by one column (lane k = column k+1)
        //   v31  per-column accumulator; lane 7 comes from the ext wrap-around
        //        and is cleared before the final reduction
        //   v0   accumulates |V - above|; only lane 0 (column 0) is used
        //   v18  scalar accumulator, starts as |V(0)| of the first row
        // Only v0-v7 and v16-v31 are written, so nothing has to be saved.
        ld1             {v2.8b}, [x1], x3
        ld1             {v3.8b}, [x2], x3
        movi            v31.8h, #0
        ext             v0.8b, v2.8b, v2.8b, #1
        ext             v1.8b, v3.8b, v3.8b, #1
        usubl           v28.8h, v2.8b, v3.8b                    // V, first row
        usubl           v26.8h, v0.8b, v1.8b                    // V, first row, rotated by one column
        sub             w4, w4, #1                              // we need to make h-1 iterations
        saba            v31.8h, v26.8h, v28.8h                  // += |V(i) - V(i-1)|
        mov             h18, v28.h[0]
        cmp             w4, #1
        sqabs           h18, h18                                // |V(0)| of the first row
        movi            v0.8h, #0
        b.lt            2f                                      // no further rows to process
 1:
        ld1             {v6.8b}, [x1], x3                       // pix1 vector for V(j-1)
        ld1             {v7.8b}, [x2], x3                       // pix2 vector for V(j-1)
        subs            w4, w4, #1                              // flags are consumed by b.ne below
        ext             v4.8b, v6.8b, v6.8b, #1                 // pix1 vector for V(j)
        ext             v5.8b, v7.8b, v7.8b, #1                 // pix2 vector for V(j)
        // scratch registers: v22, v20, v19, v17
        // median(a, b, c) is computed as max(min(a, b), min(max(a, b), c))
        // with a = above, b = left, c = above + left - aboveleft.
        usubl           v30.8h, v6.8b, v7.8b                    // V(j-1)
        usubl           v24.8h, v4.8b, v5.8b                    // V(j)
        saba            v0.8h, v30.8h, v28.8h                   // += |V - above| (lane 0 is used)
        add             v22.8h, v26.8h, v30.8h                  // above + left
        smin            v20.8h, v26.8h, v30.8h                  // min(above, left)
        smax            v19.8h, v26.8h, v30.8h                  // max(above, left)
        sub             v22.8h, v22.8h, v28.8h                  // above + left - aboveleft
        smin            v17.8h, v19.8h, v22.8h
        mov             v28.16b, v30.16b                        // current row becomes the previous row
        smax            v20.8h, v20.8h, v17.8h                  // median values
        saba            v31.8h, v24.8h, v20.8h                  // += |V(j) - median|
        mov             v26.16b, v24.16b
        b.ne            1b
 2:
        mov             h17, v0.h[0]                            // column 0 sum of |V - above|
        ins             v31.h[7], wzr                           // drop the wrap-around lane
        add             d18, d18, d17
        uaddlv          s17, v31.8h                             // horizontal sum of the column accumulators
        add             d18, d18, d17
        fmov            w0, s18
        ret
 endfunc