lavc/aarch64: Provide neon implementation of nsse16

Add a vectorized (NEON) implementation of the nsse16 function.

Performance comparison (checkasm benchmark timings, lower is better;
the NEON version is roughly 5.9x faster than the C reference):
- nsse_0_c: 682.2
- nsse_0_neon: 116.5

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Co-authored-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Hubert Mazur, 2022-09-08 11:25:07 +02:00; committed by Martin Storsjö
Commit: 06b98e396a (parent: 908abe8032)
2 changed files with 137 additions and 0 deletions
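
For context, nsse ("noise preserving" SSE) scores a candidate block by its plain sum of squared errors plus a weighted penalty for how much its local texture (second-order gradients) differs from the source. The stand-alone sketch below paraphrases the scalar reference that the new NEON routine mirrors; it is an illustration only, not FFmpeg source, and the name nsse16_model is made up for this write-up:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * Illustrative scalar model of nsse16: score1 is the plain SSE of two
 * 16-pixel-wide blocks, score2 compares how "noisy" each block is by
 * summing the absolute second-order gradients and taking the difference.
 */
int nsse16_model(int weight, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h)
{
    int score1 = 0; /* sum of squared differences          */
    int score2 = 0; /* difference of summed |gradients|    */

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        if (y + 1 < h) {                 /* gradients need the next row ...   */
            for (int x = 0; x < 15; x++) /* ... and the next column (15 cols) */
                score2 += abs(s1[x] - s1[x + stride] - s1[x + 1] + s1[x + stride + 1]) -
                          abs(s2[x] - s2[x + stride] - s2[x + 1] + s2[x + stride + 1]);
        }
        s1 += stride;
        s2 += stride;
    }
    return score1 + abs(score2) * weight; /* weight = avctx->nsse_weight, default 8 */
}

In the assembly below, score1 comes from the call into sse16_neon, while score2 is accumulated in the four 16-bit vectors v16-v19 and reduced with saturating arithmetic at label 3.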

libavcodec/aarch64/me_cmp_init_aarch64.c

@@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                ptrdiff_t stride, int h);
int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
                      ptrdiff_t stride, int h);
int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
                ptrdiff_t stride, int h);
int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h);

av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
        c->vsse[0] = vsse16_neon;
        c->vsse[4] = vsse_intra16_neon;
        c->nsse[0] = nsse16_neon_wrapper;
    }
}

int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h)
{
    if (c)
        return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
    else
        return nsse16_neon(8, s1, s2, stride, h);
}
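
The wrapper adapts the me_cmp_func signature to the assembly routine: with an MpegEncContext it forwards the encoder's nsse_weight, and for callers without a context it falls back to the default weight of 8. A hedged usage sketch (not part of the patch; the helper name request_nsse is made up, the AVCodecContext fields and FF_CMP_NSSE are the public API) of how an encoder ends up dispatching through c->nsse[0]:

#include <libavcodec/avcodec.h>

/*
 * Illustrative only: ask the mpegvideo-based encoders to use NSSE as the
 * comparison metric, so that MECmpContext.nsse[0] -- nsse16_neon_wrapper
 * on aarch64 with NEON -- is what scores 16x16 candidates.
 */
static void request_nsse(AVCodecContext *avctx)
{
    avctx->me_cmp      = FF_CMP_NSSE; /* full-pel motion estimation compare  */
    avctx->me_sub_cmp  = FF_CMP_NSSE; /* sub-pel motion estimation compare   */
    avctx->mb_cmp      = FF_CMP_NSSE; /* macroblock decision compare         */
    avctx->nsse_weight = 8;           /* the "multiplier" the wrapper passes */
}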

libavcodec/aarch64/me_cmp_neon.S

@@ -847,3 +847,125 @@ function vsse_intra16_neon, export=1
ret
endfunc

function nsse16_neon, export=1
// x0 multiplier
// x1 uint8_t *pix1
// x2 uint8_t *pix2
// x3 ptrdiff_t stride
// w4 int h
str x0, [sp, #-0x40]! // reserve stack space, spill the multiplier
stp x1, x2, [sp, #0x10] // preserve pix1/pix2 across the call
stp x3, x4, [sp, #0x20] // preserve stride/h across the call
str x30, [sp, #0x30] // preserve the link register
bl X(sse16_neon) // score1 = plain SSE of the two blocks
ldr x30, [sp, #0x30]
mov w9, w0 // w9 = score1 returned by sse16_neon
ldr x5, [sp] // reload the multiplier
ldp x1, x2, [sp, #0x10]
ldp x3, x4, [sp, #0x20]
add sp, sp, #0x40
movi v16.8h, #0 // |gradient| accumulator, pix1 columns 0-7
movi v17.8h, #0 // |gradient| accumulator, pix1 columns 8-15
movi v18.8h, #0 // |gradient| accumulator, pix2 columns 0-7
movi v19.8h, #0 // |gradient| accumulator, pix2 columns 8-15
ld1 {v0.16b}, [x1], x3 // first row of pix1
subs w4, w4, #1 // the gradient sums only need h - 1 iterations
ld1 {v2.16b}, [x2], x3 // first row of pix2
ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1 (last lane wraps around, dropped later)
cmp w4, #2
ext v3.16b, v2.16b, v2.16b, #1 // x2 + 1 (last lane wraps around, dropped later)
b.lt 2f // fewer than 2 rows left: take the single-row loop
// make 2 iterations at once
1:
ld1 {v4.16b}, [x1], x3
ld1 {v6.16b}, [x2], x3
ld1 {v20.16b}, [x1], x3
ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
usubl v31.8h, v0.8b, v4.8b
usubl2 v30.8h, v0.16b, v4.16b
ld1 {v22.16b}, [x2], x3
usubl v29.8h, v1.8b, v5.8b
usubl2 v28.8h, v1.16b, v5.16b
ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
saba v16.8h, v31.8h, v29.8h
ext v21.16b, v20.16b, v20.16b, #1
saba v17.8h, v30.8h, v28.8h
usubl v27.8h, v2.8b, v6.8b
usubl2 v26.8h, v2.16b, v6.16b
ext v23.16b, v22.16b, v22.16b, #1
usubl v25.8h, v3.8b, v7.8b
usubl2 v24.8h, v3.16b, v7.16b
saba v18.8h, v27.8h, v25.8h
saba v19.8h, v26.8h, v24.8h
usubl v31.8h, v4.8b, v20.8b
usubl2 v30.8h, v4.16b, v20.16b
usubl v29.8h, v5.8b, v21.8b
usubl2 v28.8h, v5.16b, v21.16b
saba v16.8h, v31.8h, v29.8h
saba v17.8h, v30.8h, v28.8h
usubl v27.8h, v6.8b, v22.8b
usubl2 v26.8h, v6.16b, v22.16b
usubl v25.8h, v7.8b, v23.8b
usubl2 v24.8h, v7.16b, v23.16b
saba v18.8h, v27.8h, v25.8h
saba v19.8h, v26.8h, v24.8h
sub w4, w4, #2
mov v0.16b, v20.16b
mov v1.16b, v21.16b
cmp w4, #2
mov v2.16b, v22.16b
mov v3.16b, v23.16b
b.ge 1b
cbz w4, 3f
// iterate by one
2:
ld1 {v4.16b}, [x1], x3
subs w4, w4, #1
ld1 {v6.16b}, [x2], x3
ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
usubl v31.8h, v0.8b, v4.8b
ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
usubl2 v30.8h, v0.16b, v4.16b
usubl v29.8h, v1.8b, v5.8b
usubl2 v28.8h, v1.16b, v5.16b
saba v16.8h, v31.8h, v29.8h
saba v17.8h, v30.8h, v28.8h
usubl v27.8h, v2.8b, v6.8b
usubl2 v26.8h, v2.16b, v6.16b
usubl v25.8h, v3.8b, v7.8b
usubl2 v24.8h, v3.16b, v7.16b
saba v18.8h, v27.8h, v25.8h
saba v19.8h, v26.8h, v24.8h
mov v0.16b, v4.16b
mov v1.16b, v5.16b
mov v2.16b, v6.16b
mov v3.16b, v7.16b
cbnz w4, 2b
3:
sqsub v17.8h, v17.8h, v19.8h // per-column |grad pix1| - |grad pix2|, columns 8-15
sqsub v16.8h, v16.8h, v18.8h // per-column |grad pix1| - |grad pix2|, columns 0-7
ins v17.h[7], wzr // drop column 15, the wrapped-around ext lane
sqadd v16.8h, v16.8h, v17.8h // combine both halves with saturation
saddlv s16, v16.8h // score2 = horizontal sum across all columns
sqabs s16, s16 // |score2|
fmov w0, s16
mul w0, w0, w5 // |score2| * multiplier
add w0, w0, w9 // ... + score1 from sse16_neon
ret
endfunc
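
Finally, a minimal stand-alone cross-check in the spirit of the checkasm run quoted in the commit message. This is an assumption-laden sketch rather than the project's test: it reuses the hypothetical nsse16_model() from above and assumes the program is linked against an object providing nsse16_neon (aarch64 built with NEON).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Exported by the new assembly (me_cmp_neon.S). */
int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
                ptrdiff_t stride, int h);
/* The scalar sketch defined earlier in this write-up. */
int nsse16_model(int weight, const uint8_t *s1, const uint8_t *s2,
                 ptrdiff_t stride, int h);

int main(void)
{
    enum { STRIDE = 32, H = 16 };
    uint8_t pix1[STRIDE * (H + 1)], pix2[STRIDE * (H + 1)];

    srand(42);
    for (size_t i = 0; i < sizeof(pix1); i++) {
        pix1[i] = rand() & 0xFF;
        pix2[i] = rand() & 0xFF;
    }

    int ref  = nsse16_model(8, pix1, pix2, STRIDE, H);
    int neon = nsse16_neon(8, pix1, pix2, STRIDE, H);

    printf("ref=%d neon=%d -> %s\n", ref, neon, ref == neon ? "OK" : "MISMATCH");
    return ref != neon;
}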