avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON

Signed-off-by: Geoff Hill <geoff@geoffhill.org> Signed-off-by: Martin Storsjö <martin@martin.st>
2024-04-06 07:26:06 -07:00 · 2024-04-06 07:26:06 -07:00 · 42e88f18f3
parent 69cb34f885
commit 42e88f18f3
3 changed files with 55 additions and 0 deletions
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@ -28,6 +28,10 @@
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
 /* NEON implementation (ac3dsp_neon.S). Writes four 64-bit sums of squares to
  * sum[0..3], in this order: coef0^2, coef1^2, (coef0 + coef1)^2 and
  * (coef0 - coef1)^2, accumulated over the input coefficients. */
 void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
                                            const int32_t *coef0,
                                            const int32_t *coef1,
                                            int len);
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
    c->ac3_exponent_min = ff_ac3_exponent_min_neon;
    c->extract_exponents = ff_ac3_extract_exponents_neon;
    c->float_to_fixed24 = ff_float_to_fixed24_neon;
    c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
 }
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@ -64,3 +64,26 @@ function ff_float_to_fixed24_neon, export=1
        b.ne            1b
        ret
 endfunc
 // void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
 //                                             const int32_t *coef0,
 //                                             const int32_t *coef1,
 //                                             int len)
 //
 // Accumulates four 64-bit sums of squares over the input and stores them to
 // sum[0..3]: coef0^2, coef1^2, (coef0 + coef1)^2 and (coef0 - coef1)^2.
 // Register use as written below: x0 = sum, x1 = coef0, x2 = coef1, w3 = len.
 //
 // NOTE(review): the loop is bottom-tested and consumes two elements per
 // iteration, so it always runs at least once and an odd len is effectively
 // rounded up to the next even count -- confirm callers only pass positive,
 // even lengths (or that processing the extra element is acceptable).
 function ff_ac3_sum_square_butterfly_int32_neon, export=1
        // Clear the four accumulators (two 64-bit partial sums each).
        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        // Load two 32-bit elements from each input, post-incrementing by 8 bytes.
 1:      ld1             {v4.2s}, [x1], #8
        ld1             {v5.2s}, [x2], #8
        // Butterfly: per-lane sum and difference, computed in 32-bit lanes.
        add             v6.2s, v4.2s, v5.2s
        sub             v7.2s, v4.2s, v5.2s
        // Signed widening multiply-accumulate: square each lane into 64 bits.
        smlal           v0.2d, v4.2s, v4.2s
        smlal           v1.2d, v5.2s, v5.2s
        smlal           v2.2d, v6.2s, v6.2s
        smlal           v3.2d, v7.2s, v7.2s
        // Two elements consumed; keep looping while the remaining count is > 0.
        subs            w3, w3, #2
        b.gt            1b
        // Fold the two partial sums of each accumulator into a single 64-bit value.
        addp            d0, v0.2d
        addp            d1, v1.2d
        addp            d2, v2.2d
        addp            d3, v3.2d
        // Store the four 64-bit results consecutively at x0.
        st1             {v0.1d-v3.1d}, [x0]
        ret
 endfunc
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@ -139,6 +139,32 @@ static void check_float_to_fixed24(AC3DSPContext *c) {
    report("float_to_fixed24");
 }
 /*
  * checkasm test for AC3DSPContext.sum_square_butterfly_int32: runs the
  * reference and the tested implementation on the same two ELEMS-long input
  * buffers (filled by randomize_i24()), fails if the four 64-bit results
  * differ, and then benchmarks the tested implementation.
  */
 static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
 #define ELEMS 240
    LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]);
    LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]);
    /* Signed 64-bit, matching the int64_t[4] parameter declared below. */
    LOCAL_ALIGNED_16(int64_t, v1, [4]);
    LOCAL_ALIGNED_16(int64_t, v2, [4]);
    declare_func(void, int64_t[4], const int32_t *, const int32_t *, int);
    randomize_i24(lt, ELEMS);
    randomize_i24(rt, ELEMS);
    /* The name passed here must be spelled like the report() name below so
     * that the function shows up under the expected name in test output. */
    if (check_func(c->sum_square_butterfly_int32,
                   "ac3_sum_square_butterfly_int32")) {
        call_ref(v1, lt, rt, ELEMS);
        call_new(v2, lt, rt, ELEMS);
        /* Integer results: both implementations must agree bit-for-bit. */
        if (memcmp(v1, v2, sizeof(int64_t[4])) != 0)
            fail();
        bench_new(v2, lt, rt, ELEMS);
    }
    report("ac3_sum_square_butterfly_int32");
 }
 void checkasm_check_ac3dsp(void)
 {
    AC3DSPContext c;
@ -147,4 +173,5 @@ void checkasm_check_ac3dsp(void)
    check_ac3_exponent_min(&c);
    check_ac3_extract_exponents(&c);
    check_float_to_fixed24(&c);
    check_ac3_sum_square_butterfly_int32(&c);
 }