From 90b1b9350c0a97c4065ae9054b83e57f48a0de1f Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 3 Dec 2015 16:17:32 +0100 Subject: [PATCH] arm: add ff_int32_to_float_fmul_array8_neon Quite a bit faster than int32_to_float_fmul_array8_c calling ff_int32_to_float_fmul_scalar_neon through FmtConvertContext. Number of cycles per int32_to_float_fmul_array8 call while decoding padded.dts on exynos5422: before after change cortex-a7: 1270 951 -25% cortex-a15: 434 285 -34% checkasm --bench cycle counts: cortex-a15 cortex-a7 int32_to_float_fmul_array8_c: 1730.4 4384.5 int32_to_float_fmul_array8_neon_c: 571.5 1694.3 int32_to_float_fmul_array8_neon: 374.0 1448.8 Interesting are the differences between int32_to_float_fmul_array8_neon_c and int32_to_float_fmul_array8_neon. The former is the current behaviour of calling ff_int32_to_float_fmul_scalar_neon repeatedly from the C function. The raw numbers differ since checkasm uses different lengths than the dca decoder. --- libavcodec/arm/fmtconvert_init_arm.c | 4 +++ libavcodec/arm/fmtconvert_neon.S | 37 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c index 6a80bfb6b3..11396e898c 100644 --- a/libavcodec/arm/fmtconvert_init_arm.c +++ b/libavcodec/arm/fmtconvert_init_arm.c @@ -25,6 +25,9 @@ #include "libavcodec/avcodec.h" #include "libavcodec/fmtconvert.h" +void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst, + const int32_t *src, const float *mul, + int len); void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, float mul, int len); @@ -46,6 +49,7 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx } if (have_neon(cpu_flags)) { + c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon; c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; } } diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S index
5e0ac68843..5d48e3d197 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -1,6 +1,7 @@
 /*
  * ARM NEON optimised Format Conversion Utils
  * Copyright (c) 2008 Mans Rullgard
+ * Copyright (c) 2015 Janne Grunau b
  *
  * This file is part of Libav.
  *
@@ -49,3 +50,39 @@ NOVFP len .req r3
         bx              lr
         .unreq          len
 endfunc
+
+function ff_int32_to_float_fmul_array8_neon, export=1 /* dst[i] = (float)src[i] * mul[i / 8], one multiplier per block of 8 samples; as used below r1 = dst, r2 = src, r3 = mul */
+        ldr             r0, [sp]                /* r0 = len, the fifth C argument, read from the stack; the incoming r0 value is overwritten unused */
+        lsr             r0, r0, #3              /* r0 = len / 8 = number of 8-sample blocks */
+        subs            r0, r0, #1              /* r0 = blocks - 1; sets the flags tested by beq */
+        beq             1f                      /* exactly one block: go straight to the single-block tail. NOTE(review): len < 8 leaves r0 = -1 and still enters the loop, which handles 16 samples; presumably callers guarantee len is a positive multiple of 8 -- confirm */
+2:                                              /* main loop: two blocks (16 samples, two multipliers) per iteration */
+        vld1.32         {q0-q1}, [r2,:128]!     /* load 8 int32 samples of the first block, advance src; :128 declares a 16-byte aligned address */
+        vld1.32         {q2-q3}, [r2,:128]!     /* load 8 int32 samples of the second block, advance src */
+        vld1.32         {d20}, [r3]!            /* load two multipliers (one per block), advance mul */
+        subs            r0, r0, #2              /* two blocks consumed; the flags set here are the ones tested by bgt/bxlt after the stores */
+        vcvt.f32.s32    q0, q0                  /* signed int32 -> float, first block (low half) */
+        vcvt.f32.s32    q1, q1                  /* signed int32 -> float, first block (high half) */
+        vdup.32         q8, d20[0]              /* broadcast the first multiplier to all four lanes */
+        vcvt.f32.s32    q2, q2                  /* signed int32 -> float, second block (low half) */
+        vcvt.f32.s32    q3, q3                  /* signed int32 -> float, second block (high half) */
+        vmul.f32        q0, q0, q8              /* scale first block by the first multiplier */
+        vdup.32         q9, d20[1]              /* broadcast the second multiplier to all four lanes */
+        vmul.f32        q1, q1, q8              /* scale first block by the first multiplier */
+        vmul.f32        q2, q2, q9              /* scale second block by the second multiplier */
+        vmul.f32        q3, q3, q9              /* scale second block by the second multiplier */
+        vst1.32         {q0-q1}, [r1,:128]!     /* store 8 floats of the first block, advance dst; 16-byte aligned address */
+        vst1.32         {q2-q3}, [r1,:128]!     /* store 8 floats of the second block, advance dst */
+        bgt             2b                      /* counter still > 0: at least two more blocks remain, loop again */
+        it              lt                      /* makes the following bxlt conditional (presumably required for Thumb-2 builds -- confirm) */
+        bxlt            lr                      /* counter went negative: block count was even, everything is done */
+1:                                              /* tail: exactly one block left (reached with the counter equal to zero) */
+        vld1.32         {q0-q1}, [r2,:128]      /* load the last 8 int32 samples */
+        vld1.32         {d16[],d17[]}, [r3]     /* load one multiplier and replicate it into all four lanes of q8 */
+        vcvt.f32.s32    q0, q0                  /* signed int32 -> float */
+        vcvt.f32.s32    q1, q1                  /* signed int32 -> float */
+        vmul.f32        q0, q0, q8              /* scale by the block multiplier */
+        vmul.f32        q1, q1, q8              /* scale by the block multiplier */
+        vst1.32         {q0-q1}, [r1,:128]      /* store the last 8 floats */
+        bx              lr                      /* return */
+endfunc