mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-27 17:53:13 +00:00
lavc/aarch64/fdct: add neon-optimized fdct for aarch64
The code is imported from libjpeg-turbo-3.0.1. The neon registers used have been changed to avoid modifying v8-v15. Reviewed-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
27f6211c74
commit
d4d09c8e42
@ -1,5 +1,6 @@
|
||||
# subsystems
|
||||
OBJS-$(CONFIG_AC3DSP) += aarch64/ac3dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
|
||||
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
|
||||
@ -37,6 +38,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
|
||||
# subsystems
|
||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_AC3DSP) += aarch64/ac3dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
|
||||
|
26
libavcodec/aarch64/fdct.h
Normal file
26
libavcodec/aarch64/fdct.h
Normal file
@ -0,0 +1,26 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_AARCH64_FDCT_H
|
||||
#define AVCODEC_AARCH64_FDCT_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
 * NEON forward DCT of one 8x8 block, computed in place.
 *
 * The implementation lives in fdctdsp_neon.S.
 *
 * @param block 64 int16_t values (eight rows of eight); they are read,
 *              transformed and written back to the same memory.
 */
void ff_fdct_neon(int16_t *block);
|
||||
|
||||
#endif /* AVCODEC_AARCH64_FDCT_H */
|
39
libavcodec/aarch64/fdctdsp_init_aarch64.c
Normal file
39
libavcodec/aarch64/fdctdsp_init_aarch64.c
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/fdctdsp.h"
|
||||
#include "fdct.h"
|
||||
|
||||
/**
 * Install the AArch64 forward-DCT implementation into an FDCTDSPContext.
 *
 * c->fdct is replaced by ff_fdct_neon only when all of the following hold:
 *  - the runtime CPU flags report NEON,
 *  - high_bit_depth is zero,
 *  - avctx->dct_algo is FF_DCT_AUTO or FF_DCT_NEON.
 * In every other case the context is left untouched.
 *
 * @param c              DSP context whose fdct pointer may be overridden
 * @param avctx          codec context supplying the requested dct_algo
 * @param high_bit_depth non-zero disables the NEON routine
 */
av_cold void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
    const int flags = av_get_cpu_flags();

    /* Nothing to install without NEON or for high bit depth content. */
    if (!have_neon(flags) || high_bit_depth)
        return;

    switch (avctx->dct_algo) {
    case FF_DCT_AUTO:
    case FF_DCT_NEON:
        c->fdct = ff_fdct_neon;
        break;
    default:
        /* Some other algorithm was requested explicitly; keep it. */
        break;
    }
}
|
368
libavcodec/aarch64/fdctdsp_neon.S
Normal file
368
libavcodec/aarch64/fdctdsp_neon.S
Normal file
@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Armv8 Neon optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
|
||||
* All Rights Reserved.
|
||||
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
||||
* Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
|
||||
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
|
||||
* Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
|
||||
* Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
// #define EIGHT_BIT_SAMPLES
|
||||
|
||||
/* Constants for ff_fdct_neon() (jsimd_fdct_islow_neon() in libjpeg-turbo) */
|
||||
|
||||
#define F_0_298 2446 /* FIX(0.298631336) */
|
||||
#define F_0_390 3196 /* FIX(0.390180644) */
|
||||
#define F_0_541 4433 /* FIX(0.541196100) */
|
||||
#define F_0_765 6270 /* FIX(0.765366865) */
|
||||
#define F_0_899 7373 /* FIX(0.899976223) */
|
||||
#define F_1_175 9633 /* FIX(1.175875602) */
|
||||
#define F_1_501 12299 /* FIX(1.501321110) */
|
||||
#define F_1_847 15137 /* FIX(1.847759065) */
|
||||
#define F_1_961 16069 /* FIX(1.961570560) */
|
||||
#define F_2_053 16819 /* FIX(2.053119869) */
|
||||
#define F_2_562 20995 /* FIX(2.562915447) */
|
||||
#define F_3_072 25172 /* FIX(3.072711026) */
|
||||
|
||||
/*
 * Multiplier table for ff_fdct_neon: 16 halfwords (32 bytes) in total,
 * i.e. 12 constants followed by 4 zero halfwords of padding, so that the
 * whole table is fetched by a single "ld1 {v0.8h, v1.8h}".
 *
 * The element order is a contract with the XFIX_* lane macros (entries
 * 0-7 are addressed as v0.h[0..7], entries 8-11 as v1.h[0..3]); reorder
 * both together. Constants whose macro is named XFIX_N_* are stored
 * already negated.
 */
const jsimd_fdct_islow_neon_consts, align=4
|
||||
.short F_0_298
|
||||
.short -F_0_390
|
||||
.short F_0_541
|
||||
.short F_0_765
|
||||
.short - F_0_899
|
||||
.short F_1_175
|
||||
.short F_1_501
|
||||
.short - F_1_847
|
||||
.short - F_1_961
|
||||
.short F_2_053
|
||||
.short - F_2_562
|
||||
.short F_3_072
|
||||
.short 0 /* padding */
|
||||
.short 0
|
||||
.short 0
|
||||
.short 0
|
||||
endconst
|
||||
|
||||
#undef F_0_298
|
||||
#undef F_0_390
|
||||
#undef F_0_541
|
||||
#undef F_0_765
|
||||
#undef F_0_899
|
||||
#undef F_1_175
|
||||
#undef F_1_501
|
||||
#undef F_1_847
|
||||
#undef F_1_961
|
||||
#undef F_2_053
|
||||
#undef F_2_562
|
||||
#undef F_3_072
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_fdct_islow_neon
|
||||
*
|
||||
* This file contains a slower but more accurate integer implementation of the
|
||||
* forward DCT (Discrete Cosine Transform). The following code is based
|
||||
* directly on the IJG's original jfdctint.c; see jfdctint.c for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#define CONST_BITS 13
|
||||
#ifdef EIGHT_BIT_SAMPLES
|
||||
#define PASS1_BITS 2
|
||||
#else
|
||||
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
|
||||
#endif
|
||||
|
||||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
#define XFIX_P_0_298 v0.h[0]
|
||||
#define XFIX_N_0_390 v0.h[1]
|
||||
#define XFIX_P_0_541 v0.h[2]
|
||||
#define XFIX_P_0_765 v0.h[3]
|
||||
#define XFIX_N_0_899 v0.h[4]
|
||||
#define XFIX_P_1_175 v0.h[5]
|
||||
#define XFIX_P_1_501 v0.h[6]
|
||||
#define XFIX_N_1_847 v0.h[7]
|
||||
#define XFIX_N_1_961 v1.h[0]
|
||||
#define XFIX_P_2_053 v1.h[1]
|
||||
#define XFIX_N_2_562 v1.h[2]
|
||||
#define XFIX_P_3_072 v1.h[3]
|
||||
|
||||
/*
 * void ff_fdct_neon(int16_t *block)
 *
 * In-place forward DCT of one 8x8 block of int16_t values.
 *   x0 (DATA): pointer to the 64 values; they are loaded, transformed and
 *              stored back to the same address.
 *   x9 (TMP):  scratch, holds the address of the constant table.
 *
 * Flow: load -> transpose -> 1-D FDCT (outputs carry an extra PASS1_BITS of
 *       scale) -> transpose -> 1-D FDCT (the PASS1_BITS scale is removed)
 *       -> store.
 * Only v0-v7 and v16-v31 are written; v8-v15 are not touched.
 */
function ff_fdct_neon, export=1
|
||||
|
||||
DATA .req x0
|
||||
TMP .req x9
|
||||
|
||||
/* Load constants */
|
||||
movrel TMP, jsimd_fdct_islow_neon_consts
|
||||
ld1 {v0.8h, v1.8h}, [TMP]
|
||||
|
||||
/* Load all DATA into Neon registers with the following allocation:
|
||||
* 0 1 2 3 | 4 5 6 7
|
||||
* ---------+--------
|
||||
* 0 | v16.h[0..3] | v16.h[4..7] | v16.8h
|
||||
* 1 | v17.h[0..3] | v17.h[4..7] | v17.8h
|
||||
* 2 | v18.h[0..3] | v18.h[4..7] | v18.8h
|
||||
* 3 | v19.h[0..3] | v19.h[4..7] | v19.8h
|
||||
* 4 | v20.h[0..3] | v20.h[4..7] | v20.8h
|
||||
* 5 | v21.h[0..3] | v21.h[4..7] | v21.8h
|
||||
* 6 | v22.h[0..3] | v22.h[4..7] | v22.8h
|
||||
* 7 | v23.h[0..3] | v23.h[4..7] | v23.8h
|
||||
* (Row r of the block is held in v(16+r); the d16..d31 pairing used by the
|
||||
* AArch32 version of this table does not apply to AArch64 registers.)
|
||||
*/
|
||||
|
||||
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
|
||||
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
|
||||
sub DATA, DATA, #64 /* rewind: the first ld1 post-incremented DATA by 64; the stores below reuse it */
|
||||
|
||||
/* Pass 1. Assuming transpose_8x8H (from neon.S) is a full 8x8 halfword
 * transpose, each lane then carries one row of the block, so the vector
 * arithmetic below transforms all eight rows at once. v31 and v2 are the
 * macro's last two operands - presumably scratch; confirm in neon.S. */
/* Transpose */
|
||||
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
|
||||
|
||||
/* 1-D FDCT */
|
||||
add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
|
||||
sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
|
||||
add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
|
||||
sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
|
||||
add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
|
||||
sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
|
||||
add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
|
||||
sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
|
||||
|
||||
/* Even part */
|
||||
add v4.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
|
||||
sub v5.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
|
||||
add v6.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
|
||||
sub v7.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
|
||||
|
||||
add v16.8h, v4.8h, v6.8h /* tmp10 + tmp11 */
|
||||
sub v20.8h, v4.8h, v6.8h /* tmp10 - tmp11 */
|
||||
|
||||
add v18.8h, v7.8h, v5.8h /* tmp12 + tmp13 */
|
||||
|
||||
shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
|
||||
shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
|
||||
|
||||
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
|
||||
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
|
||||
mov v22.16b, v18.16b
|
||||
mov v25.16b, v24.16b
|
||||
|
||||
smlal v18.4s, v5.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
|
||||
smlal2 v24.4s, v5.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
|
||||
smlal v22.4s, v7.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
|
||||
smlal2 v25.4s, v7.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
|
||||
|
||||
rshrn v18.4h, v18.4s, #DESCALE_P1
|
||||
rshrn v22.4h, v22.4s, #DESCALE_P1
|
||||
rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
|
||||
rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
|
||||
|
||||
/* Odd part */
|
||||
add v2.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
|
||||
add v3.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
|
||||
add v6.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
|
||||
add v7.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
|
||||
smull v4.4s, v6.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
|
||||
smull2 v5.4s, v6.8h, XFIX_P_1_175
|
||||
smlal v4.4s, v7.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
|
||||
smlal2 v5.4s, v7.8h, XFIX_P_1_175
|
||||
|
||||
smull2 v24.4s, v28.8h, XFIX_P_0_298
|
||||
smull2 v25.4s, v29.8h, XFIX_P_2_053
|
||||
smull2 v26.4s, v30.8h, XFIX_P_3_072
|
||||
smull2 v27.4s, v31.8h, XFIX_P_1_501
|
||||
smull v23.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
|
||||
smull v21.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
|
||||
smull v19.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
|
||||
smull v17.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
|
||||
|
||||
smull2 v28.4s, v2.8h, XFIX_N_0_899
|
||||
smull2 v29.4s, v3.8h, XFIX_N_2_562
|
||||
smull2 v30.4s, v6.8h, XFIX_N_1_961
|
||||
smull2 v31.4s, v7.8h, XFIX_N_0_390
|
||||
smull v2.4s, v2.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
|
||||
smull v3.4s, v3.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
|
||||
smull v6.4s, v6.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
|
||||
smull v7.4s, v7.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
|
||||
|
||||
add v6.4s, v6.4s, v4.4s /* z3 += z5 */
|
||||
add v30.4s, v30.4s, v5.4s
|
||||
add v7.4s, v7.4s, v4.4s /* z4 += z5 */
|
||||
add v31.4s, v31.4s, v5.4s
|
||||
|
||||
add v23.4s, v23.4s, v2.4s /* tmp4 += z1 */
|
||||
add v24.4s, v24.4s, v28.4s
|
||||
add v21.4s, v21.4s, v3.4s /* tmp5 += z2 */
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v19.4s, v19.4s, v6.4s /* tmp6 += z3 */
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
add v17.4s, v17.4s, v7.4s /* tmp7 += z4 */
|
||||
add v27.4s, v27.4s, v31.4s
|
||||
|
||||
add v23.4s, v23.4s, v6.4s /* tmp4 += z3 */
|
||||
add v24.4s, v24.4s, v30.4s
|
||||
add v21.4s, v21.4s, v7.4s /* tmp5 += z4 */
|
||||
add v25.4s, v25.4s, v31.4s
|
||||
add v19.4s, v19.4s, v3.4s /* tmp6 += z2 */
|
||||
add v26.4s, v26.4s, v29.4s
|
||||
add v17.4s, v17.4s, v2.4s /* tmp7 += z1 */
|
||||
add v27.4s, v27.4s, v28.4s
|
||||
|
||||
rshrn v23.4h, v23.4s, #DESCALE_P1
|
||||
rshrn v21.4h, v21.4s, #DESCALE_P1
|
||||
rshrn v19.4h, v19.4s, #DESCALE_P1
|
||||
rshrn v17.4h, v17.4s, #DESCALE_P1
|
||||
rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
|
||||
rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
|
||||
rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
|
||||
rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
|
||||
|
||||
/* Pass 2. Transposing back puts one row per register again, so the same
 * butterfly now runs along the other dimension. It differs from pass 1
 * only in the final scaling: srshr by PASS1_BITS and DESCALE_P2 instead
 * of shl by PASS1_BITS and DESCALE_P1. */
/* Transpose */
|
||||
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
|
||||
|
||||
/* 1-D FDCT */
|
||||
add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
|
||||
sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
|
||||
add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
|
||||
sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
|
||||
add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
|
||||
sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
|
||||
add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
|
||||
sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
|
||||
|
||||
/* Even part */
|
||||
add v4.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
|
||||
sub v5.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
|
||||
add v6.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
|
||||
sub v7.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
|
||||
|
||||
add v16.8h, v4.8h, v6.8h /* tmp10 + tmp11 */
|
||||
sub v20.8h, v4.8h, v6.8h /* tmp10 - tmp11 */
|
||||
|
||||
add v18.8h, v7.8h, v5.8h /* tmp12 + tmp13 */
|
||||
|
||||
srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
|
||||
srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
|
||||
|
||||
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
|
||||
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
|
||||
mov v22.16b, v18.16b
|
||||
mov v25.16b, v24.16b
|
||||
|
||||
smlal v18.4s, v5.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
|
||||
smlal2 v24.4s, v5.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
|
||||
smlal v22.4s, v7.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
|
||||
smlal2 v25.4s, v7.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
|
||||
|
||||
rshrn v18.4h, v18.4s, #DESCALE_P2
|
||||
rshrn v22.4h, v22.4s, #DESCALE_P2
|
||||
rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
|
||||
rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
|
||||
|
||||
/* Odd part */
|
||||
add v2.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
|
||||
add v3.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
|
||||
add v6.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
|
||||
add v7.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
|
||||
|
||||
smull v4.4s, v6.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
|
||||
smull2 v5.4s, v6.8h, XFIX_P_1_175
|
||||
smlal v4.4s, v7.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
|
||||
smlal2 v5.4s, v7.8h, XFIX_P_1_175
|
||||
|
||||
smull2 v24.4s, v28.8h, XFIX_P_0_298
|
||||
smull2 v25.4s, v29.8h, XFIX_P_2_053
|
||||
smull2 v26.4s, v30.8h, XFIX_P_3_072
|
||||
smull2 v27.4s, v31.8h, XFIX_P_1_501
|
||||
smull v23.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
|
||||
smull v21.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
|
||||
smull v19.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
|
||||
smull v17.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
|
||||
|
||||
smull2 v28.4s, v2.8h, XFIX_N_0_899
|
||||
smull2 v29.4s, v3.8h, XFIX_N_2_562
|
||||
smull2 v30.4s, v6.8h, XFIX_N_1_961
|
||||
smull2 v31.4s, v7.8h, XFIX_N_0_390
|
||||
smull v2.4s, v2.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
|
||||
smull v3.4s, v3.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
|
||||
smull v6.4s, v6.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
|
||||
smull v7.4s, v7.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
|
||||
|
||||
add v6.4s, v6.4s, v4.4s /* z3 += z5 */
|
||||
add v30.4s, v30.4s, v5.4s
|
||||
add v7.4s, v7.4s, v4.4s /* z4 += z5 */
|
||||
add v31.4s, v31.4s, v5.4s
|
||||
|
||||
add v23.4s, v23.4s, v2.4s /* tmp4 += z1 */
|
||||
add v24.4s, v24.4s, v28.4s
|
||||
add v21.4s, v21.4s, v3.4s /* tmp5 += z2 */
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v19.4s, v19.4s, v6.4s /* tmp6 += z3 */
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
add v17.4s, v17.4s, v7.4s /* tmp7 += z4 */
|
||||
add v27.4s, v27.4s, v31.4s
|
||||
|
||||
add v23.4s, v23.4s, v6.4s /* tmp4 += z3 */
|
||||
add v24.4s, v24.4s, v30.4s
|
||||
add v21.4s, v21.4s, v7.4s /* tmp5 += z4 */
|
||||
add v25.4s, v25.4s, v31.4s
|
||||
add v19.4s, v19.4s, v3.4s /* tmp6 += z2 */
|
||||
add v26.4s, v26.4s, v29.4s
|
||||
add v17.4s, v17.4s, v2.4s /* tmp7 += z1 */
|
||||
add v27.4s, v27.4s, v28.4s
|
||||
|
||||
rshrn v23.4h, v23.4s, #DESCALE_P2
|
||||
rshrn v21.4h, v21.4s, #DESCALE_P2
|
||||
rshrn v19.4h, v19.4s, #DESCALE_P2
|
||||
rshrn v17.4h, v17.4s, #DESCALE_P2
|
||||
rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
|
||||
rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
|
||||
rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
|
||||
rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
|
||||
|
||||
/* Store results */
|
||||
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
|
||||
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
|
||||
|
||||
ret
|
||||
|
||||
.unreq DATA
|
||||
.unreq TMP
|
||||
endfunc
|
||||
|
||||
#undef XFIX_P_0_298
|
||||
#undef XFIX_N_0_390
|
||||
#undef XFIX_P_0_541
|
||||
#undef XFIX_P_0_765
|
||||
#undef XFIX_N_0_899
|
||||
#undef XFIX_P_1_175
|
||||
#undef XFIX_P_1_501
|
||||
#undef XFIX_N_1_847
|
||||
#undef XFIX_N_1_961
|
||||
#undef XFIX_P_2_053
|
||||
#undef XFIX_N_2_562
|
||||
#undef XFIX_P_3_072
|
@ -1538,6 +1538,7 @@ typedef struct AVCodecContext {
|
||||
#define FF_DCT_MMX 3
|
||||
#define FF_DCT_ALTIVEC 5
|
||||
#define FF_DCT_FAAN 6
|
||||
#define FF_DCT_NEON 7
|
||||
|
||||
/**
|
||||
* IDCT algorithm, see FF_IDCT_* below.
|
||||
|
@ -42,7 +42,9 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
|
||||
c->fdct248 = ff_fdct248_islow_8;
|
||||
}
|
||||
|
||||
#if ARCH_PPC
|
||||
#if ARCH_AARCH64
|
||||
ff_fdctdsp_init_aarch64(c, avctx, high_bit_depth);
|
||||
#elif ARCH_PPC
|
||||
ff_fdctdsp_init_ppc(c, avctx, high_bit_depth);
|
||||
#elif ARCH_X86
|
||||
ff_fdctdsp_init_x86(c, avctx, high_bit_depth);
|
||||
|
@ -32,6 +32,8 @@ typedef struct FDCTDSPContext {
|
||||
|
||||
FF_VISIBILITY_PUSH_HIDDEN
|
||||
void ff_fdctdsp_init(FDCTDSPContext *c, struct AVCodecContext *avctx);
|
||||
void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, struct AVCodecContext *avctx,
|
||||
unsigned high_bit_depth);
|
||||
void ff_fdctdsp_init_ppc(FDCTDSPContext *c, struct AVCodecContext *avctx,
|
||||
unsigned high_bit_depth);
|
||||
void ff_fdctdsp_init_x86(FDCTDSPContext *c, struct AVCodecContext *avctx,
|
||||
|
@ -158,6 +158,7 @@ static const AVOption avcodec_options[] = {
|
||||
{"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
|
||||
{"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
|
||||
{"faan", "floating point AAN DCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
|
||||
{"neon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_NEON }, INT_MIN, INT_MAX, V|E, .unit = "dct"},
|
||||
{"lumi_mask", "compresses bright areas stronger than medium ones", OFFSET(lumi_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
|
||||
{"tcplx_mask", "temporal complexity masking", OFFSET(temporal_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
|
||||
{"scplx_mask", "spatial complexity masking", OFFSET(spatial_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
|
||||
|
@ -19,9 +19,11 @@
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavcodec/aarch64/fdct.h"
|
||||
#include "libavcodec/aarch64/idct.h"
|
||||
|
||||
/*
 * AArch64-specific forward-DCT entries: ff_fdct_neon is listed under the
 * name "neon" with FF_IDCT_PERM_NONE and AV_CPU_FLAG_NEON. The list ends
 * with an all-zero sentinel entry.
 * NOTE(review): struct algo is declared outside this file; the meaning of
 * each positional field should be confirmed against that declaration.
 */
static const struct algo fdct_tab_arch[] = {
|
||||
{ "neon", ff_fdct_neon, FF_IDCT_PERM_NONE, AV_CPU_FLAG_NEON },
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user