mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-20 14:20:51 +00:00
hevcdsp: ARM NEON optimized deblocking filter
cherry picked from commit 1b9ee47d2f43b0a029a9468233626102eb1473b8 Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
9a2f5d825a
commit
0c494114cc
@ -132,6 +132,8 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
|
||||
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
|
||||
arm/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
|
||||
arm/hevcdsp_deblock_neon.o
|
||||
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
|
||||
arm/rv40dsp_neon.o
|
||||
|
385
libavcodec/arm/hevcdsp_deblock_neon.S
Normal file
385
libavcodec/arm/hevcdsp_deblock_neon.S
Normal file
@ -0,0 +1,385 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
/*
 * Prologue shared by the two chroma entry points.
 *
 * Loads two 32-bit values from [r2] and [r2, #4] into r12 and r3 (the _tc
 * array of the C prototypes, assuming the standard ARM argument registers)
 * and returns to the caller immediately when their sum is zero.
 * r2 and r3 are overwritten, so whatever the caller passed in r3 is never
 * looked at by the chroma code.
 */
.macro hevc_loop_filter_chroma_start
|
||||
/* r12 = first value */
ldr r12, [r2]
|
||||
/* r3 = second value */
ldr r3, [r2, #4]
|
||||
/* r2 = sum of both; zero is taken to mean "nothing to filter" */
add r2, r3, r12
|
||||
cmp r2, #0
|
||||
/* IT block so the conditional return is also valid in Thumb-2 */
it eq
|
||||
bxeq lr
|
||||
.endm
|
||||
|
||||
/*
 * Chroma edge filter core, shared by the vertical and horizontal entry points.
 *
 * Inputs:  d18, d2, d4, d19 - four lines of 8 pixels across the edge, in the
 *          order the callers load them (d2/d4 are the two lines adjacent to
 *          the edge, d18/d19 the next ones out);
 *          r12, r3 - the two values loaded by hevc_loop_filter_chroma_start,
 *          used as the clamp limit for lanes 0-3 and lanes 4-7 respectively.
 * Outputs: d2 and d4, updated as
 *              delta = clamp((4 * (d4 - d2) + d18 - d19 + 4) >> 3, -limit, limit)
 *              d2    = sat_u8(d2 + delta)
 *              d4    = sat_u8(d4 - delta)
 * Scratch: q0, q1, q2, q3, q11, q12.
 *
 * Only caller-saved NEON registers are used as scratch. The chroma entry
 * points do not save d8-d15, so q4-q7 must not be touched here.
 */
.macro hevc_loop_filter_chroma_body
        /* q3 = d4 - d2, q11 = d18 - d19, both widened to 16 bit */
        vsubl.u8    q3,  d4,  d2
        vsubl.u8    q11, d18, d19
        /* q11 = 4 * (d4 - d2) + (d18 - d19) */
        vshl.i16    q3,  #2
        vadd.i16    q11, q3
        /* q0 = per-lane clamp limit: r12 for lanes 0-3, r3 for lanes 4-7 */
        vdup.16     d0,  r12
        vdup.16     d1,  r3
        /* rounding shift: (x + 4) >> 3 */
        vrshr.s16   q11, q11, #3
        /* q12 = -limit */
        vneg.s16    q12, q0
        vmovl.u8    q2,  d4
        /* clamp delta to [-limit, limit] */
        vmin.s16    q11, q11, q0
        vmax.s16    q11, q11, q12
        /* apply delta with opposite signs on the two sides of the edge */
        vaddw.u8    q1,  q11, d2
        vsub.i16    q2,  q11
        /* back to 8 bit with unsigned saturation */
        vqmovun.s16 d2,  q1
        vqmovun.s16 d4,  q2
.endm
|
||||
|
||||
/*
 * Prologue shared by the two luma entry points.
 *
 * Loads two 32-bit values from [r3] and [r3, #4] into r12 and r3 (the _tc
 * array of the C prototypes, assuming the standard ARM argument registers)
 * and returns to the caller immediately when both are zero.
 * The test packs the second value into the upper halfword of r3 and ORs in
 * the first one; the final shift brings r3 back down. This presumably relies
 * on both values fitting in 16 bits - TODO confirm against the callers.
 */
.macro hevc_loop_filter_luma_start
|
||||
/* r12 = first value */
ldr r12, [r3]
|
||||
/* r3 = second value (the pointer in r3 is gone after this) */
ldr r3, [r3, #4]
|
||||
/* r3 = (second << 16) | first, zero only if there is nothing to filter */
lsl r3, #16
|
||||
orr r3, r12
|
||||
cmp r3, #0
|
||||
/* IT block so the conditional return is also valid in Thumb-2 */
it eq
|
||||
bxeq lr
|
||||
/* undo the packing shift so r3 can be broadcast with vdup.16 later */
lsr r3, #16
|
||||
.endm
|
||||
|
||||
/*
 * Luma edge filter core, shared by the vertical and horizontal entry points.
 *
 * Inputs:  d16, d18, d20, d22, d24, d26, d28, d30 - eight lines of 8 pixels
 *          across the edge, in the order the callers load them (presumably
 *          p3, p2, p1, p0, q0, q1, q2, q3 in HEVC deblocking terms - TODO
 *          confirm against the C reference);
 *          r2  - threshold broadcast to all lanes (_beta of the C prototype,
 *                assuming the standard ARM argument registers);
 *          r12, r3 - the two values left by hevc_loop_filter_luma_start,
 *                broadcast to lanes 0-3 and lanes 4-7 respectively
 *                (called "tc" in the comments below).
 * Outputs: the same eight d registers, narrowed back to 8 bit with unsigned
 *          saturation.
 * Scratch: q0-q7 and r5-r9. q4-q7 alias d8-d15; both callers save those with
 *          vpush before expanding this macro.
 *
 * Control flow: when no lane passes the first threshold test the macro
 * branches to the plain label "bypasswrite". That label is not generated per
 * expansion; the only definition in this file sits in front of the epilogue of
 * ff_hevc_h_loop_filter_luma_neon, so every expansion leaves through that
 * epilogue. The weakfilter_ and ready_ labels carry the macro expansion
 * counter and are unique per expansion.
 */
.macro hevc_loop_filter_luma_body
|
||||
/* widen all eight lines from u8 to 16-bit lanes: d16..d30 -> q8..q15 */
vmovl.u8 q8, d16
|
||||
vmovl.u8 q9, d18
|
||||
vmovl.u8 q10, d20
|
||||
vmovl.u8 q11, d22
|
||||
vmovl.u8 q12, d24
|
||||
vmovl.u8 q13, d26
|
||||
vmovl.u8 q14, d28
|
||||
vmovl.u8 q15, d30
|
||||
|
||||
/* q7 = |q9 - 2*q10 + q11|, q6 = |q14 - 2*q13 + q12| (activity on each side of the edge) */
vadd.i16 q7, q9, q11
|
||||
vadd.i16 q6, q14, q12
|
||||
vsub.i16 q7, q10
|
||||
vsub.i16 q6, q13
|
||||
vabd.s16 q7, q7, q10
|
||||
vabd.s16 q6, q6, q13
|
||||
|
||||
|
||||
/* q0 = r2 in every lane */
vdup.16 q0, r2
|
||||
vmov q4, q7
|
||||
vmov q5, q6
|
||||
/* d4 = r12 in every lane (d5 = r3 follows below, giving q2 = tc per lane) */
vdup.16 d4, r12
|
||||
/* vtrn plus the 64-bit shifts/ORs below regroup the 16-bit lanes of q7 and q6 before they are summed */
vtrn.16 q7, q4
|
||||
vtrn.16 q6, q5
|
||||
|
||||
vshl.u64 q7, #32
|
||||
vshr.u64 q4, #32
|
||||
vshl.u64 q6, #32
|
||||
vshr.u64 q5, #32
|
||||
vshr.u64 q7, #32
|
||||
vshr.u64 q6, #32
|
||||
vshl.u64 q5, #32
|
||||
vshl.u64 q4, #32
|
||||
vorr q6, q5
|
||||
vorr q7, q4
|
||||
vdup.16 d5, r3
|
||||
/* q5 = q7 + q6 */
vadd.i16 q5, q7, q6
|
||||
|
||||
/* q3 and q4 start as copies of q5; vtrn.32 + add leaves in q4 the sum of each pair of adjacent 32-bit elements, duplicated */
vmov q4, q5
|
||||
vmov q3, q5
|
||||
vtrn.32 q3, q4
|
||||
|
||||
vadd.i16 q4, q3
|
||||
|
||||
/* q5 = 2 * q5; q3 = lane mask of (q0 > q4) */
vshl.s16 q5, q5, #1
|
||||
vcgt.s16 q3, q0, q4
|
||||
|
||||
/* the mask is narrowed twice on purpose (the second vmovn re-reads the d6 just written) so that s12 ends up with four mask bytes */
vmovn.i16 d6, q3
|
||||
/* q1 = q0 >> 2 */
vshr.s16 q1, q0, #2
|
||||
vmovn.i16 d6, q3
|
||||
/* q5 = lane mask of (q1 > q5) */
vcgt.s16 q5, q1, q5
|
||||
/* r7 = packed "filter this part at all" mask; zero means nothing to do */
vmov r7, s12
|
||||
cmp r7, #0
|
||||
/* leaves the macro; see the header comment about where this label lives */
beq bypasswrite
|
||||
|
||||
/* pairwise sums of the 32-bit elements of q7/q6 into q0; narrowed and moved to r5/r6 below for use in the weak path */
vpadd.i32 d0, d14, d12
|
||||
vpadd.i32 d1, d15, d13
|
||||
/* q4 = tc; q2 = vrhadd(4*tc, tc) = (5*tc + 1) >> 1; q1 becomes q0 >> 3 overall */
vmov q4, q2
|
||||
vshl.s16 q2, #2
|
||||
vshr.s16 q1, q1, #1
|
||||
vrhadd.s16 q2, q4
|
||||
|
||||
/* q7 = |q8 - q11| + |q15 - q12| */
vabd.s16 q7, q8, q11
|
||||
vaba.s16 q7, q15, q12
|
||||
|
||||
vmovn.i32 d0, q0
|
||||
vmov r5, r6, s0, s1
|
||||
/* q5 &= (q1 > q7) */
vcgt.s16 q6, q1, q7
|
||||
vand q5, q5, q6
|
||||
/* q5 &= (q2 > |q11 - q12|) */
vabd.s16 q7, q11, q12
|
||||
vcgt.s16 q6, q2, q7
|
||||
vand q5, q5, q6
|
||||
|
||||
/* regroup and AND together lanes of the q5 mask, then pack it into r8 the same way r7 was built (presumably the strong-filter decision - TODO confirm) */
vmov q2, q5
|
||||
vtrn.s16 q5, q2
|
||||
vshr.u64 q2, #32
|
||||
vshl.u64 q5, #32
|
||||
vshl.u64 q2, #32
|
||||
vshr.u64 q5, #32
|
||||
vorr q5, q2
|
||||
|
||||
vmov q2, q5
|
||||
/* q7 = 2 * tc (upper clamp bound for the strong path) */
vshl.i16 q7, q4, #1
|
||||
vtrn.32 q2, q5
|
||||
vand q5, q2
|
||||
/* q6 = -2 * tc (lower clamp bound) */
vneg.s16 q6, q7
|
||||
vmovn.i16 d4, q5
|
||||
vmovn.i16 d4, q2
|
||||
vmov r8, s8
|
||||
|
||||
/* r9 = parts that are filtered at all (r7) and selected by r8; none -> skip to the weak path */
and r9, r8, r7
|
||||
cmp r9, #0
|
||||
beq weakfilter_\@
|
||||
|
||||
/*
 * Path taken for the parts selected in r9. New values for q9, q10, q11 and
 * q12, q13, q14 are computed from sums of neighbouring lines with rounding
 * shifts; each difference to the old value is clamped to [q6, q7] and the
 * result is merged with vbit under the mask q5 (d10 = low halfword of r9,
 * d11 = high halfword of r9, each broadcast to four lanes). The two sides
 * are interleaved instruction by instruction.
 */
vadd.i16 q2, q11, q12
|
||||
vadd.i16 q4, q9, q8
|
||||
vadd.i16 q1, q2, q10
|
||||
vdup.16 d10, r9
|
||||
vadd.i16 q0, q1, q9
|
||||
vshl.i16 q4, #1
|
||||
lsr r9, #16
|
||||
vadd.i16 q1, q0
|
||||
vrshr.s16 q3, q0, #2
|
||||
vadd.i16 q1, q13
|
||||
vadd.i16 q4, q0
|
||||
vsub.i16 q3, q10
|
||||
vrshr.s16 q1, #3
|
||||
vrshr.s16 q4, #3
|
||||
vmax.s16 q3, q6
|
||||
vsub.i16 q1, q11
|
||||
vsub.i16 q4, q9
|
||||
vmin.s16 q3, q7
|
||||
vmax.s16 q4, q6
|
||||
vmax.s16 q1, q6
|
||||
vadd.i16 q3, q10
|
||||
vmin.s16 q4, q7
|
||||
vmin.s16 q1, q7
|
||||
vdup.16 d11, r9
|
||||
vadd.i16 q4, q9
|
||||
vadd.i16 q1, q11
|
||||
vbit q9, q4, q5
|
||||
vadd.i16 q4, q2, q13
|
||||
vbit q11, q1, q5
|
||||
vadd.i16 q0, q4, q14
|
||||
vadd.i16 q2, q15, q14
|
||||
vadd.i16 q4, q0
|
||||
|
||||
vshl.i16 q2, #1
|
||||
vadd.i16 q4, q10
|
||||
vbit q10, q3, q5
|
||||
vrshr.s16 q4, #3
|
||||
vadd.i16 q2, q0
|
||||
vrshr.s16 q3, q0, #2
|
||||
vsub.i16 q4, q12
|
||||
vrshr.s16 q2, #3
|
||||
vsub.i16 q3, q13
|
||||
vmax.s16 q4, q6
|
||||
vsub.i16 q2, q14
|
||||
vmax.s16 q3, q6
|
||||
vmin.s16 q4, q7
|
||||
vmax.s16 q2, q6
|
||||
vmin.s16 q3, q7
|
||||
vadd.i16 q4, q12
|
||||
vmin.s16 q2, q7
|
||||
vadd.i16 q3, q13
|
||||
vbit q12, q4, q5
|
||||
vadd.i16 q2, q14
|
||||
vbit q13, q3, q5
|
||||
vbit q14, q2, q5
|
||||
|
||||
weakfilter_\@:
|
||||
/* r9 = parts that are filtered at all (r7) but were NOT selected by r8; none -> done */
mvn r8, r8
|
||||
and r9, r8, r7
|
||||
cmp r9, #0
|
||||
beq ready_\@
|
||||
|
||||
/* q4 = (r2 + (r2 >> 1)) >> 3 once the next few q4/q1 instructions have run */
vdup.16 q4, r2
|
||||
|
||||
/* q5 = mask for this path: d10 = low halfword of r9, d11 = high halfword */
vdup.16 d10, r9
|
||||
lsr r9, #16
|
||||
vmov q1, q4
|
||||
vdup.16 d11, r9
|
||||
vshr.s16 q1, #1
|
||||
/* q2 = (9 * (q12 - q11) - 3 * (q13 - q10) + 8) >> 4, built up over the next instructions */
vsub.i16 q2, q12, q11
|
||||
vadd.i16 q4, q1
|
||||
vshl.s16 q0, q2, #3
|
||||
vshr.s16 q4, #3
|
||||
vadd.i16 q2, q0
|
||||
vsub.i16 q0, q13, q10
|
||||
vsub.i16 q2, q0
|
||||
vshl.i16 q0, q0, #1
|
||||
vsub.i16 q2, q0
|
||||
/* q1 = 5 * q7, i.e. 10 * tc since q7 still holds 2 * tc here */
vshl.s16 q1, q7, 2
|
||||
vrshr.s16 q2, q2, #4
|
||||
vadd.i16 q1, q7
|
||||
vabs.s16 q3, q2
|
||||
/* q6 = -tc */
vshr.s16 q6, q6, #1
|
||||
/* q5 &= (10 * tc > |q2|) */
vcgt.s16 q1, q1, q3
|
||||
vand q5, q1
|
||||
/* q7 = tc; clamp q2 to [-tc, tc] */
vshr.s16 q7, q7, #1
|
||||
vmax.s16 q2, q2, q6
|
||||
vmin.s16 q2, q2, q7
|
||||
|
||||
/* q7 = tc >> 1 and q6 = -(tc >> 1): clamp bounds for the q10 / q13 corrections */
vshr.s16 q7, q7, #1
|
||||
/* q10 correction: hadd(rhadd(q9, q11) - q10, q2), clamped, applied where q4 > (low halfword of r5/r6) and q5 is set */
vrhadd.s16 q3, q9, q11
|
||||
vneg.s16 q6, q7
|
||||
vsub.s16 q3, q10
|
||||
vdup.16 d2, r5
|
||||
vhadd.s16 q3, q2
|
||||
vdup.16 d3, r6
|
||||
vmax.s16 q3, q3, q6
|
||||
vcgt.s16 q1, q4, q1
|
||||
vmin.s16 q3, q3, q7
|
||||
vand q1, q5
|
||||
vadd.i16 q3, q10
|
||||
/* switch r5/r6 to their upper halfwords for the other side of the edge */
lsr r5, #16
|
||||
lsr r6, #16
|
||||
vbit q10, q3, q1
|
||||
|
||||
/* q13 correction: same scheme mirrored, with hsub instead of hadd */
vrhadd.s16 q3, q14, q12
|
||||
vdup.16 d2, r5
|
||||
vsub.s16 q3, q13
|
||||
vdup.16 d3, r6
|
||||
vhsub.s16 q3, q2
|
||||
vcgt.s16 q1, q4, q1
|
||||
vmax.s16 q3, q3, q6
|
||||
vand q1, q5
|
||||
vmin.s16 q3, q3, q7
|
||||
vadd.i16 q3, q13
|
||||
vbit q13, q3, q1
|
||||
/* q11 += q2 and q12 -= q2 under mask q5 */
vadd.i16 q0, q11, q2
|
||||
vsub.i16 q4, q12, q2
|
||||
vbit q11, q0, q5
|
||||
vbit q12, q4, q5
|
||||
|
||||
ready_\@:
|
||||
/* narrow all eight lines back to u8 with unsigned saturation */
vqmovun.s16 d16, q8
|
||||
vqmovun.s16 d18, q9
|
||||
vqmovun.s16 d20, q10
|
||||
vqmovun.s16 d22, q11
|
||||
vqmovun.s16 d24, q12
|
||||
vqmovun.s16 d26, q13
|
||||
vqmovun.s16 d28, q14
|
||||
vqmovun.s16 d30, q15
|
||||
.endm
|
||||
|
||||
/*
 * Luma deblocking across a vertical edge; C prototype is declared in
 * hevcdsp_init_neon.c.
 *
 * Register use (assuming the standard ARM argument registers): r0 = _pix,
 * r1 = _stride, r2 = _beta, r3 = _tc. _no_p / _no_q are never read.
 *
 * An 8x8 byte block starting 4 bytes to the left of r0 is loaded row by row,
 * transposed so each d register holds one column, filtered, transposed back
 * and all eight rows are stored again.
 *
 * Note: when hevc_loop_filter_luma_body takes its "bypasswrite" exit, control
 * continues at that label inside ff_hevc_h_loop_filter_luma_neon, whose
 * epilogue restores the same registers that are saved here.
 */
function ff_hevc_v_loop_filter_luma_neon, export=1
|
||||
/* loads the two tc values into r12/r3; returns early if both are zero */
hevc_loop_filter_luma_start
|
||||
/* r5-r9 and d8-d15 (q4-q7) are used as scratch by the filter body */
push {r5-r11}
|
||||
vpush {d8-d15}
|
||||
/* start 4 pixels left of the edge */
sub r0, #4
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d20}, [r0], r1
|
||||
vld1.8 {d22}, [r0], r1
|
||||
vld1.8 {d24}, [r0], r1
|
||||
vld1.8 {d26}, [r0], r1
|
||||
vld1.8 {d28}, [r0], r1
|
||||
vld1.8 {d30}, [r0], r1
|
||||
/* rewind r0 by the 8 rows just read */
sub r0, r0, r1, lsl #3
|
||||
/* rows -> columns (transpose_8x8 is not defined in this file; presumably it comes from neon.S) */
transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
|
||||
hevc_loop_filter_luma_body
|
||||
/* columns -> rows again before storing */
transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d20}, [r0], r1
|
||||
vst1.8 {d22}, [r0], r1
|
||||
vst1.8 {d24}, [r0], r1
|
||||
vst1.8 {d26}, [r0], r1
|
||||
vst1.8 {d28}, [r0], r1
|
||||
vst1.8 {d30}, [r0]
|
||||
vpop {d8-d15}
|
||||
pop {r5-r11}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
/*
 * Luma deblocking across a horizontal edge; C prototype is declared in
 * hevcdsp_init_neon.c.
 *
 * Register use (assuming the standard ARM argument registers): r0 = _pix,
 * r1 = _stride, r2 = _beta, r3 = _tc. _no_p / _no_q are never read.
 *
 * Eight rows of 8 bytes are loaded, starting 4 rows above r0. No transpose is
 * needed because each d register already holds one line parallel to the edge.
 * Only the six inner rows (d18..d28) are written back; the first and last
 * loaded rows are left untouched in memory.
 *
 * The "bypasswrite" label in front of the epilogue is the branch target used
 * by every expansion of hevc_loop_filter_luma_body, including the one in
 * ff_hevc_v_loop_filter_luma_neon.
 */
function ff_hevc_h_loop_filter_luma_neon, export=1
|
||||
/* loads the two tc values into r12/r3; returns early if both are zero */
hevc_loop_filter_luma_start
|
||||
/* r5-r9 and d8-d15 (q4-q7) are used as scratch by the filter body */
push {r5-r11}
|
||||
vpush {d8-d15}
|
||||
/* start 4 rows above the edge */
sub r0, r0, r1, lsl #2
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d20}, [r0], r1
|
||||
vld1.8 {d22}, [r0], r1
|
||||
vld1.8 {d24}, [r0], r1
|
||||
vld1.8 {d26}, [r0], r1
|
||||
vld1.8 {d28}, [r0], r1
|
||||
vld1.8 {d30}, [r0], r1
|
||||
/* rewind 8 rows, then step down one: r0 now points at the row held in d18 */
sub r0, r0, r1, lsl #3
|
||||
add r0, r1
|
||||
hevc_loop_filter_luma_body
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d20}, [r0], r1
|
||||
vst1.8 {d22}, [r0], r1
|
||||
vst1.8 {d24}, [r0], r1
|
||||
vst1.8 {d26}, [r0], r1
|
||||
vst1.8 {d28}, [r0]
|
||||
/* shared exit: reached by fall-through and by the "nothing to filter" branch in the body macro */
bypasswrite:
|
||||
vpop {d8-d15}
|
||||
pop {r5-r11}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
/*
 * Chroma deblocking across a vertical edge; C prototype is declared in
 * hevcdsp_init_neon.c.
 *
 * Register use (assuming the standard ARM argument registers): r0 = _pix,
 * r1 = _stride, r2 = _tc. The value passed in r3 is overwritten by the start
 * macro without being read.
 *
 * An 8x8 byte block starting 4 bytes to the left of r0 is loaded and
 * transposed; after the transpose d18, d2, d4, d19 hold the four columns
 * around the edge, which is what hevc_loop_filter_chroma_body operates on.
 * The block is transposed back and all eight rows are stored.
 * Nothing is pushed to the stack in this function.
 */
function ff_hevc_v_loop_filter_chroma_neon, export=1
|
||||
/* loads the two tc values into r12/r3; returns early if their sum is zero */
hevc_loop_filter_chroma_start
|
||||
/* start 4 pixels left of the edge */
sub r0, #4
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d17}, [r0], r1
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d19}, [r0], r1
|
||||
vld1.8 {d20}, [r0], r1
|
||||
vld1.8 {d21}, [r0], r1
|
||||
/* rewind r0 by the 8 rows just read */
sub r0, r0, r1, lsl #3
|
||||
/* rows -> columns (transpose_8x8 is not defined in this file; presumably it comes from neon.S) */
transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
|
||||
hevc_loop_filter_chroma_body
|
||||
/* columns -> rows again before storing */
transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d17}, [r0], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d2}, [r0], r1
|
||||
vst1.8 {d4}, [r0], r1
|
||||
vst1.8 {d19}, [r0], r1
|
||||
vst1.8 {d20}, [r0], r1
|
||||
vst1.8 {d21}, [r0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
/*
 * Chroma deblocking across a horizontal edge; C prototype is declared in
 * hevcdsp_init_neon.c.
 *
 * Register use (assuming the standard ARM argument registers): r0 = _pix,
 * r1 = _stride, r2 = _tc. The value passed in r3 is overwritten by the start
 * macro without being read.
 *
 * Four rows of 8 bytes are loaded, starting 2 rows above r0, into d18, d2,
 * d4, d19. Only the two rows next to the edge (d2 and d4) are written back.
 * Nothing is pushed to the stack in this function.
 */
function ff_hevc_h_loop_filter_chroma_neon, export=1
|
||||
/* loads the two tc values into r12/r3; returns early if their sum is zero */
hevc_loop_filter_chroma_start
|
||||
/* start 2 rows above the edge */
sub r0, r0, r1, lsl #1
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d4}, [r0], r1
|
||||
/* last load has no post-increment, so r0 stays on the fourth row */
vld1.8 {d19}, [r0]
|
||||
/* back up 2 rows: r0 now points at the row held in d2 */
sub r0, r0, r1, lsl #1
|
||||
hevc_loop_filter_chroma_body
|
||||
vst1.8 {d2}, [r0], r1
|
||||
vst1.8 {d4}, [r0]
|
||||
bx lr
|
||||
endfunc
|
48
libavcodec/arm/hevcdsp_init_neon.c
Normal file
48
libavcodec/arm/hevcdsp_init_neon.c
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/hevcdsp.h"
|
||||
|
||||
/*
 * Entry points of the NEON deblocking filters. They are implemented in
 * assembly (libavcodec/arm/hevcdsp_deblock_neon.S) and installed into the
 * HEVCDSPContext by hevcdsp_init_neon() below.
 * NOTE(review): the assembly only reads _pix, _stride, _beta (luma only) and
 * the first two elements of _tc; _no_p and _no_q are not consulted there.
 */
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
|
||||
|
||||
/**
 * Install the NEON deblocking filters into the DSP context.
 *
 * Only the 8-bit code paths exist in assembly, so the context is left
 * untouched for any other bit depth. The whole body compiles away when the
 * build has no NEON support (HAVE_NEON == 0).
 *
 * @param c         DSP context whose loop-filter pointers are overridden
 * @param bit_depth sample bit depth the context is being set up for
 */
static av_cold void hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
#if HAVE_NEON
    if (bit_depth != 8)
        return;

    c->hevc_v_loop_filter_luma   = ff_hevc_v_loop_filter_luma_neon;
    c->hevc_h_loop_filter_luma   = ff_hevc_h_loop_filter_luma_neon;
    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
#endif /* HAVE_NEON */
}
|
||||
|
||||
/**
 * ARM entry point for HEVC DSP initialisation.
 *
 * Queries the runtime CPU flags and, when NEON is reported, lets
 * hevcdsp_init_neon() override the function pointers in the context.
 * Without NEON the context is returned unchanged.
 *
 * @param c         DSP context to update
 * @param bit_depth sample bit depth the context is being set up for
 */
void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
{
    const int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    hevcdsp_init_neon(c, bit_depth);
}
|
@ -259,4 +259,6 @@ int i = 0;
|
||||
|
||||
if (ARCH_X86)
|
||||
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
|
||||
if (ARCH_ARM)
|
||||
ff_hevcdsp_init_arm(hevcdsp, bit_depth);
|
||||
}
|
||||
|
@ -128,5 +128,5 @@ extern const int8_t ff_hevc_epel_filters[7][4];
|
||||
extern const int8_t ff_hevc_qpel_filters[3][16];
|
||||
|
||||
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
|
||||
|
||||
void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth);
|
||||
#endif /* AVCODEC_HEVCDSP_H */
|
||||
|
Loading…
Reference in New Issue
Block a user