avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
tests/checkasm/checkasm:                 C       LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:          2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:          6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:          10.7    2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:         23.0    5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:         41.0    8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:         91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:         161.7   32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:         354.5   73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:         641.5   130.0   64.2

Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads is 1fps (47fps --> 48fps).

Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
commit a28eea2a27
parent cfbdda607d
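
For reference, the per-pixel operation that the new routines vectorize is the 8-bit weighted uni-prediction copy. Below is a rough C sketch of that operation (the function and variable names are illustrative, not the exact reference implementation): the source sample is scaled to 14-bit precision (<< 6 for 8-bit input), weighted by wx, rounded and shifted down by denom + 6, offset by ox, and clipped back to 8 bits — exactly the slli/mul/add/sra/add/ssrani sequence seen in the assembly.

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Sketch of the scalar 8-bit pel_uni_w_pixels operation. */
static void pel_uni_w_pixels_c(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int width, int height,
                               int denom, int wx, int ox)
{
    const int shift  = denom + 6;          /* 14 - bit_depth(8) + denom */
    const int offset = 1 << (shift - 1);   /* rounding term */

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = av_clip_uint8((((src[x] << 6) * wx + offset) >> shift) + ox);
        src += src_stride;
        dst += dst_stride;
    }
}
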
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o \
-                                         loongarch/hevc_add_res.o
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
471  libavcodec/loongarch/hevc_mc.S  (new file)
@@ -0,0 +1,471 @@
/*
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by jinbo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

.macro LOAD_VAR bit
    addi.w         t1, a5, 6          //shift
    addi.w         t3, zero, 1        //one
    sub.w          t4, t1, t3
    sll.w          t3, t3, t4         //offset
.if \bit == 128
    vreplgr2vr.w   vr1, a6            //wx
    vreplgr2vr.w   vr2, t3            //offset
    vreplgr2vr.w   vr3, t1            //shift
    vreplgr2vr.w   vr4, a7            //ox
.else
    xvreplgr2vr.w  xr1, a6
    xvreplgr2vr.w  xr2, t3
    xvreplgr2vr.w  xr3, t1
    xvreplgr2vr.w  xr4, a7
.endif
.endm

.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
    vldrepl.d      vr0, \src0, 0
    vsllwil.hu.bu  vr0, vr0, 0
    vexth.wu.hu    vr5, vr0
    vsllwil.wu.hu  vr0, vr0, 0
    vslli.w        vr0, vr0, 6
    vslli.w        vr5, vr5, 6
    vmul.w         vr0, vr0, vr1
    vmul.w         vr5, vr5, vr1
    vadd.w         vr0, vr0, vr2
    vadd.w         vr5, vr5, vr2
    vsra.w         vr0, vr0, vr3
    vsra.w         vr5, vr5, vr3
    vadd.w         vr0, vr0, vr4
    vadd.w         vr5, vr5, vr4
    vssrani.h.w    vr5, vr0, 0
    vssrani.bu.h   vr5, vr5, 0
.if \w == 6
    fst.s          f5, \dst0, 0
    vstelm.h       vr5, \dst0, 4, 2
.else
    fst.d          f5, \dst0, 0
.endif
.endm

.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
    vldrepl.d      vr0, \src0, 0
    add.d          t2, \src0, a3
    vldrepl.d      vr5, t2, 0
    xvpermi.q      xr0, xr5, 0x02
    xvsllwil.hu.bu xr0, xr0, 0
    xvexth.wu.hu   xr5, xr0
    xvsllwil.wu.hu xr0, xr0, 0
    xvslli.w       xr0, xr0, 6
    xvslli.w       xr5, xr5, 6
    xvmul.w        xr0, xr0, xr1
    xvmul.w        xr5, xr5, xr1
    xvadd.w        xr0, xr0, xr2
    xvadd.w        xr5, xr5, xr2
    xvsra.w        xr0, xr0, xr3
    xvsra.w        xr5, xr5, xr3
    xvadd.w        xr0, xr0, xr4
    xvadd.w        xr5, xr5, xr4
    xvssrani.h.w   xr5, xr0, 0
    xvpermi.q      xr0, xr5, 0x01
    xvssrani.bu.h  xr0, xr5, 0
    add.d          t3, \dst0, a1
.if \w == 6
    vstelm.w       vr0, \dst0, 0, 0
    vstelm.h       vr0, \dst0, 4, 2
    vstelm.w       vr0, t3, 0, 2
    vstelm.h       vr0, t3, 4, 6
.else
    vstelm.d       vr0, \dst0, 0, 0
    vstelm.d       vr0, t3, 0, 1
.endif
.endm

.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
    vld            vr0, \src0, 0
    vexth.hu.bu    vr7, vr0
    vexth.wu.hu    vr8, vr7
    vsllwil.wu.hu  vr7, vr7, 0
    vsllwil.hu.bu  vr5, vr0, 0
    vexth.wu.hu    vr6, vr5
    vsllwil.wu.hu  vr5, vr5, 0
    vslli.w        vr5, vr5, 6
    vslli.w        vr6, vr6, 6
    vslli.w        vr7, vr7, 6
    vslli.w        vr8, vr8, 6
    vmul.w         vr5, vr5, vr1
    vmul.w         vr6, vr6, vr1
    vmul.w         vr7, vr7, vr1
    vmul.w         vr8, vr8, vr1
    vadd.w         vr5, vr5, vr2
    vadd.w         vr6, vr6, vr2
    vadd.w         vr7, vr7, vr2
    vadd.w         vr8, vr8, vr2
    vsra.w         vr5, vr5, vr3
    vsra.w         vr6, vr6, vr3
    vsra.w         vr7, vr7, vr3
    vsra.w         vr8, vr8, vr3
    vadd.w         vr5, vr5, vr4
    vadd.w         vr6, vr6, vr4
    vadd.w         vr7, vr7, vr4
    vadd.w         vr8, vr8, vr4
    vssrani.h.w    vr6, vr5, 0
    vssrani.h.w    vr8, vr7, 0
    vssrani.bu.h   vr8, vr6, 0
    vst            vr8, \dst0, 0
.endm

.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
    vld            vr0, \src0, 0
    xvpermi.d      xr0, xr0, 0xd8
    xvsllwil.hu.bu xr0, xr0, 0
    xvexth.wu.hu   xr6, xr0
    xvsllwil.wu.hu xr5, xr0, 0
    xvslli.w       xr5, xr5, 6
    xvslli.w       xr6, xr6, 6
    xvmul.w        xr5, xr5, xr1
    xvmul.w        xr6, xr6, xr1
    xvadd.w        xr5, xr5, xr2
    xvadd.w        xr6, xr6, xr2
    xvsra.w        xr5, xr5, xr3
    xvsra.w        xr6, xr6, xr3
    xvadd.w        xr5, xr5, xr4
    xvadd.w        xr6, xr6, xr4
    xvssrani.h.w   xr6, xr5, 0
    xvpermi.q      xr7, xr6, 0x01
    xvssrani.bu.h  xr7, xr6, 0
    vst            vr7, \dst0, 0
.endm

.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
.if \w == 16
    vld            vr0, \src0, 0
    add.d          t2, \src0, a3
    vld            vr5, t2, 0
    xvpermi.q      xr0, xr5, 0x02
.else //w=24/32
    xvld           xr0, \src0, 0
.endif
    xvexth.hu.bu   xr7, xr0
    xvexth.wu.hu   xr8, xr7
    xvsllwil.wu.hu xr7, xr7, 0
    xvsllwil.hu.bu xr5, xr0, 0
    xvexth.wu.hu   xr6, xr5
    xvsllwil.wu.hu xr5, xr5, 0
    xvslli.w       xr5, xr5, 6
    xvslli.w       xr6, xr6, 6
    xvslli.w       xr7, xr7, 6
    xvslli.w       xr8, xr8, 6
    xvmul.w        xr5, xr5, xr1
    xvmul.w        xr6, xr6, xr1
    xvmul.w        xr7, xr7, xr1
    xvmul.w        xr8, xr8, xr1
    xvadd.w        xr5, xr5, xr2
    xvadd.w        xr6, xr6, xr2
    xvadd.w        xr7, xr7, xr2
    xvadd.w        xr8, xr8, xr2
    xvsra.w        xr5, xr5, xr3
    xvsra.w        xr6, xr6, xr3
    xvsra.w        xr7, xr7, xr3
    xvsra.w        xr8, xr8, xr3
    xvadd.w        xr5, xr5, xr4
    xvadd.w        xr6, xr6, xr4
    xvadd.w        xr7, xr7, xr4
    xvadd.w        xr8, xr8, xr4
    xvssrani.h.w   xr6, xr5, 0
    xvssrani.h.w   xr8, xr7, 0
    xvssrani.bu.h  xr8, xr6, 0
.if \w == 16
    vst            vr8, \dst0, 0
    add.d          t2, \dst0, a1
    xvpermi.q      xr8, xr8, 0x01
    vst            vr8, t2, 0
.elseif \w == 24
    vst            vr8, \dst0, 0
    xvstelm.d      xr8, \dst0, 16, 2
.else
    xvst           xr8, \dst0, 0
.endif
.endm

function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
    LOAD_VAR 128
    srli.w         t0, a4, 1
.LOOP_PIXELS4:
    vldrepl.w      vr0, a2, 0
    add.d          t1, a2, a3
    vldrepl.w      vr5, t1, 0
    vsllwil.hu.bu  vr0, vr0, 0
    vsllwil.wu.hu  vr0, vr0, 0
    vsllwil.hu.bu  vr5, vr5, 0
    vsllwil.wu.hu  vr5, vr5, 0
    vslli.w        vr0, vr0, 6
    vslli.w        vr5, vr5, 6
    vmul.w         vr0, vr0, vr1
    vmul.w         vr5, vr5, vr1
    vadd.w         vr0, vr0, vr2
    vadd.w         vr5, vr5, vr2
    vsra.w         vr0, vr0, vr3
    vsra.w         vr5, vr5, vr3
    vadd.w         vr0, vr0, vr4
    vadd.w         vr5, vr5, vr4
    vssrani.h.w    vr5, vr0, 0
    vssrani.bu.h   vr5, vr5, 0
    fst.s          f5, a0, 0
    add.d          t2, a0, a1
    vstelm.w       vr5, t2, 0, 1
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         t0, t0, -1
    bnez           t0, .LOOP_PIXELS4
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS6:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS6
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
    LOAD_VAR 256
    srli.w         t0, a4, 1
.LOOP_PIXELS6_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         t0, t0, -1
    bnez           t0, .LOOP_PIXELS6_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS8:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS8
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
    LOAD_VAR 256
    srli.w         t0, a4, 1
.LOOP_PIXELS8_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         t0, t0, -1
    bnez           t0, .LOOP_PIXELS8_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS12:
    vld            vr0, a2, 0
    vexth.hu.bu    vr7, vr0
    vsllwil.wu.hu  vr7, vr7, 0
    vsllwil.hu.bu  vr5, vr0, 0
    vexth.wu.hu    vr6, vr5
    vsllwil.wu.hu  vr5, vr5, 0
    vslli.w        vr5, vr5, 6
    vslli.w        vr6, vr6, 6
    vslli.w        vr7, vr7, 6
    vmul.w         vr5, vr5, vr1
    vmul.w         vr6, vr6, vr1
    vmul.w         vr7, vr7, vr1
    vadd.w         vr5, vr5, vr2
    vadd.w         vr6, vr6, vr2
    vadd.w         vr7, vr7, vr2
    vsra.w         vr5, vr5, vr3
    vsra.w         vr6, vr6, vr3
    vsra.w         vr7, vr7, vr3
    vadd.w         vr5, vr5, vr4
    vadd.w         vr6, vr6, vr4
    vadd.w         vr7, vr7, vr4
    vssrani.h.w    vr6, vr5, 0
    vssrani.h.w    vr7, vr7, 0
    vssrani.bu.h   vr7, vr6, 0
    fst.d          f7, a0, 0
    vstelm.w       vr7, a0, 8, 2
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS12
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS12_LASX:
    vld            vr0, a2, 0
    xvpermi.d      xr0, xr0, 0xd8
    xvsllwil.hu.bu xr0, xr0, 0
    xvexth.wu.hu   xr6, xr0
    xvsllwil.wu.hu xr5, xr0, 0
    xvslli.w       xr5, xr5, 6
    xvslli.w       xr6, xr6, 6
    xvmul.w        xr5, xr5, xr1
    xvmul.w        xr6, xr6, xr1
    xvadd.w        xr5, xr5, xr2
    xvadd.w        xr6, xr6, xr2
    xvsra.w        xr5, xr5, xr3
    xvsra.w        xr6, xr6, xr3
    xvadd.w        xr5, xr5, xr4
    xvadd.w        xr6, xr6, xr4
    xvssrani.h.w   xr6, xr5, 0
    xvpermi.q      xr7, xr6, 0x01
    xvssrani.bu.h  xr7, xr6, 0
    fst.d          f7, a0, 0
    vstelm.w       vr7, a0, 8, 2
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS12_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS16:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS16
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
    LOAD_VAR 256
    srli.w         t0, a4, 1
.LOOP_PIXELS16_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
    alsl.d         a2, a3, a2, 1
    alsl.d         a0, a1, a0, 1
    addi.w         t0, t0, -1
    bnez           t0, .LOOP_PIXELS16_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS24:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d         t0, a2, 16
    addi.d         t1, a0, 16
    HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS24
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS24_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS24_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS32:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d         t0, a2, 16
    addi.d         t1, a0, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS32
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS32_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS32_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS48:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d         t0, a2, 16
    addi.d         t1, a0, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d         t0, a2, 32
    addi.d         t1, a0, 32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS48
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS48_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    addi.d         t0, a2, 32
    addi.d         t1, a0, 32
    HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS48_LASX
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS64:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d         t0, a2, 16
    addi.d         t1, a0, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d         t0, a2, 32
    addi.d         t1, a0, 32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d         t0, a2, 48
    addi.d         t1, a0, 48
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS64
endfunc

function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS64_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    addi.d         t0, a2, 32
    addi.d         t1, a0, 32
    HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32
    add.d          a2, a2, a3
    add.d          a0, a0, a1
    addi.w         a4, a4, -1
    bnez           a4, .LOOP_PIXELS64_LASX
endfunc

@@ -22,6 +22,7 @@
 
 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
 
 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
             c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
             c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
             c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
             c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
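
A brief orientation on these tables, inferred from the assignments above rather than from separate documentation: the first index is the block-width bucket and the last two indices select the vertical/horizontal filter case.

/* Width bucket -> first index, as used by the assignments above (inferred):
 *   idx:    1   2   3   4    5    6    7    8    9
 *   width:  4   6   8   12   16   24   32   48   64
 * [idx][0][0] is the unfiltered copy case (mx == 0 && my == 0), which is why
 * the same pel_uni_w_pixels routines serve both qpel_uni_w and epel_uni_w;
 * [idx][1][1] is the hv-filtered case handled by the *_qpel_hv functions. */
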
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
53  libavcodec/loongarch/hevcdsp_lasx.h  (new file)
@@ -0,0 +1,53 @@
/*
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by jinbo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H

#include "libavcodec/hevcdsp.h"

#define PEL_UNI_W(PEL, DIR, WIDTH)                                        \
void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,   \
                                                          ptrdiff_t       \
                                                          dst_stride,     \
                                                          const uint8_t *src, \
                                                          ptrdiff_t       \
                                                          src_stride,     \
                                                          int height,     \
                                                          int denom,      \
                                                          int wx,         \
                                                          int ox,         \
                                                          intptr_t mx,    \
                                                          intptr_t my,    \
                                                          int width)

PEL_UNI_W(pel, pixels, 6);
PEL_UNI_W(pel, pixels, 8);
PEL_UNI_W(pel, pixels, 12);
PEL_UNI_W(pel, pixels, 16);
PEL_UNI_W(pel, pixels, 24);
PEL_UNI_W(pel, pixels, 32);
PEL_UNI_W(pel, pixels, 48);
PEL_UNI_W(pel, pixels, 64);

#undef PEL_UNI_W

#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H

@@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                        \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,    \
+                                                         ptrdiff_t        \
+                                                         dst_stride,      \
+                                                         const uint8_t *src, \
+                                                         ptrdiff_t        \
+                                                         src_stride,      \
+                                                         int height,      \
+                                                         int denom,       \
+                                                         int wx,          \
+                                                         int ox,          \
+                                                         intptr_t mx,     \
+                                                         intptr_t my,     \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
 #endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
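
For illustration, PEL_UNI_W(pel, pixels, 4) in the hunk above expands to the following prototype (the expansion written out on fewer lines for readability):

void ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                              const uint8_t *src, ptrdiff_t src_stride,
                                              int height, int denom, int wx, int ox,
                                              intptr_t mx, intptr_t my, int width);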