ffmpeg/libavcodec/loongarch/h264intrapred.S
Lu Wang 8815a7719e
avcodec/la: Add LSX optimization for h264 chroma and intrapred.
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after:  214fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2023-05-25 21:04:56 +02:00

300 lines
8.4 KiB
ArmAsm

/*
* Loongson LSX optimized h264intrapred
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "loongson_asm.S"
/*
 * Byte selectors for vshuf.b that reverse bytes 0..6 of a vector.
 * PRED16X16_PLANE fetches this table with an 8-byte fld.d, so the eighth
 * selector is written out explicitly (as 0) to keep that load inside the
 * table instead of depending on whatever follows it in the section.
 */
const shufa
.byte 6, 5, 4, 3, 2, 1, 0, 0
endconst
/*
 * Little-endian 16-bit weights 2..8 for the horizontal gradient sum in
 * PRED16X16_PLANE.
 * The macro loads the table with a 16-byte vld and multiplies all eight
 * halfword lanes before adding them together. Lane 7 carries no wanted
 * term, so its weight has to be 0; it is written out here so the result
 * does not depend on the bytes that happen to follow the table.
 */
const mulk
.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 0, 0
endconst
/*
 * Column indices 0..15 as 16-bit lanes (32 bytes in total).
 * PRED16X16_PLANE_END loads the first 16 bytes (indices 0..7) with vld,
 * PRED16X16_PLANE_END_LASX loads all 32 bytes with xvld.
 */
const mulh
.short 0, 1, 2, 3, 4, 5, 6, 7
.short 8, 9, 10, 11, 12, 13, 14, 15
endconst
/*
 * PRED16X16_PLANE: accumulate the two weighted gradient sums consumed by
 * the plane-prediction entry points in this file.
 *
 * Notation: top[i]  = byte at a0 - a1 + i
 *           left[i] = byte at a0 + i * a1 - 1
 *
 * In:   a0 = src, a1 = stride (the two arguments of the callers)
 * Out:  t3 = sum for k = 1..8 of k * (top[7 + k]  - top[7 - k])
 *       t4 = sum for k = 1..8 of k * (left[7 + k] - left[7 - k])
 *       t1 = a0 + 15 * a1 - 1   (address of left[15])
 *       t2 = a0 - a1 - 1        (address of left[-1], same byte as top[-1])
 * Clobbers: t0, t5, t6, t7, t8, vr0-vr3 (f1-f3 are written through their
 *           floating-point names).
 * Memory: the 8-byte load at t0 + 2 also fetches top[16]; that byte only
 *         reaches lane 7, which has to be multiplied by a zero weight.
 */
.macro PRED16X16_PLANE
// t6 = 2 * stride, t4 = 8 * stride (t4 is only an address offset here)
slli.d t6, a1, 1
slli.d t4, a1, 3
// t0 = a0 - a1 + 7, the address of top[7]
addi.d t0, a0, 7
sub.d t0, t0, a1
// t1 = address of left[8], t2 = address of left[6]
add.d t1, a0, t4
addi.d t1, t1, -1
sub.d t2, t1, t6
// k = 1 terms: t3 = top[8] - top[6], t4 = left[8] - left[6]
ld.bu t3, t0, 1
ld.bu t4, t0, -1
ld.bu t5, t1, 0
ld.bu t7, t2, 0
sub.d t3, t3, t4
sub.d t4, t5, t7
// Horizontal terms k = 2..8 are computed in one vector pass.
// vr0 = weights (16-byte load from mulk)
la.local t5, mulk
vld vr0, t5, 0
// low 8 bytes of vr1 = top[9..16], low 8 bytes of vr2 = top[-1..6]
fld.d f1, t0, 2
fld.d f2, t0, -8
// reverse bytes 0..6 of vr2 so that byte i holds top[5 - i]
la.local t5, shufa
fld.d f3, t5, 0
vshuf.b vr2, vr2, vr2, vr3
// interleave the low halves: even bytes from vr2, odd bytes from vr1
vilvl.b vr1, vr1, vr2
// halfword i = top[9 + i] - top[5 - i] (odd byte minus even byte, both
// zero-extended), i.e. the k = i + 2 difference
vhsubw.hu.bu vr1, vr1, vr1
// apply the weights; lane 7 must meet a zero weight to drop out
vmul.h vr0, vr0, vr1
// pairwise widening adds: 8 halfwords -> 4 words -> 2 dwords -> 1 sum
vhaddw.w.h vr1, vr0, vr0
vhaddw.d.w vr0, vr1, vr1
vhaddw.q.d vr1, vr0, vr0
// low 32 bits of the sum, added to the k = 1 term
vpickve2gr.w t5, vr1, 0
add.d t3, t3, t5
// Vertical terms k = 2..8 are accumulated in t5 with shift/add multiplies.
// t2 steps one row up per term; t1 steps two rows down per pair and the
// second term of each pair is read at t1 + a1.
// k = 2: t5 = 2 * (left[9] - left[5])
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t5, t7, t8
slli.d t5, t5, 1
// k = 3: 3 * d = d + 2 * d
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 1
add.d t7, t7, t8
add.d t5, t5, t7
// k = 4: 4 * d
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t7, t7, 2
add.d t5, t5, t7
// k = 5: 5 * d = d + 4 * d
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 2
add.d t7, t7, t8
add.d t5, t5, t7
// k = 6: 6 * d = 2 * d + 4 * d
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t8, t7, 1
slli.d t7, t7, 2
add.d t7, t7, t8
add.d t5, t5, t7
// k = 7: 7 * d = 8 * d - d
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 3
sub.d t7, t8, t7
add.d t5, t5, t7
// k = 8: 8 * d
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t7, t7, 3
add.d t5, t5, t7
// fold the k = 2..8 terms into the k = 1 term
add.d t4, t4, t5
// leave t1 on the last row that was read (left[15])
add.d t1, t1, a1
.endm
/*
 * PRED16X16_PLANE_END: write a 16x16 byte block from two gradient steps
 * using 128-bit LSX vectors.
 *
 * In:   a0 = address of the first output row, a1 = distance between rows
 *       t3 = step per column, t4 = step per row
 *       t1, t2 = addresses as left by PRED16X16_PLANE; the bytes at t1
 *                and at t2 + 16 are read
 * Out:  for row y and column x in 0..15 the stored byte is
 *           (base + x * t3 + y * t4) >> 5, saturated to 0..255
 *       with base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4).
 *       The per-pixel sums are formed in 16-bit lanes.
 * Clobbers: a0 (advanced by 16 * a1), t5, t7, t8, vr3-vr9
 */
.macro PRED16X16_PLANE_END
// t5 = 16 * (byte[t1] + byte[t2 + 16] + 1)
ld.bu t7, t1, 0
ld.bu t8, t2, 16
add.d t5, t7, t8
addi.d t5, t5, 1
slli.d t5, t5, 4
// t7 = 7 * (t3 + t4), formed as 8 * s - s, then t5 = base
add.d t7, t3, t4
slli.d t8, t7, 3
sub.d t7, t8, t7
sub.d t5, t5, t7
// vr3 = column indices 0..7 (first 16 bytes of mulh)
la.local t8, mulh
vld vr3, t8, 0
// vr9 = 8 * t3 in every lane: offset from columns 0..7 to columns 8..15
slli.d t8, t3, 3
vreplgr2vr.h vr4, t3
vreplgr2vr.h vr9, t8
// vr5 = x * t3 for x = 0..7
vmul.h vr5, vr3, vr4
// one iteration per output row
.rept 16
// t7 = value at column 0 of this row; t5 moves on to the next row
move t7, t5
add.d t5, t5, t4
vreplgr2vr.h vr6, t7
// vr7 = columns 0..7, vr8 = columns 8..15
vadd.h vr7, vr6, vr5
vadd.h vr8, vr9, vr7
// arithmetic >> 5 with unsigned saturation to bytes; the narrowed vr7
// lands in the low 8 bytes and the narrowed vr8 in the high 8 bytes
vssrani.bu.h vr8, vr7, 5
vst vr8, a0, 0
add.d a0, a0, a1
.endr
.endm
/*
 * PRED16X16_PLANE_END_LASX: 256-bit LASX counterpart of
 * PRED16X16_PLANE_END; each loop iteration produces two output rows.
 *
 * In:   a0 = address of the first output row, a1 = distance between rows
 *       t3 = step per column, t4 = step per row
 *       t1, t2 = addresses as left by PRED16X16_PLANE; the bytes at t1
 *                and at t2 + 16 are read
 * Out:  for row y and column x in 0..15 the stored byte is
 *           (base + x * t3 + y * t4) >> 5, saturated to 0..255
 *       with base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4).
 *       The per-pixel sums are formed in 16-bit lanes.
 * Clobbers: a0 (advanced by 16 * a1), t5, t7, t8, xr3-xr9
 */
.macro PRED16X16_PLANE_END_LASX
// t5 = 16 * (byte[t1] + byte[t2 + 16] + 1)
ld.bu t7, t1, 0
ld.bu t8, t2, 16
add.d t5, t7, t8
addi.d t5, t5, 1
slli.d t5, t5, 4
// t7 = 7 * (t3 + t4), formed as 8 * s - s, then t5 = base
add.d t7, t3, t4
slli.d t8, t7, 3
sub.d t7, t8, t7
sub.d t5, t5, t7
// xr3 = column indices 0..15 (all 32 bytes of mulh)
la.local t8, mulh
xvld xr3, t8, 0
// xr5 = x * t3 for x = 0..15
xvreplgr2vr.h xr4, t3
xvmul.h xr5, xr3, xr4
// one iteration per pair of output rows
.rept 8
// xr6 = column-0 value of the first row, xr8 = of the second row;
// t5 ends up on the row after the pair
move t7, t5
add.d t5, t5, t4
xvreplgr2vr.h xr6, t7
xvreplgr2vr.h xr8, t5
add.d t5, t5, t4
// xr7 = first row, xr9 = second row (16 columns each)
xvadd.h xr7, xr6, xr5
xvadd.h xr9, xr8, xr5
// arithmetic >> 5 with unsigned saturation to bytes, per 128-bit half:
// the narrowed xr7 lands in the low 8 bytes of each half and the
// narrowed xr9 in the high 8 bytes. Resulting 64-bit elements:
//   0 = first row cols 0..7    1 = second row cols 0..7
//   2 = first row cols 8..15   3 = second row cols 8..15
xvssrani.bu.h xr9, xr7, 5
// first row: elements 0 and 2 (vr9 names the low 128 bits of xr9)
vstelm.d vr9, a0, 0, 0
xvstelm.d xr9, a0, 8, 2
add.d a0, a0, a1
// second row: elements 1 and 3
vstelm.d vr9, a0, 0, 1
xvstelm.d xr9, a0, 8, 3
add.d a0, a0, a1
.endr
.endm
/*
 * void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * Gathers the two gradient sums with PRED16X16_PLANE, scales each one as
 * (5 * x + 32) >> 6 and lets PRED16X16_PLANE_END write 16 rows of 16
 * bytes starting at src, rows being stride bytes apart.
 */
function ff_h264_pred16x16_plane_h264_8_lsx
    PRED16X16_PLANE

    // t3 and t4 are scaled side by side; t7 and t8 are free scratch here
    // because PRED16X16_PLANE_END reloads both before reading them.
    slli.d t7, t3, 2
    slli.d t8, t4, 2
    add.d t3, t3, t7
    add.d t4, t4, t8
    addi.d t3, t3, 32
    addi.d t4, t4, 32
    srai.d t3, t3, 6
    srai.d t4, t4, 6

    PRED16X16_PLANE_END
endfunc
/*
 * void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * Same flow as the h264 variant, but each gradient sum is scaled as
 * (x + (x >> 2)) >> 4 (arithmetic shifts) before PRED16X16_PLANE_END
 * writes the 16x16 block at src.
 */
function ff_h264_pred16x16_plane_rv40_8_lsx
    PRED16X16_PLANE

    // t7 and t8 are free scratch: PRED16X16_PLANE_END reloads both first.
    srai.d t7, t3, 2
    srai.d t8, t4, 2
    add.d t3, t3, t7
    add.d t4, t4, t8
    srai.d t3, t3, 4
    srai.d t4, t4, 4

    PRED16X16_PLANE_END
endfunc
/*
 * void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
 *
 * Scales each gradient sum from PRED16X16_PLANE as (5 * (x / 4)) / 16
 * using signed division and hands the two results to PRED16X16_PLANE_END
 * in exchanged registers: t3 receives the value derived from t4 and
 * t4 the value derived from t3.
 */
function ff_h264_pred16x16_plane_svq3_8_lsx
    PRED16X16_PLANE

    // Constants live in t0, t5 and t6; none of them is read by
    // PRED16X16_PLANE_END before it is rewritten.
    li.d t0, 4
    li.d t5, 5
    li.d t6, 16
    // t7 follows the old t3, t8 follows the old t4
    div.d t7, t3, t0
    div.d t8, t4, t0
    mul.d t7, t7, t5
    mul.d t8, t8, t5
    // the final divisions write straight into the exchanged registers
    div.d t4, t7, t6
    div.d t3, t8, t6

    PRED16X16_PLANE_END
endfunc
/*
 * void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * Gathers the two gradient sums with PRED16X16_PLANE, scales each one as
 * (5 * x + 32) >> 6 and lets PRED16X16_PLANE_END_LASX write 16 rows of
 * 16 bytes starting at src, rows being stride bytes apart.
 */
function ff_h264_pred16x16_plane_h264_8_lasx
    PRED16X16_PLANE

    // t3 and t4 are scaled side by side; t7 and t8 are free scratch here
    // because PRED16X16_PLANE_END_LASX reloads both before reading them.
    slli.d t7, t3, 2
    slli.d t8, t4, 2
    add.d t3, t3, t7
    add.d t4, t4, t8
    addi.d t3, t3, 32
    addi.d t4, t4, 32
    srai.d t3, t3, 6
    srai.d t4, t4, 6

    PRED16X16_PLANE_END_LASX
endfunc
/*
 * void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * Same flow as the h264 LASX variant, but each gradient sum is scaled as
 * (x + (x >> 2)) >> 4 (arithmetic shifts) before
 * PRED16X16_PLANE_END_LASX writes the 16x16 block at src.
 */
function ff_h264_pred16x16_plane_rv40_8_lasx
    PRED16X16_PLANE

    // t7 and t8 are free scratch: PRED16X16_PLANE_END_LASX reloads both
    // first.
    srai.d t7, t3, 2
    srai.d t8, t4, 2
    add.d t3, t3, t7
    add.d t4, t4, t8
    srai.d t3, t3, 4
    srai.d t4, t4, 4

    PRED16X16_PLANE_END_LASX
endfunc
/*
 * void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
 *
 * Scales each gradient sum from PRED16X16_PLANE as (5 * (x / 4)) / 16
 * using signed division and hands the two results to
 * PRED16X16_PLANE_END_LASX in exchanged registers: t3 receives the value
 * derived from t4 and t4 the value derived from t3.
 */
function ff_h264_pred16x16_plane_svq3_8_lasx
    PRED16X16_PLANE

    // Constants live in t0, t5 and t6; none of them is read by
    // PRED16X16_PLANE_END_LASX before it is rewritten.
    li.d t0, 4
    li.d t5, 5
    li.d t6, 16
    // t7 follows the old t3, t8 follows the old t4
    div.d t7, t3, t0
    div.d t8, t4, t0
    mul.d t7, t7, t5
    mul.d t8, t8, t5
    // the final divisions write straight into the exchanged registers
    div.d t4, t7, t6
    div.d t3, t8, t6

    PRED16X16_PLANE_END_LASX
endfunc