ffmpeg/libswscale/loongarch/output.S
Lu Wang 4501b1dfd7
swscale/la: Optimize the functions of the swscale series with lsx.
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480
-pix_fmt bgra -y /dev/null -an
before: 91fps
after:  160fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2023-05-25 21:05:08 +02:00

139 lines
4.8 KiB
ArmAsm

/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/loongarch/loongson_asm.S"
/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
function ff_yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
addi.w t4, a6, 4
addi.w t5, a6, 5
addi.w t6, a6, 6
addi.w t7, a6, 7
andi t0, a6, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
ldx.bu t0, a5, t0
ldx.bu t1, a5, t1
ldx.bu t2, a5, t2
ldx.bu t3, a5, t3
ldx.bu t4, a5, t4
ldx.bu t5, a5, t5
ldx.bu t6, a5, t6
ldx.bu t7, a5, t7
vreplgr2vr.w vr0, t0
vreplgr2vr.w vr1, t1
vreplgr2vr.w vr2, t2
vreplgr2vr.w vr3, t3
vreplgr2vr.w vr4, t4
vreplgr2vr.w vr5, t5
vreplgr2vr.w vr6, t6
vreplgr2vr.w vr7, t7
vilvl.w vr0, vr2, vr0
vilvl.w vr4, vr6, vr4
vilvl.w vr1, vr3, vr1
vilvl.w vr5, vr7, vr5
vilvl.d vr12, vr4, vr0
vilvl.d vr13, vr5, vr1
li.w t5, 0
li.w t8, 8
bge a4, t8, .WIDTH8
blt zero, a4, .WIDTH
b .END
.WIDTH8:
li.d t1, 0
li.d t4, 0
vslli.w vr2, vr12, 12
vslli.w vr3, vr13, 12
move t3, a0
.FILTERSIZE8:
ldx.d t2, a2, t1
vldx vr4, t2, t5
vldrepl.h vr5, t3, 0
vmaddwev.w.h vr2, vr4, vr5
vmaddwod.w.h vr3, vr4, vr5
addi.d t1, t1, 8
addi.d t3, t3, 2
addi.d t4, t4, 1
blt t4, a1, .FILTERSIZE8
vsrai.w vr2, vr2, 19
vsrai.w vr3, vr3, 19
vclip255.w vr2, vr2
vclip255.w vr3, vr3
vpickev.h vr2, vr3, vr2
vpickev.b vr2, vr2, vr2
vbsrl.v vr3, vr2, 4
vilvl.b vr2, vr3, vr2
fst.d f2, a3, 0
addi.d t5, t5, 16
addi.d a4, a4, -8
addi.d a3, a3, 8
bge a4, t8, .WIDTH8
blt zero, a4, .WIDTH
b .END
.WIDTH:
li.d t1, 0
li.d t4, 0
vslli.w vr2, vr12, 12
vslli.w vr3, vr13, 12
.FILTERSIZE:
ldx.d t2, a2, t1
vldx vr4, t2, t5
vldrepl.h vr5, a0, 0
vmaddwev.w.h vr2, vr4, vr5
vmaddwod.w.h vr3, vr4, vr5
addi.d t1, t1, 8
addi.d a0, a0, 2
addi.d t4, t4, 1
blt t4, a1, .FILTERSIZE
vsrai.w vr2, vr2, 19
vsrai.w vr3, vr3, 19
vclip255.w vr2, vr2
vclip255.w vr3, vr3
vpickev.h vr2, vr3, vr2
vpickev.b vr2, vr2, vr2
vbsrl.v vr3, vr2, 4
vilvl.b vr2, vr3, vr2
.DEST:
vstelm.b vr2, a3, 0, 0
vbsrl.v vr2, vr2, 1
addi.d a4, a4, -1
addi.d a3, a3, 1
blt zero, a4, .DEST
.END:
endfunc