ffmpeg/libswscale/loongarch/swscale.S
Lu Wang 4501b1dfd7
swscale/la: Optimize the functions of the swscale series with lsx.
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480
-pix_fmt bgra -y /dev/null -an
before: 91fps
after:  160fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2023-05-25 21:05:08 +02:00

1869 lines
70 KiB
ArmAsm

/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/loongarch/loongson_asm.S"
/*
 * void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * For every output i in [0, dstW):
 *     sum    = SUM(j < filterSize) src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> 7, 32767)              (stored as a 16-bit value)
 *
 * Arguments: a0 = c (not referenced), a1 = dst, a2 = dstW, a3 = src,
 *            a4 = filter, a5 = filterPos, a6 = filterSize.
 * Constants: t0 = 32767, t7 = 4, t8 = 8, vr0 = 0, vr20 = 32767 in every word.
 *
 * Code paths, selected by filterSize:
 *     == 4 : .LOOP_DSTW4, 8 outputs per iteration
 *     == 8 : .LOOP_DSTW8, 8 outputs per iteration
 *     >  8 : .LOOP_START, 4 outputs per iteration, 8 taps per vector step
 *     else : .END_DSTW4,  one output at a time
 * A vector iteration only runs while a full group of outputs remains; the
 * leftover outputs go through a scalar loop, and nothing is read or written
 * when dstW <= 0.
 *
 * vdp2.w.h and function/endfunc are macros from the included header;
 * endfunc is presumably what emits the return - confirm in loongson_asm.S.
 */
function ff_hscale_8_to_15_lsx
    /* 9 saved registers need 72 bytes; the frame is 80 so sp stays 16-byte aligned */
    addi.d sp, sp, -80
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    st.d s8, sp, 64
    li.w t0, 32767
    li.w t8, 8
    li.w t7, 4
    vldi vr0, 0
    vreplgr2vr.w vr20, t0
    /* dispatch on filterSize; every path is entered through its "outputs left" test */
    beq a6, t7, .LOOP_DSTW4_CHK
    beq a6, t8, .LOOP_DSTW8_CHK
    blt t8, a6, .LOOP_START_CHK
    blt zero, a2, .END_DSTW4
    b .END_LOOP

/* ---- filterSize > 8: 4 outputs per pass ---- */
.LOOP_START:
    li.w t1, 0
    /* s1 = taps already consumed by the vector loop, s2..s5 = scalar tail sums */
    li.w s1, 0
    li.w s2, 0
    li.w s3, 0
    li.w s4, 0
    li.w s5, 0
    /* vr22 = vector partial sums of the 4 outputs */
    vldi vr22, 0
    addi.w s0, a6, -7
    /* byte offsets of filter rows 1, 2 and 3 relative to row 0 */
    slli.w s7, a6, 1
    slli.w s8, a6, 2
    add.w t6, s7, s8
.LOOP_DSTW:
    ld.w t2, a5, 0
    ld.w t3, a5, 4
    ld.w t4, a5, 8
    ld.w t5, a5, 12
    /* 8 source bytes for each of the 4 outputs */
    fldx.d f1, a3, t2
    fldx.d f2, a3, t3
    fldx.d f3, a3, t4
    fldx.d f4, a3, t5
    /* 8 coefficients from each of the 4 filter rows */
    vld vr9, a4, 0
    vldx vr10, a4, s7
    vldx vr11, a4, s8
    vldx vr12, a4, t6
    /* interleave with zero: widen the 8 bytes to 8 halfwords */
    vilvl.b vr1, vr0, vr1
    vilvl.b vr2, vr0, vr2
    vilvl.b vr3, vr0, vr3
    vilvl.b vr4, vr0, vr4
    vdp2.w.h vr17, vr1, vr9
    vdp2.w.h vr18, vr2, vr10
    vdp2.w.h vr19, vr3, vr11
    vdp2.w.h vr21, vr4, vr12
    /* horizontal add down to one sum per output */
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    /* gather the 4 sums into the 4 words of vr1 and accumulate */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.d vr1, vr3, vr1
    vadd.w vr22, vr22, vr1
    addi.w s1, s1, 8
    addi.d a3, a3, 8
    addi.d a4, a4, 16
    /* another vector step while s1 < filterSize - 7 */
    blt s1, s0, .LOOP_DSTW
    blt s1, a6, .DSTWA
    b .END_FILTER
    /*
     * Scalar tail for the taps the vector loop did not cover. a3 and a4 were
     * already advanced past the consumed taps, so t3 restarts at 0 while s6
     * counts from s1 up to filterSize. t1 selects the filter row (output).
     */
.DSTWA:
    ld.w t2, a5, 0
    li.w t3, 0
    move s6, s1
.FILTERSIZEA:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s2, s2, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERSIZEA
    ld.w t2, a5, 4
    li.w t3, 0
    move s6, s1
    addi.w t1, t1, 1
.FILTERSIZEB:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s3, s3, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERSIZEB
    ld.w t2, a5, 8
    addi.w t1, t1, 1
    li.w t3, 0
    move s6, s1
.FILTERSIZEC:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s4, s4, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERSIZEC
    ld.w t2, a5, 12
    addi.w t1, t1, 1
    move s6, s1
    li.w t3, 0
.FILTERSIZED:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s5, s5, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERSIZED
.END_FILTER:
    /* total = vector partial sum + scalar tail sum */
    vpickve2gr.w t1, vr22, 0
    vpickve2gr.w t2, vr22, 1
    vpickve2gr.w t3, vr22, 2
    vpickve2gr.w t4, vr22, 3
    add.w s2, s2, t1
    add.w s3, s3, t2
    add.w s4, s4, t3
    add.w s5, s5, t4
    srai.w s2, s2, 7
    srai.w s3, s3, 7
    srai.w s4, s4, 7
    srai.w s5, s5, 7
    /* branch-free min(value, t0): keep the value if it is below t0, else take t0 */
    slt t1, s2, t0
    slt t2, s3, t0
    slt t3, s4, t0
    slt t4, s5, t0
    maskeqz s2, s2, t1
    maskeqz s3, s3, t2
    maskeqz s4, s4, t3
    maskeqz s5, s5, t4
    masknez t1, t0, t1
    masknez t2, t0, t2
    masknez t3, t0, t3
    masknez t4, t0, t4
    or s2, s2, t1
    or s3, s3, t2
    or s4, s4, t3
    or s5, s5, t4
    st.h s2, a1, 0
    st.h s3, a1, 2
    st.h s4, a1, 4
    st.h s5, a1, 6
    addi.d a1, a1, 8
    /* undo the src advance made by the vector loop */
    sub.d a3, a3, s1
    addi.d a5, a5, 16
    /* filter: forward by 4 rows (8 * filterSize bytes), back by the 2 * s1 bytes consumed */
    slli.d t3, a6, 3
    add.d a4, a4, t3
    sub.d a4, a4, s1
    sub.d a4, a4, s1
    addi.d a2, a2, -4
.LOOP_START_CHK:
    /* run a 4-output pass only while at least 4 outputs remain */
    bge a2, t7, .LOOP_START
    blt zero, a2, .RES
    b .END_LOOP
    /* leftover outputs: t1 = output index, t3 = tap index, t8 = sum */
.RES:
    li.w t1, 0
.DSTW:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTERSIZE:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTERSIZE
    srai.w t8, t8, 7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DSTW
    b .END_LOOP

/* ---- filterSize == 8: 8 outputs per pass ---- */
.LOOP_DSTW8:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    fldx.d f1, a3, t1
    fldx.d f2, a3, t2
    fldx.d f3, a3, t3
    fldx.d f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    fldx.d f5, a3, t1
    fldx.d f6, a3, t2
    fldx.d f7, a3, t3
    fldx.d f8, a3, t4
    /* 8 filter rows of 8 coefficients (16 bytes each) */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    vld vr13, a4, 64
    vld vr14, a4, 80
    vld vr15, a4, 96
    vld vr16, a4, 112
    /* widen the source bytes to halfwords */
    vilvl.b vr1, vr0, vr1
    vilvl.b vr2, vr0, vr2
    vilvl.b vr3, vr0, vr3
    vilvl.b vr4, vr0, vr4
    vilvl.b vr5, vr0, vr5
    vilvl.b vr6, vr0, vr6
    vilvl.b vr7, vr0, vr7
    vilvl.b vr8, vr0, vr8
    vdp2.w.h vr17, vr1, vr9
    vdp2.w.h vr18, vr2, vr10
    vdp2.w.h vr19, vr3, vr11
    vdp2.w.h vr21, vr4, vr12
    vdp2.w.h vr1, vr5, vr13
    vdp2.w.h vr2, vr6, vr14
    vdp2.w.h vr3, vr7, vr15
    vdp2.w.h vr4, vr8, vr16
    /* outputs 4..7 end up in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w vr5, vr1, vr1
    vhaddw.d.w vr6, vr2, vr2
    vhaddw.d.w vr7, vr3, vr3
    vhaddw.d.w vr8, vr4, vr4
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    vhaddw.q.d vr5, vr5, vr5
    vhaddw.q.d vr6, vr6, vr6
    vhaddw.q.d vr7, vr7, vr7
    vhaddw.q.d vr8, vr8, vr8
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.d vr1, vr3, vr1
    vilvl.d vr5, vr7, vr5
    vsrai.w vr1, vr1, 7
    vsrai.w vr5, vr5, 7
    vmin.w vr1, vr1, vr20
    vmin.w vr5, vr5, vr20
    /* keep the low halfword of each word: 8 x int16 */
    vpickev.h vr1, vr5, vr1
    vst vr1, a1, 0
    addi.d a1, a1, 16
    addi.d a5, a5, 32
    addi.d a4, a4, 128
    addi.d a2, a2, -8
.LOOP_DSTW8_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_DSTW8
    blt zero, a2, .RES8
    b .END_LOOP
    /* leftover outputs, one at a time */
.RES8:
    li.w t1, 0
.DSTW8:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTERSIZE8:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTERSIZE8
    srai.w t8, t8, 7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DSTW8
    b .END_LOOP

/* ---- filterSize == 4: 8 outputs per pass ---- */
.LOOP_DSTW4:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    /* 4 source bytes per output */
    fldx.s f1, a3, t1
    fldx.s f2, a3, t2
    fldx.s f3, a3, t3
    fldx.s f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    fldx.s f5, a3, t1
    fldx.s f6, a3, t2
    fldx.s f7, a3, t3
    fldx.s f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    /* pair the source of two neighbouring outputs in one register, then widen */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.b vr1, vr0, vr1
    vilvl.b vr3, vr0, vr3
    vilvl.b vr5, vr0, vr5
    vilvl.b vr7, vr0, vr7
    vdp2.w.h vr13, vr1, vr9
    vdp2.w.h vr14, vr3, vr10
    vdp2.w.h vr15, vr5, vr11
    vdp2.w.h vr16, vr7, vr12
    /* each doubleword now holds the sum of one output */
    vhaddw.d.w vr13, vr13, vr13
    vhaddw.d.w vr14, vr14, vr14
    vhaddw.d.w vr15, vr15, vr15
    vhaddw.d.w vr16, vr16, vr16
    vpickev.w vr13, vr14, vr13
    vpickev.w vr15, vr16, vr15
    vsrai.w vr13, vr13, 7
    vsrai.w vr15, vr15, 7
    vmin.w vr13, vr13, vr20
    vmin.w vr15, vr15, vr20
    vpickev.h vr13, vr15, vr13
    vst vr13, a1, 0
    addi.d a1, a1, 16
    addi.d a5, a5, 32
    addi.d a4, a4, 64
    addi.d a2, a2, -8
.LOOP_DSTW4_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_DSTW4
    blt zero, a2, .RES4
    b .END_LOOP
    /* leftover outputs, one at a time */
.RES4:
    li.w t1, 0
.DSTW4:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTERSIZE4:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTERSIZE4
    srai.w t8, t8, 7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DSTW4
    b .END_LOOP

/* ---- any other filterSize: plain scalar loop over all outputs ---- */
.END_DSTW4:
    li.w t1, 0
.LOOP_DSTW1:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTERSIZE1:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTERSIZE1
    srai.w t8, t8, 7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .LOOP_DSTW1
    b .END_LOOP
.END_LOOP:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    ld.d s8, sp, 64
    addi.d sp, sp, 80
endfunc
/*
 * void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * For every output i in [0, dstW):
 *     sum    = SUM(j < filterSize) src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> 3, 524287)
 * Although the prototype names dst as int16_t *, every result is stored as a
 * 32-bit word (st.w / stx.w / full vst of word vectors).
 *
 * Arguments: a0 = c (not referenced), a1 = dst, a2 = dstW, a3 = src,
 *            a4 = filter, a5 = filterPos, a6 = filterSize.
 * Constants: t0 = 524287, t7 = 4, t8 = 8, vr0 = 0, vr20 = 524287 in every word.
 *
 * Code paths, selected by filterSize:
 *     == 4 : .LOOP_DST4, 8 outputs per iteration
 *     == 8 : .LOOP_DST8, 8 outputs per iteration
 *     >  8 : .LOOP,      4 outputs per iteration, 8 taps per vector step
 *     else : .END_DST4,  one output at a time
 * A vector iteration only runs while a full group of outputs remains; the
 * leftover outputs go through a scalar loop, and nothing is read or written
 * when dstW <= 0.
 *
 * vdp2.w.h and function/endfunc are macros from the included header;
 * endfunc is presumably what emits the return - confirm in loongson_asm.S.
 */
function ff_hscale_8_to_19_lsx
    /* 9 saved registers need 72 bytes; the frame is 80 so sp stays 16-byte aligned */
    addi.d sp, sp, -80
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    st.d s8, sp, 64
    li.w t0, 524287
    li.w t8, 8
    li.w t7, 4
    vldi vr0, 0
    vreplgr2vr.w vr20, t0
    /* dispatch on filterSize; every path is entered through its "outputs left" test */
    beq a6, t7, .LOOP_DST4_CHK
    beq a6, t8, .LOOP_DST8_CHK
    blt t8, a6, .LOOP_CHK
    blt zero, a2, .END_DST4
    b .END

/* ---- filterSize > 8: 4 outputs per pass ---- */
.LOOP:
    li.w t1, 0
    /* s1 = taps already consumed by the vector loop, s2..s5 = scalar tail sums */
    li.w s1, 0
    li.w s2, 0
    li.w s3, 0
    li.w s4, 0
    li.w s5, 0
    /* vr22 = vector partial sums of the 4 outputs */
    vldi vr22, 0
    addi.w s0, a6, -7
    /* byte offsets of filter rows 1, 2 and 3 relative to row 0 */
    slli.w s7, a6, 1
    slli.w s8, a6, 2
    add.w t6, s7, s8
.LOOP_DST:
    ld.w t2, a5, 0
    ld.w t3, a5, 4
    ld.w t4, a5, 8
    ld.w t5, a5, 12
    /* 8 source bytes for each of the 4 outputs */
    fldx.d f1, a3, t2
    fldx.d f2, a3, t3
    fldx.d f3, a3, t4
    fldx.d f4, a3, t5
    /* 8 coefficients from each of the 4 filter rows */
    vld vr9, a4, 0
    vldx vr10, a4, s7
    vldx vr11, a4, s8
    vldx vr12, a4, t6
    /* interleave with zero: widen the 8 bytes to 8 halfwords */
    vilvl.b vr1, vr0, vr1
    vilvl.b vr2, vr0, vr2
    vilvl.b vr3, vr0, vr3
    vilvl.b vr4, vr0, vr4
    vdp2.w.h vr17, vr1, vr9
    vdp2.w.h vr18, vr2, vr10
    vdp2.w.h vr19, vr3, vr11
    vdp2.w.h vr21, vr4, vr12
    /* horizontal add down to one sum per output */
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    /* gather the 4 sums into the 4 words of vr1 and accumulate */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.d vr1, vr3, vr1
    vadd.w vr22, vr22, vr1
    addi.w s1, s1, 8
    addi.d a3, a3, 8
    addi.d a4, a4, 16
    /* another vector step while s1 < filterSize - 7 */
    blt s1, s0, .LOOP_DST
    blt s1, a6, .DSTA
    b .END_FILTERA
    /*
     * Scalar tail for the taps the vector loop did not cover. a3 and a4 were
     * already advanced past the consumed taps, so t3 restarts at 0 while s6
     * counts from s1 up to filterSize. t1 selects the filter row (output).
     */
.DSTA:
    ld.w t2, a5, 0
    li.w t3, 0
    move s6, s1
.FILTERA:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s2, s2, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERA
    ld.w t2, a5, 4
    li.w t3, 0
    move s6, s1
    addi.w t1, t1, 1
.FILTERB:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s3, s3, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERB
    ld.w t2, a5, 8
    addi.w t1, t1, 1
    li.w t3, 0
    move s6, s1
.FILTERC:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s4, s4, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERC
    ld.w t2, a5, 12
    addi.w t1, t1, 1
    move s6, s1
    li.w t3, 0
.FILTERD:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s5, s5, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .FILTERD
.END_FILTERA:
    /* total = vector partial sum + scalar tail sum */
    vpickve2gr.w t1, vr22, 0
    vpickve2gr.w t2, vr22, 1
    vpickve2gr.w t3, vr22, 2
    vpickve2gr.w t4, vr22, 3
    add.w s2, s2, t1
    add.w s3, s3, t2
    add.w s4, s4, t3
    add.w s5, s5, t4
    srai.w s2, s2, 3
    srai.w s3, s3, 3
    srai.w s4, s4, 3
    srai.w s5, s5, 3
    /* branch-free min(value, t0): keep the value if it is below t0, else take t0 */
    slt t1, s2, t0
    slt t2, s3, t0
    slt t3, s4, t0
    slt t4, s5, t0
    maskeqz s2, s2, t1
    maskeqz s3, s3, t2
    maskeqz s4, s4, t3
    maskeqz s5, s5, t4
    masknez t1, t0, t1
    masknez t2, t0, t2
    masknez t3, t0, t3
    masknez t4, t0, t4
    or s2, s2, t1
    or s3, s3, t2
    or s4, s4, t3
    or s5, s5, t4
    st.w s2, a1, 0
    st.w s3, a1, 4
    st.w s4, a1, 8
    st.w s5, a1, 12
    addi.d a1, a1, 16
    /* undo the src advance made by the vector loop */
    sub.d a3, a3, s1
    addi.d a5, a5, 16
    /* filter: forward by 4 rows (8 * filterSize bytes), back by the 2 * s1 bytes consumed */
    slli.d t3, a6, 3
    add.d a4, a4, t3
    sub.d a4, a4, s1
    sub.d a4, a4, s1
    addi.d a2, a2, -4
.LOOP_CHK:
    /* run a 4-output pass only while at least 4 outputs remain */
    bge a2, t7, .LOOP
    blt zero, a2, .RESA
    b .END
    /* leftover outputs: t1 = output index, t3 = tap index, t8 = sum */
.RESA:
    li.w t1, 0
.DST:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTER:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTER
    srai.w t8, t8, 3
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DST
    b .END

/* ---- filterSize == 8: 8 outputs per pass ---- */
.LOOP_DST8:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    fldx.d f1, a3, t1
    fldx.d f2, a3, t2
    fldx.d f3, a3, t3
    fldx.d f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    fldx.d f5, a3, t1
    fldx.d f6, a3, t2
    fldx.d f7, a3, t3
    fldx.d f8, a3, t4
    /* 8 filter rows of 8 coefficients (16 bytes each) */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    vld vr13, a4, 64
    vld vr14, a4, 80
    vld vr15, a4, 96
    vld vr16, a4, 112
    /* widen the source bytes to halfwords */
    vilvl.b vr1, vr0, vr1
    vilvl.b vr2, vr0, vr2
    vilvl.b vr3, vr0, vr3
    vilvl.b vr4, vr0, vr4
    vilvl.b vr5, vr0, vr5
    vilvl.b vr6, vr0, vr6
    vilvl.b vr7, vr0, vr7
    vilvl.b vr8, vr0, vr8
    vdp2.w.h vr17, vr1, vr9
    vdp2.w.h vr18, vr2, vr10
    vdp2.w.h vr19, vr3, vr11
    vdp2.w.h vr21, vr4, vr12
    vdp2.w.h vr1, vr5, vr13
    vdp2.w.h vr2, vr6, vr14
    vdp2.w.h vr3, vr7, vr15
    vdp2.w.h vr4, vr8, vr16
    /* outputs 4..7 end up in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w vr5, vr1, vr1
    vhaddw.d.w vr6, vr2, vr2
    vhaddw.d.w vr7, vr3, vr3
    vhaddw.d.w vr8, vr4, vr4
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    vhaddw.q.d vr5, vr5, vr5
    vhaddw.q.d vr6, vr6, vr6
    vhaddw.q.d vr7, vr7, vr7
    vhaddw.q.d vr8, vr8, vr8
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.d vr1, vr3, vr1
    vilvl.d vr5, vr7, vr5
    vsrai.w vr1, vr1, 3
    vsrai.w vr5, vr5, 3
    vmin.w vr1, vr1, vr20
    vmin.w vr5, vr5, vr20
    /* 8 x 32-bit results */
    vst vr1, a1, 0
    vst vr5, a1, 16
    addi.d a1, a1, 32
    addi.d a5, a5, 32
    addi.d a4, a4, 128
    addi.d a2, a2, -8
.LOOP_DST8_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_DST8
    blt zero, a2, .REST8
    b .END
    /* leftover outputs, one at a time */
.REST8:
    li.w t1, 0
.DST8:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTER8:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTER8
    srai.w t8, t8, 3
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DST8
    b .END

/* ---- filterSize == 4: 8 outputs per pass ---- */
.LOOP_DST4:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    /* 4 source bytes per output */
    fldx.s f1, a3, t1
    fldx.s f2, a3, t2
    fldx.s f3, a3, t3
    fldx.s f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    fldx.s f5, a3, t1
    fldx.s f6, a3, t2
    fldx.s f7, a3, t3
    fldx.s f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    /* pair the source of two neighbouring outputs in one register, then widen */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.b vr1, vr0, vr1
    vilvl.b vr3, vr0, vr3
    vilvl.b vr5, vr0, vr5
    vilvl.b vr7, vr0, vr7
    vdp2.w.h vr13, vr1, vr9
    vdp2.w.h vr14, vr3, vr10
    vdp2.w.h vr15, vr5, vr11
    vdp2.w.h vr16, vr7, vr12
    /* each doubleword now holds the sum of one output */
    vhaddw.d.w vr13, vr13, vr13
    vhaddw.d.w vr14, vr14, vr14
    vhaddw.d.w vr15, vr15, vr15
    vhaddw.d.w vr16, vr16, vr16
    vpickev.w vr13, vr14, vr13
    vpickev.w vr15, vr16, vr15
    vsrai.w vr13, vr13, 3
    vsrai.w vr15, vr15, 3
    vmin.w vr13, vr13, vr20
    vmin.w vr15, vr15, vr20
    vst vr13, a1, 0
    vst vr15, a1, 16
    addi.d a1, a1, 32
    addi.d a5, a5, 32
    addi.d a4, a4, 64
    addi.d a2, a2, -8
.LOOP_DST4_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_DST4
    blt zero, a2, .REST4
    b .END
    /* leftover outputs, one at a time */
.REST4:
    li.w t1, 0
.DST4:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTER4:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTER4
    srai.w t8, t8, 3
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .DST4
    b .END

/* ---- any other filterSize: plain scalar loop over all outputs ---- */
.END_DST4:
    li.w t1, 0
.LOOP_DST1:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.FILTER1:
    add.w t4, t2, t3
    ldx.bu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .FILTER1
    srai.w t8, t8, 3
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .LOOP_DST1
    b .END
.END:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    ld.d s8, sp, 64
    addi.d sp, sp, 80
endfunc
/*
 * void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * src is addressed as an array of unsigned 16-bit samples (element index
 * scaled by 2, loaded with ldx.hu / unsigned-by-signed widening multiplies).
 * For every output i in [0, dstW):
 *     sum    = SUM(j < filterSize) src16[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> sh, 32767)             (stored as a 16-bit value)
 *
 * Arguments: a0 = c (not referenced), a1 = dst, a2 = dstW, a3 = src,
 *            a4 = filter, a5 = filterPos, a6 = filterSize, a7 = sh.
 * Constants: t0 = 32767, t7 = 4, t8 = 8, vr0 = sh in every word,
 *            vr20 = 32767 in every word.
 *
 * Code paths, selected by filterSize:
 *     == 4 : .LOOP_HS15_DST4, 8 outputs per iteration
 *     == 8 : .LOOP_HS15_DST8, 8 outputs per iteration
 *     >  8 : .LOOP_HS15,      4 outputs per iteration, 8 taps per vector step
 *     else : .END_HS15_DST4,  one output at a time
 * A vector iteration only runs while a full group of outputs remains; the
 * leftover outputs go through a scalar loop, and nothing is read or written
 * when dstW <= 0.
 *
 * function/endfunc are macros from the included header; endfunc is
 * presumably what emits the return - confirm in loongson_asm.S.
 */
function ff_hscale_16_to_15_sub_lsx
    /* 9 saved registers need 72 bytes; the frame is 80 so sp stays 16-byte aligned */
    addi.d sp, sp, -80
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    st.d s8, sp, 64
    li.w t0, 32767
    li.w t8, 8
    li.w t7, 4
    vreplgr2vr.w vr20, t0
    vreplgr2vr.w vr0, a7
    /* dispatch on filterSize; every path is entered through its "outputs left" test */
    beq a6, t7, .LOOP_HS15_DST4_CHK
    beq a6, t8, .LOOP_HS15_DST8_CHK
    blt t8, a6, .LOOP_HS15_CHK
    blt zero, a2, .END_HS15_DST4
    b .HS15_END

/* ---- filterSize > 8: 4 outputs per pass ---- */
.LOOP_HS15:
    li.w t1, 0
    /* s1 = taps already consumed by the vector loop, s2..s5 = scalar tail sums */
    li.w s1, 0
    li.w s2, 0
    li.w s3, 0
    li.w s4, 0
    li.w s5, 0
    /* vr22 = vector partial sums of the 4 outputs */
    vldi vr22, 0
    addi.w s0, a6, -7
    /* byte offsets of filter rows 1, 2 and 3 relative to row 0 */
    slli.w s7, a6, 1
    slli.w s8, a6, 2
    add.w t6, s7, s8
.LOOP_HS15_DST:
    ld.w t2, a5, 0
    ld.w t3, a5, 4
    ld.w t4, a5, 8
    ld.w t5, a5, 12
    /* filterPos counts 16-bit samples: convert to byte offsets */
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    slli.w t5, t5, 1
    /* 8 source samples for each of the 4 outputs */
    vldx vr1, a3, t2
    vldx vr2, a3, t3
    vldx vr3, a3, t4
    vldx vr4, a3, t5
    /* 8 coefficients from each of the 4 filter rows */
    vld vr9, a4, 0
    vldx vr10, a4, s7
    vldx vr11, a4, s8
    vldx vr12, a4, t6
    /* unsigned sample x signed coefficient: even lanes, then add the odd lanes */
    vmulwev.w.hu.h vr17, vr1, vr9
    vmulwev.w.hu.h vr18, vr2, vr10
    vmulwev.w.hu.h vr19, vr3, vr11
    vmulwev.w.hu.h vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    /* horizontal add down to one sum per output */
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    /* gather the 4 sums into the 4 words of vr1 and accumulate */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.d vr1, vr3, vr1
    vadd.w vr22, vr22, vr1
    addi.w s1, s1, 8
    addi.d a3, a3, 16
    addi.d a4, a4, 16
    /* another vector step while s1 < filterSize - 7 */
    blt s1, s0, .LOOP_HS15_DST
    blt s1, a6, .HS15_DSTA
    b .END_HS15_FILTERA
    /*
     * Scalar tail for the taps the vector loop did not cover. a3 and a4 were
     * already advanced past the consumed taps, so t3 restarts at 0 while s6
     * counts from s1 up to filterSize. t1 selects the filter row (output).
     */
.HS15_DSTA:
    ld.w t2, a5, 0
    li.w t3, 0
    move s6, s1
.HS15_FILTERA:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s2, s2, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS15_FILTERA
    ld.w t2, a5, 4
    li.w t3, 0
    move s6, s1
    addi.w t1, t1, 1
.HS15_FILTERB:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s3, s3, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS15_FILTERB
    ld.w t2, a5, 8
    addi.w t1, t1, 1
    li.w t3, 0
    move s6, s1
.HS15_FILTERC:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s4, s4, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS15_FILTERC
    ld.w t2, a5, 12
    addi.w t1, t1, 1
    move s6, s1
    li.w t3, 0
.HS15_FILTERD:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s5, s5, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
    /* total = vector partial sum + scalar tail sum */
    vpickve2gr.w t1, vr22, 0
    vpickve2gr.w t2, vr22, 1
    vpickve2gr.w t3, vr22, 2
    vpickve2gr.w t4, vr22, 3
    add.w s2, s2, t1
    add.w s3, s3, t2
    add.w s4, s4, t3
    add.w s5, s5, t4
    sra.w s2, s2, a7
    sra.w s3, s3, a7
    sra.w s4, s4, a7
    sra.w s5, s5, a7
    /* branch-free min(value, t0): keep the value if it is below t0, else take t0 */
    slt t1, s2, t0
    slt t2, s3, t0
    slt t3, s4, t0
    slt t4, s5, t0
    maskeqz s2, s2, t1
    maskeqz s3, s3, t2
    maskeqz s4, s4, t3
    maskeqz s5, s5, t4
    masknez t1, t0, t1
    masknez t2, t0, t2
    masknez t3, t0, t3
    masknez t4, t0, t4
    or s2, s2, t1
    or s3, s3, t2
    or s4, s4, t3
    or s5, s5, t4
    st.h s2, a1, 0
    st.h s3, a1, 2
    st.h s4, a1, 4
    st.h s5, a1, 6
    addi.d a1, a1, 8
    /* undo the src advance made by the vector loop (2 bytes per tap) */
    sub.d a3, a3, s1
    sub.d a3, a3, s1
    addi.d a5, a5, 16
    /* filter: forward by 4 rows (8 * filterSize bytes), back by the 2 * s1 bytes consumed */
    slli.d t3, a6, 3
    add.d a4, a4, t3
    sub.d a4, a4, s1
    sub.d a4, a4, s1
    addi.d a2, a2, -4
.LOOP_HS15_CHK:
    /* run a 4-output pass only while at least 4 outputs remain */
    bge a2, t7, .LOOP_HS15
    blt zero, a2, .HS15_RESA
    b .HS15_END
    /* leftover outputs: t1 = output index, t3 = tap index, t8 = sum */
.HS15_RESA:
    li.w t1, 0
.HS15_DST:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS15_FILTER:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS15_FILTER
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS15_DST
    b .HS15_END

/* ---- filterSize == 8: 8 outputs per pass ---- */
.LOOP_HS15_DST8:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    vldx vr1, a3, t1
    vldx vr2, a3, t2
    vldx vr3, a3, t3
    vldx vr4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    vldx vr5, a3, t1
    vldx vr6, a3, t2
    vldx vr7, a3, t3
    vldx vr8, a3, t4
    /* 8 filter rows of 8 coefficients (16 bytes each) */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    vld vr13, a4, 64
    vld vr14, a4, 80
    vld vr15, a4, 96
    vld vr16, a4, 112
    vmulwev.w.hu.h vr17, vr1, vr9
    vmulwev.w.hu.h vr18, vr2, vr10
    vmulwev.w.hu.h vr19, vr3, vr11
    vmulwev.w.hu.h vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h vr1, vr5, vr13
    vmulwev.w.hu.h vr2, vr6, vr14
    vmulwev.w.hu.h vr3, vr7, vr15
    vmulwev.w.hu.h vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    /* outputs 4..7 end up in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w vr5, vr1, vr1
    vhaddw.d.w vr6, vr2, vr2
    vhaddw.d.w vr7, vr3, vr3
    vhaddw.d.w vr8, vr4, vr4
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    vhaddw.q.d vr5, vr5, vr5
    vhaddw.q.d vr6, vr6, vr6
    vhaddw.q.d vr7, vr7, vr7
    vhaddw.q.d vr8, vr8, vr8
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.d vr1, vr3, vr1
    vilvl.d vr5, vr7, vr5
    vsra.w vr1, vr1, vr0
    vsra.w vr5, vr5, vr0
    vmin.w vr1, vr1, vr20
    vmin.w vr5, vr5, vr20
    /* keep the low halfword of each word: 8 x int16 */
    vpickev.h vr1, vr5, vr1
    vst vr1, a1, 0
    addi.d a1, a1, 16
    addi.d a5, a5, 32
    addi.d a4, a4, 128
    addi.d a2, a2, -8
.LOOP_HS15_DST8_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_HS15_DST8
    blt zero, a2, .HS15_REST8
    b .HS15_END
    /* leftover outputs, one at a time */
.HS15_REST8:
    li.w t1, 0
.HS15_DST8:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS15_FILTER8:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS15_FILTER8
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS15_DST8
    b .HS15_END

/* ---- filterSize == 4: 8 outputs per pass ---- */
.LOOP_HS15_DST4:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    /* 4 source samples (8 bytes) per output */
    fldx.d f1, a3, t1
    fldx.d f2, a3, t2
    fldx.d f3, a3, t3
    fldx.d f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    fldx.d f5, a3, t1
    fldx.d f6, a3, t2
    fldx.d f7, a3, t3
    fldx.d f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    /* pair the samples of two neighbouring outputs in one register */
    vilvl.d vr1, vr2, vr1
    vilvl.d vr3, vr4, vr3
    vilvl.d vr5, vr6, vr5
    vilvl.d vr7, vr8, vr7
    vmulwev.w.hu.h vr13, vr1, vr9
    vmulwev.w.hu.h vr14, vr3, vr10
    vmulwev.w.hu.h vr15, vr5, vr11
    vmulwev.w.hu.h vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    /* each doubleword now holds the sum of one output */
    vhaddw.d.w vr13, vr13, vr13
    vhaddw.d.w vr14, vr14, vr14
    vhaddw.d.w vr15, vr15, vr15
    vhaddw.d.w vr16, vr16, vr16
    vpickev.w vr13, vr14, vr13
    vpickev.w vr15, vr16, vr15
    vsra.w vr13, vr13, vr0
    vsra.w vr15, vr15, vr0
    vmin.w vr13, vr13, vr20
    vmin.w vr15, vr15, vr20
    vpickev.h vr13, vr15, vr13
    vst vr13, a1, 0
    addi.d a1, a1, 16
    addi.d a5, a5, 32
    addi.d a4, a4, 64
    addi.d a2, a2, -8
.LOOP_HS15_DST4_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_HS15_DST4
    blt zero, a2, .HS15_REST4
    b .HS15_END
    /* leftover outputs, one at a time */
.HS15_REST4:
    li.w t1, 0
.HS15_DST4:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS15_FILTER4:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS15_FILTER4
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS15_DST4
    b .HS15_END

/* ---- any other filterSize: plain scalar loop over all outputs ---- */
.END_HS15_DST4:
    li.w t1, 0
.LOOP_HS15_DST1:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS15_FILTER1:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS15_FILTER1
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 1
    stx.h t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .LOOP_HS15_DST1
    b .HS15_END
.HS15_END:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    ld.d s8, sp, 64
    addi.d sp, sp, 80
endfunc
/*
 * void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * src is addressed as an array of unsigned 16-bit samples (element index
 * scaled by 2, loaded with ldx.hu / unsigned-by-signed widening multiplies).
 * For every output i in [0, dstW):
 *     sum    = SUM(j < filterSize) src16[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> sh, 524287)
 * Although the prototype names dst as int16_t *, every result is stored as a
 * 32-bit word (st.w / stx.w / full vst of word vectors).
 *
 * Arguments: a0 = c (not referenced), a1 = dst, a2 = dstW, a3 = src,
 *            a4 = filter, a5 = filterPos, a6 = filterSize, a7 = sh.
 * Constants: t0 = 524287, t7 = 4, t8 = 8, vr0 = sh in every word,
 *            vr20 = 524287 in every word.
 *
 * Code paths, selected by filterSize:
 *     == 4 : .LOOP_HS19_DST4, 8 outputs per iteration
 *     == 8 : .LOOP_HS19_DST8, 8 outputs per iteration
 *     >  8 : .LOOP_HS19,      4 outputs per iteration, 8 taps per vector step
 *     else : .END_HS19_DST4,  one output at a time
 * A vector iteration only runs while a full group of outputs remains; the
 * leftover outputs go through a scalar loop, and nothing is read or written
 * when dstW <= 0.
 *
 * function/endfunc are macros from the included header; endfunc is
 * presumably what emits the return - confirm in loongson_asm.S.
 */
function ff_hscale_16_to_19_sub_lsx
    /* 9 saved registers need 72 bytes; the frame is 80 so sp stays 16-byte aligned */
    addi.d sp, sp, -80
    st.d s0, sp, 0
    st.d s1, sp, 8
    st.d s2, sp, 16
    st.d s3, sp, 24
    st.d s4, sp, 32
    st.d s5, sp, 40
    st.d s6, sp, 48
    st.d s7, sp, 56
    st.d s8, sp, 64
    li.w t0, 524287
    li.w t8, 8
    li.w t7, 4
    vreplgr2vr.w vr20, t0
    vreplgr2vr.w vr0, a7
    /* dispatch on filterSize; every path is entered through its "outputs left" test */
    beq a6, t7, .LOOP_HS19_DST4_CHK
    beq a6, t8, .LOOP_HS19_DST8_CHK
    blt t8, a6, .LOOP_HS19_CHK
    blt zero, a2, .END_HS19_DST4
    b .HS19_END

/* ---- filterSize > 8: 4 outputs per pass ---- */
.LOOP_HS19:
    li.w t1, 0
    /* s1 = taps already consumed by the vector loop, s2..s5 = scalar tail sums */
    li.w s1, 0
    li.w s2, 0
    li.w s3, 0
    li.w s4, 0
    li.w s5, 0
    /* vr22 = vector partial sums of the 4 outputs */
    vldi vr22, 0
    addi.w s0, a6, -7
    /* byte offsets of filter rows 1, 2 and 3 relative to row 0 */
    slli.w s7, a6, 1
    slli.w s8, a6, 2
    add.w t6, s7, s8
.LOOP_HS19_DST:
    ld.w t2, a5, 0
    ld.w t3, a5, 4
    ld.w t4, a5, 8
    ld.w t5, a5, 12
    /* filterPos counts 16-bit samples: convert to byte offsets */
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    slli.w t5, t5, 1
    /* 8 source samples for each of the 4 outputs */
    vldx vr1, a3, t2
    vldx vr2, a3, t3
    vldx vr3, a3, t4
    vldx vr4, a3, t5
    /* 8 coefficients from each of the 4 filter rows */
    vld vr9, a4, 0
    vldx vr10, a4, s7
    vldx vr11, a4, s8
    vldx vr12, a4, t6
    /* unsigned sample x signed coefficient: even lanes, then add the odd lanes */
    vmulwev.w.hu.h vr17, vr1, vr9
    vmulwev.w.hu.h vr18, vr2, vr10
    vmulwev.w.hu.h vr19, vr3, vr11
    vmulwev.w.hu.h vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    /* horizontal add down to one sum per output */
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    /* gather the 4 sums into the 4 words of vr1 and accumulate */
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.d vr1, vr3, vr1
    vadd.w vr22, vr22, vr1
    addi.w s1, s1, 8
    addi.d a3, a3, 16
    addi.d a4, a4, 16
    /* another vector step while s1 < filterSize - 7 */
    blt s1, s0, .LOOP_HS19_DST
    blt s1, a6, .HS19_DSTA
    b .END_HS19_FILTERA
    /*
     * Scalar tail for the taps the vector loop did not cover. a3 and a4 were
     * already advanced past the consumed taps, so t3 restarts at 0 while s6
     * counts from s1 up to filterSize. t1 selects the filter row (output).
     */
.HS19_DSTA:
    ld.w t2, a5, 0
    li.w t3, 0
    move s6, s1
.HS19_FILTERA:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s2, s2, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS19_FILTERA
    ld.w t2, a5, 4
    li.w t3, 0
    move s6, s1
    addi.w t1, t1, 1
.HS19_FILTERB:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s3, s3, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS19_FILTERB
    ld.w t2, a5, 8
    addi.w t1, t1, 1
    li.w t3, 0
    move s6, s1
.HS19_FILTERC:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s4, s4, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS19_FILTERC
    ld.w t2, a5, 12
    addi.w t1, t1, 1
    move s6, s1
    li.w t3, 0
.HS19_FILTERD:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t6, t6, 1
    ldx.h t6, a4, t6
    mul.w t6, t5, t6
    add.w s5, s5, t6
    addi.w t3, t3, 1
    addi.w s6, s6, 1
    blt s6, a6, .HS19_FILTERD
.END_HS19_FILTERA:
    /* total = vector partial sum + scalar tail sum */
    vpickve2gr.w t1, vr22, 0
    vpickve2gr.w t2, vr22, 1
    vpickve2gr.w t3, vr22, 2
    vpickve2gr.w t4, vr22, 3
    add.w s2, s2, t1
    add.w s3, s3, t2
    add.w s4, s4, t3
    add.w s5, s5, t4
    sra.w s2, s2, a7
    sra.w s3, s3, a7
    sra.w s4, s4, a7
    sra.w s5, s5, a7
    /* branch-free min(value, t0): keep the value if it is below t0, else take t0 */
    slt t1, s2, t0
    slt t2, s3, t0
    slt t3, s4, t0
    slt t4, s5, t0
    maskeqz s2, s2, t1
    maskeqz s3, s3, t2
    maskeqz s4, s4, t3
    maskeqz s5, s5, t4
    masknez t1, t0, t1
    masknez t2, t0, t2
    masknez t3, t0, t3
    masknez t4, t0, t4
    or s2, s2, t1
    or s3, s3, t2
    or s4, s4, t3
    or s5, s5, t4
    st.w s2, a1, 0
    st.w s3, a1, 4
    st.w s4, a1, 8
    st.w s5, a1, 12
    addi.d a1, a1, 16
    /* undo the src advance made by the vector loop (2 bytes per tap) */
    sub.d a3, a3, s1
    sub.d a3, a3, s1
    addi.d a5, a5, 16
    /* filter: forward by 4 rows (8 * filterSize bytes), back by the 2 * s1 bytes consumed */
    slli.d t3, a6, 3
    add.d a4, a4, t3
    sub.d a4, a4, s1
    sub.d a4, a4, s1
    addi.d a2, a2, -4
.LOOP_HS19_CHK:
    /* run a 4-output pass only while at least 4 outputs remain */
    bge a2, t7, .LOOP_HS19
    blt zero, a2, .HS19_RESA
    b .HS19_END
    /* leftover outputs: t1 = output index, t3 = tap index, t8 = sum */
.HS19_RESA:
    li.w t1, 0
.HS19_DST:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS19_FILTER:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS19_FILTER
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS19_DST
    b .HS19_END

/* ---- filterSize == 8: 8 outputs per pass ---- */
.LOOP_HS19_DST8:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    vldx vr1, a3, t1
    vldx vr2, a3, t2
    vldx vr3, a3, t3
    vldx vr4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    vldx vr5, a3, t1
    vldx vr6, a3, t2
    vldx vr7, a3, t3
    vldx vr8, a3, t4
    /* 8 filter rows of 8 coefficients (16 bytes each) */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    vld vr13, a4, 64
    vld vr14, a4, 80
    vld vr15, a4, 96
    vld vr16, a4, 112
    vmulwev.w.hu.h vr17, vr1, vr9
    vmulwev.w.hu.h vr18, vr2, vr10
    vmulwev.w.hu.h vr19, vr3, vr11
    vmulwev.w.hu.h vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h vr1, vr5, vr13
    vmulwev.w.hu.h vr2, vr6, vr14
    vmulwev.w.hu.h vr3, vr7, vr15
    vmulwev.w.hu.h vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    /* outputs 4..7 end up in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w vr5, vr1, vr1
    vhaddw.d.w vr6, vr2, vr2
    vhaddw.d.w vr7, vr3, vr3
    vhaddw.d.w vr8, vr4, vr4
    vhaddw.d.w vr1, vr17, vr17
    vhaddw.d.w vr2, vr18, vr18
    vhaddw.d.w vr3, vr19, vr19
    vhaddw.d.w vr4, vr21, vr21
    vhaddw.q.d vr1, vr1, vr1
    vhaddw.q.d vr2, vr2, vr2
    vhaddw.q.d vr3, vr3, vr3
    vhaddw.q.d vr4, vr4, vr4
    vhaddw.q.d vr5, vr5, vr5
    vhaddw.q.d vr6, vr6, vr6
    vhaddw.q.d vr7, vr7, vr7
    vhaddw.q.d vr8, vr8, vr8
    vilvl.w vr1, vr2, vr1
    vilvl.w vr3, vr4, vr3
    vilvl.w vr5, vr6, vr5
    vilvl.w vr7, vr8, vr7
    vilvl.d vr1, vr3, vr1
    vilvl.d vr5, vr7, vr5
    vsra.w vr1, vr1, vr0
    vsra.w vr5, vr5, vr0
    vmin.w vr1, vr1, vr20
    vmin.w vr5, vr5, vr20
    /* 8 x 32-bit results */
    vst vr1, a1, 0
    vst vr5, a1, 16
    addi.d a1, a1, 32
    addi.d a5, a5, 32
    addi.d a4, a4, 128
    addi.d a2, a2, -8
.LOOP_HS19_DST8_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_HS19_DST8
    blt zero, a2, .HS19_REST8
    b .HS19_END
    /* leftover outputs, one at a time */
.HS19_REST8:
    li.w t1, 0
.HS19_DST8:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS19_FILTER8:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS19_FILTER8
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS19_DST8
    b .HS19_END

/* ---- filterSize == 4: 8 outputs per pass ---- */
.LOOP_HS19_DST4:
    ld.w t1, a5, 0
    ld.w t2, a5, 4
    ld.w t3, a5, 8
    ld.w t4, a5, 12
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    /* 4 source samples (8 bytes) per output */
    fldx.d f1, a3, t1
    fldx.d f2, a3, t2
    fldx.d f3, a3, t3
    fldx.d f4, a3, t4
    ld.w t1, a5, 16
    ld.w t2, a5, 20
    ld.w t3, a5, 24
    ld.w t4, a5, 28
    slli.w t1, t1, 1
    slli.w t2, t2, 1
    slli.w t3, t3, 1
    slli.w t4, t4, 1
    fldx.d f5, a3, t1
    fldx.d f6, a3, t2
    fldx.d f7, a3, t3
    fldx.d f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld vr9, a4, 0
    vld vr10, a4, 16
    vld vr11, a4, 32
    vld vr12, a4, 48
    /* pair the samples of two neighbouring outputs in one register */
    vilvl.d vr1, vr2, vr1
    vilvl.d vr3, vr4, vr3
    vilvl.d vr5, vr6, vr5
    vilvl.d vr7, vr8, vr7
    vmulwev.w.hu.h vr13, vr1, vr9
    vmulwev.w.hu.h vr14, vr3, vr10
    vmulwev.w.hu.h vr15, vr5, vr11
    vmulwev.w.hu.h vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    /* each doubleword now holds the sum of one output */
    vhaddw.d.w vr13, vr13, vr13
    vhaddw.d.w vr14, vr14, vr14
    vhaddw.d.w vr15, vr15, vr15
    vhaddw.d.w vr16, vr16, vr16
    vpickev.w vr13, vr14, vr13
    vpickev.w vr15, vr16, vr15
    vsra.w vr13, vr13, vr0
    vsra.w vr15, vr15, vr0
    vmin.w vr13, vr13, vr20
    vmin.w vr15, vr15, vr20
    vst vr13, a1, 0
    vst vr15, a1, 16
    addi.d a1, a1, 32
    addi.d a5, a5, 32
    addi.d a4, a4, 64
    addi.d a2, a2, -8
.LOOP_HS19_DST4_CHK:
    /* run an 8-output pass only while at least 8 outputs remain */
    bge a2, t8, .LOOP_HS19_DST4
    blt zero, a2, .HS19_REST4
    b .HS19_END
    /* leftover outputs, one at a time */
.HS19_REST4:
    li.w t1, 0
.HS19_DST4:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS19_FILTER4:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS19_FILTER4
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .HS19_DST4
    b .HS19_END

/* ---- any other filterSize: plain scalar loop over all outputs ---- */
.END_HS19_DST4:
    li.w t1, 0
.LOOP_HS19_DST1:
    slli.w t2, t1, 2
    ldx.w t2, a5, t2
    li.w t3, 0
    li.w t8, 0
.HS19_FILTER1:
    add.w t4, t2, t3
    slli.w t4, t4, 1
    ldx.hu t5, a3, t4
    mul.w t6, a6, t1
    add.w t6, t6, t3
    slli.w t7, t6, 1
    ldx.h t7, a4, t7
    mul.w t7, t5, t7
    add.w t8, t8, t7
    addi.w t3, t3, 1
    blt t3, a6, .HS19_FILTER1
    sra.w t8, t8, a7
    slt t5, t8, t0
    maskeqz t8, t8, t5
    masknez t5, t0, t5
    or t8, t8, t5
    slli.w t4, t1, 2
    stx.w t8, a1, t4
    addi.w t1, t1, 1
    blt t1, a2, .LOOP_HS19_DST1
    b .HS19_END
.HS19_END:
    ld.d s0, sp, 0
    ld.d s1, sp, 8
    ld.d s2, sp, 16
    ld.d s3, sp, 24
    ld.d s4, sp, 32
    ld.d s5, sp, 40
    ld.d s6, sp, 48
    ld.d s7, sp, 56
    ld.d s8, sp, 64
    addi.d sp, sp, 80
endfunc