ffmpeg/libswscale/loongarch/swscale.S

2237 lines
80 KiB
ArmAsm

/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/loongarch/loongson_asm.S"
/* void ff_hscale_8_to_15_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * Horizontal FIR scaler, 8-bit input to 15-bit output. Output i is
 *     sum over j < filterSize of src[filterPos[i] + j] * filter[filterSize * i + j]
 * shifted right arithmetically by 7, limited to at most 32767 and stored as
 * a halfword at dst[i].
 *
 * Arguments as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize. a0 (c) is never read.
 *
 * Four code paths are selected by filterSize:
 *   == 4  .LOOP_DSTW4  eight outputs per iteration
 *   == 8  .LOOP_DSTW8  eight outputs per iteration
 *   >  8  .LOOP_START  four outputs per iteration, taps in groups of eight
 *   else  .END_DSTW4   one output per iteration, scalar
 * Each vector path is followed by a scalar loop for the leftover outputs.
 *
 * NOTE(review): every loop tests its condition at the bottom, so the chosen
 * path always produces one full batch (8, 4 or 1 outputs) and each scalar
 * tap loop runs at least once, even if dstW or filterSize is smaller.
 * Presumably callers guarantee suitable sizes/padding - confirm.
 */
function ff_hscale_8_to_15_lsx
/* Save s0-s8; they are used as scratch by the filterSize > 8 path and
 * restored at .END_LOOP.
 * NOTE(review): the 72-byte frame is not a multiple of 16 - confirm this is
 * acceptable for the target ABI. */
addi.d sp, sp, -72
st.d s0, sp, 0
st.d s1, sp, 8
st.d s2, sp, 16
st.d s3, sp, 24
st.d s4, sp, 32
st.d s5, sp, 40
st.d s6, sp, 48
st.d s7, sp, 56
st.d s8, sp, 64
/* t0 = 32767 (upper clamp), t8 = 8, t7 = 4, vr0 = zero (used to widen bytes),
 * vr20 = clamp value in every word. */
li.w t0, 32767
li.w t8, 8
li.w t7, 4
vldi vr0, 0
vreplgr2vr.w vr20, t0
/* Dispatch on filterSize (a6). */
beq a6, t7, .LOOP_DSTW4
beq a6, t8, .LOOP_DSTW8
blt t8, a6, .LOOP_START
b .END_DSTW4
/* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP_START:
/* t1 = filter row index for the scalar tap tail, s1 = taps consumed by the
 * vector loop, s2-s5 = scalar tail sums for outputs 0-3, vr22 = vector sums
 * (one word per output). */
li.w t1, 0
li.w s1, 0
li.w s2, 0
li.w s3, 0
li.w s4, 0
li.w s5, 0
vldi vr22, 0
/* s0 = filterSize - 7 (vector loop bound); s7, s8, t6 = byte offsets of
 * filter rows 1, 2 and 3 (filterSize * 2, * 4 and * 6). */
addi.w s0, a6, -7
slli.w s7, a6, 1
slli.w s8, a6, 2
add.w t6, s7, s8
.LOOP_DSTW:
/* t2-t5 = filterPos[0..3] */
ld.w t2, a5, 0
ld.w t3, a5, 4
ld.w t4, a5, 8
ld.w t5, a5, 12
/* Eight source bytes per output. f1-f4 are read back below as vr1-vr4
 * (the code relies on fN being the low 64 bits of vrN). */
fldx.d f1, a3, t2
fldx.d f2, a3, t3
fldx.d f3, a3, t4
fldx.d f4, a3, t5
/* Eight coefficients from each of the four filter rows. */
vld vr9, a4, 0
vldx vr10, a4, s7
vldx vr11, a4, s8
vldx vr12, a4, t6
/* Interleave with zero: widen the eight source bytes to halfwords. */
vilvl.b vr1, vr0, vr1
vilvl.b vr2, vr0, vr2
vilvl.b vr3, vr0, vr3
vilvl.b vr4, vr0, vr4
/* Multiply source by coefficients into word sums (vdp2.w.h is not defined
 * in this file; presumably a dot-product macro from loongson_asm.S). */
vdp2.w.h vr17, vr1, vr9
vdp2.w.h vr18, vr2, vr10
vdp2.w.h vr19, vr3, vr11
vdp2.w.h vr21, vr4, vr12
/* Horizontal adds: reduce each register to a single total. */
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
/* Gather the four totals into words 0-3 of vr1 and accumulate. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vadd.w vr22, vr22, vr1
/* Advance by eight taps: 8 source bytes, 16 coefficient bytes. */
addi.w s1, s1, 8
addi.d a3, a3, 8
addi.d a4, a4, 16
/* Repeat while s1 < filterSize - 7; then handle any remaining taps. */
blt s1, s0, .LOOP_DSTW
blt s1, a6, .DSTWA
b .END_FILTER
/* Scalar tap tail, output 0: t2 = filterPos[0], t3 = tap offset relative to
 * the advanced a3/a4, s6 counts taps from s1 up to filterSize. */
.DSTWA:
ld.w t2, a5, 0
li.w t3, 0
move s6, s1
.FILTERSIZEA:
/* s2 += src byte * coefficient at halfword index filterSize * t1 + t3 */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s2, s2, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERSIZEA
/* Output 1: filterPos[1], filter row t1 = 1, sum in s3. */
ld.w t2, a5, 4
li.w t3, 0
move s6, s1
addi.w t1, t1, 1
.FILTERSIZEB:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s3, s3, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERSIZEB
/* Output 2: filterPos[2], filter row t1 = 2, sum in s4. */
ld.w t2, a5, 8
addi.w t1, t1, 1
li.w t3, 0
move s6, s1
.FILTERSIZEC:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s4, s4, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERSIZEC
/* Output 3: filterPos[3], filter row t1 = 3, sum in s5. */
ld.w t2, a5, 12
addi.w t1, t1, 1
move s6, s1
li.w t3, 0
.FILTERSIZED:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s5, s5, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERSIZED
.END_FILTER:
/* Combine vector sums (vr22 words 0-3) with the scalar tail sums. */
vpickve2gr.w t1, vr22, 0
vpickve2gr.w t2, vr22, 1
vpickve2gr.w t3, vr22, 2
vpickve2gr.w t4, vr22, 3
add.w s2, s2, t1
add.w s3, s3, t2
add.w s4, s4, t3
add.w s5, s5, t4
/* Scale down by 7 bits. */
srai.w s2, s2, 7
srai.w s3, s3, 7
srai.w s4, s4, 7
srai.w s5, s5, 7
/* Branch-free min(value, t0): tN = (value < t0); keep value where tN != 0,
 * substitute t0 where tN == 0, then merge with or. */
slt t1, s2, t0
slt t2, s3, t0
slt t3, s4, t0
slt t4, s5, t0
maskeqz s2, s2, t1
maskeqz s3, s3, t2
maskeqz s4, s4, t3
maskeqz s5, s5, t4
masknez t1, t0, t1
masknez t2, t0, t2
masknez t3, t0, t3
masknez t4, t0, t4
or s2, s2, t1
or s3, s3, t2
or s4, s4, t3
or s5, s5, t4
/* Store four halfword outputs and step dst by 8 bytes. */
st.h s2, a1, 0
st.h s3, a1, 2
st.h s4, a1, 4
st.h s5, a1, 6
addi.d a1, a1, 8
/* Undo the vector loop's advance of src (s1 bytes). */
sub.d a3, a3, s1
/* Next four filterPos entries. */
addi.d a5, a5, 16
/* filter: add four rows (filterSize * 8 bytes) and undo the vector loop's
 * advance (2 * s1 bytes). */
slli.d t3, a6, 3
add.d a4, a4, t3
sub.d a4, a4, s1
sub.d a4, a4, s1
/* Four outputs done; repeat while at least four remain (t7 = 4). */
addi.d a2, a2, -4
bge a2, t7, .LOOP_START
blt zero, a2, .RES
b .END_LOOP
/* Scalar loop for the remaining a2 (1-3) outputs of the filterSize > 8 path.
 * t1 = output index, t3 = tap index, t8 = sum; t7 is reused as scratch here. */
.RES:
li.w t1, 0
.DSTW:
/* t2 = filterPos[t1] */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTERSIZE:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTERSIZE
/* >> 7, min with t0, store halfword at dst[t1]. */
srai.w t8, t8, 7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DSTW
b .END_LOOP
/* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_DSTW8:
/* filterPos[0..3] and eight source bytes for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
fldx.d f1, a3, t1
fldx.d f2, a3, t2
fldx.d f3, a3, t3
fldx.d f4, a3, t4
/* filterPos[4..7] and eight source bytes for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
fldx.d f5, a3, t1
fldx.d f6, a3, t2
fldx.d f7, a3, t3
fldx.d f8, a3, t4
/* Eight filter rows of eight coefficients (16 bytes each). */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
vld vr13, a4, 64
vld vr14, a4, 80
vld vr15, a4, 96
vld vr16, a4, 112
/* Widen source bytes to halfwords (interleave with zero). */
vilvl.b vr1, vr0, vr1
vilvl.b vr2, vr0, vr2
vilvl.b vr3, vr0, vr3
vilvl.b vr4, vr0, vr4
vilvl.b vr5, vr0, vr5
vilvl.b vr6, vr0, vr6
vilvl.b vr7, vr0, vr7
vilvl.b vr8, vr0, vr8
/* Products: outputs 0-3 into vr17, vr18, vr19, vr21; outputs 4-7 into
 * vr1-vr4 (their source values are no longer needed). */
vdp2.w.h vr17, vr1, vr9
vdp2.w.h vr18, vr2, vr10
vdp2.w.h vr19, vr3, vr11
vdp2.w.h vr21, vr4, vr12
vdp2.w.h vr1, vr5, vr13
vdp2.w.h vr2, vr6, vr14
vdp2.w.h vr3, vr7, vr15
vdp2.w.h vr4, vr8, vr16
/* Horizontal adds; afterwards vr1-vr4 hold outputs 0-3 and vr5-vr8 hold
 * outputs 4-7. */
vhaddw.d.w vr5, vr1, vr1
vhaddw.d.w vr6, vr2, vr2
vhaddw.d.w vr7, vr3, vr3
vhaddw.d.w vr8, vr4, vr4
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
vhaddw.q.d vr5, vr5, vr5
vhaddw.q.d vr6, vr6, vr6
vhaddw.q.d vr7, vr7, vr7
vhaddw.q.d vr8, vr8, vr8
/* Gather totals: vr1 = outputs 0-3, vr5 = outputs 4-7 (one word each). */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr1, vr3, vr1
vilvl.d vr5, vr7, vr5
/* >> 7, min with vr20, narrow words to halfwords, store eight outputs. */
vsrai.w vr1, vr1, 7
vsrai.w vr5, vr5, 7
vmin.w vr1, vr1, vr20
vmin.w vr5, vr5, vr20
vpickev.h vr1, vr5, vr1
vst vr1, a1, 0
/* Advance dst (16 bytes), filterPos (8 entries), filter (8 rows). */
addi.d a1, a1, 16
addi.d a5, a5, 32
addi.d a4, a4, 128
addi.d a2, a2, -8
bge a2, t8, .LOOP_DSTW8
blt zero, a2, .RES8
b .END_LOOP
/* Scalar loop for the remaining a2 outputs of the filterSize == 8 path.
 * t8 (previously the constant 8) and t7 are reused as scratch. */
.RES8:
li.w t1, 0
.DSTW8:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTERSIZE8:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTERSIZE8
srai.w t8, t8, 7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DSTW8
b .END_LOOP
/* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_DSTW4:
/* filterPos[0..3] and four source bytes for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
fldx.s f1, a3, t1
fldx.s f2, a3, t2
fldx.s f3, a3, t3
fldx.s f4, a3, t4
/* filterPos[4..7] and four source bytes for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
fldx.s f5, a3, t1
fldx.s f6, a3, t2
fldx.s f7, a3, t3
fldx.s f8, a3, t4
/* Each 16-byte load covers two filter rows of four coefficients. */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
/* Pair up the 4-byte source groups of two neighbouring outputs so that the
 * low eight bytes match one two-row coefficient vector. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
/* Widen those eight bytes to halfwords. */
vilvl.b vr1, vr0, vr1
vilvl.b vr3, vr0, vr3
vilvl.b vr5, vr0, vr5
vilvl.b vr7, vr0, vr7
/* Products and per-output totals (two outputs per register). */
vdp2.w.h vr13, vr1, vr9
vdp2.w.h vr14, vr3, vr10
vdp2.w.h vr15, vr5, vr11
vdp2.w.h vr16, vr7, vr12
vhaddw.d.w vr13, vr13, vr13
vhaddw.d.w vr14, vr14, vr14
vhaddw.d.w vr15, vr15, vr15
vhaddw.d.w vr16, vr16, vr16
/* Keep the even words: vr13 = outputs 0-3, vr15 = outputs 4-7. */
vpickev.w vr13, vr14, vr13
vpickev.w vr15, vr16, vr15
/* >> 7, min with vr20, narrow to halfwords, store eight outputs. */
vsrai.w vr13, vr13, 7
vsrai.w vr15, vr15, 7
vmin.w vr13, vr13, vr20
vmin.w vr15, vr15, vr20
vpickev.h vr13, vr15, vr13
vst vr13, a1, 0
/* Advance dst (16 bytes), filterPos (8 entries), filter (8 rows of 8 bytes). */
addi.d a1, a1, 16
addi.d a5, a5, 32
addi.d a4, a4, 64
addi.d a2, a2, -8
bge a2, t8, .LOOP_DSTW4
blt zero, a2, .RES4
b .END_LOOP
/* Scalar loop for the remaining a2 outputs of the filterSize == 4 path. */
.RES4:
li.w t1, 0
.DSTW4:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTERSIZE4:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTERSIZE4
srai.w t8, t8, 7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DSTW4
b .END_LOOP
/* ---- any other filterSize: fully scalar, one output per iteration ---- */
.END_DSTW4:
li.w t1, 0
.LOOP_DSTW1:
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum. */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTERSIZE1:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTERSIZE1
/* >> 7, min with t0, store halfword at dst[t1]. */
srai.w t8, t8, 7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .LOOP_DSTW1
b .END_LOOP
/* Common exit: restore s0-s8 and release the frame. No explicit return
 * instruction appears here; presumably endfunc (not defined in this file)
 * emits it - confirm in loongson_asm.S. */
.END_LOOP:
ld.d s0, sp, 0
ld.d s1, sp, 8
ld.d s2, sp, 16
ld.d s3, sp, 24
ld.d s4, sp, 32
ld.d s5, sp, 40
ld.d s6, sp, 48
ld.d s7, sp, 56
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
/* void ff_hscale_8_to_19_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * Horizontal FIR scaler, 8-bit input to 19-bit output. Output i is
 *     sum over j < filterSize of src[filterPos[i] + j] * filter[filterSize * i + j]
 * shifted right arithmetically by 3 and limited to at most 524287.
 * Although the prototype declares dst as int16_t *, every result is stored
 * as a 32-bit word at byte offset 4 * i from dst.
 *
 * Arguments as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize. a0 (c) is never read.
 *
 * Four code paths are selected by filterSize:
 *   == 4  .LOOP_DST4  eight outputs per iteration
 *   == 8  .LOOP_DST8  eight outputs per iteration
 *   >  8  .LOOP       four outputs per iteration, taps in groups of eight
 *   else  .END_DST4   one output per iteration, scalar
 * Each vector path is followed by a scalar loop for the leftover outputs.
 *
 * NOTE(review): every loop tests its condition at the bottom, so the chosen
 * path always produces one full batch (8, 4 or 1 outputs) even if dstW is
 * smaller. Presumably callers guarantee suitable sizes/padding - confirm.
 */
function ff_hscale_8_to_19_lsx
/* Save s0-s8 (scratch for the filterSize > 8 path); restored at .END.
 * NOTE(review): the 72-byte frame is not a multiple of 16 - confirm this is
 * acceptable for the target ABI. */
addi.d sp, sp, -72
st.d s0, sp, 0
st.d s1, sp, 8
st.d s2, sp, 16
st.d s3, sp, 24
st.d s4, sp, 32
st.d s5, sp, 40
st.d s6, sp, 48
st.d s7, sp, 56
st.d s8, sp, 64
/* t0 = 524287 (upper clamp), t8 = 8, t7 = 4, vr0 = zero (used to widen
 * bytes), vr20 = clamp value in every word. */
li.w t0, 524287
li.w t8, 8
li.w t7, 4
vldi vr0, 0
vreplgr2vr.w vr20, t0
/* Dispatch on filterSize (a6). */
beq a6, t7, .LOOP_DST4
beq a6, t8, .LOOP_DST8
blt t8, a6, .LOOP
b .END_DST4
/* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP:
/* t1 = filter row index for the scalar tap tail, s1 = taps consumed by the
 * vector loop, s2-s5 = scalar tail sums for outputs 0-3, vr22 = vector sums
 * (one word per output). */
li.w t1, 0
li.w s1, 0
li.w s2, 0
li.w s3, 0
li.w s4, 0
li.w s5, 0
vldi vr22, 0
/* s0 = filterSize - 7 (vector loop bound); s7, s8, t6 = byte offsets of
 * filter rows 1, 2 and 3 (filterSize * 2, * 4 and * 6). */
addi.w s0, a6, -7
slli.w s7, a6, 1
slli.w s8, a6, 2
add.w t6, s7, s8
.LOOP_DST:
/* t2-t5 = filterPos[0..3] */
ld.w t2, a5, 0
ld.w t3, a5, 4
ld.w t4, a5, 8
ld.w t5, a5, 12
/* Eight source bytes per output. f1-f4 are read back below as vr1-vr4
 * (the code relies on fN being the low 64 bits of vrN). */
fldx.d f1, a3, t2
fldx.d f2, a3, t3
fldx.d f3, a3, t4
fldx.d f4, a3, t5
/* Eight coefficients from each of the four filter rows. */
vld vr9, a4, 0
vldx vr10, a4, s7
vldx vr11, a4, s8
vldx vr12, a4, t6
/* Interleave with zero: widen the eight source bytes to halfwords. */
vilvl.b vr1, vr0, vr1
vilvl.b vr2, vr0, vr2
vilvl.b vr3, vr0, vr3
vilvl.b vr4, vr0, vr4
/* Multiply source by coefficients into word sums (vdp2.w.h is not defined
 * in this file; presumably a dot-product macro from loongson_asm.S). */
vdp2.w.h vr17, vr1, vr9
vdp2.w.h vr18, vr2, vr10
vdp2.w.h vr19, vr3, vr11
vdp2.w.h vr21, vr4, vr12
/* Horizontal adds: reduce each register to a single total. */
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
/* Gather the four totals into words 0-3 of vr1 and accumulate. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vadd.w vr22, vr22, vr1
/* Advance by eight taps: 8 source bytes, 16 coefficient bytes. */
addi.w s1, s1, 8
addi.d a3, a3, 8
addi.d a4, a4, 16
/* Repeat while s1 < filterSize - 7; then handle any remaining taps. */
blt s1, s0, .LOOP_DST
blt s1, a6, .DSTA
b .END_FILTERA
/* Scalar tap tail, output 0: t2 = filterPos[0], t3 = tap offset relative to
 * the advanced a3/a4, s6 counts taps from s1 up to filterSize. */
.DSTA:
ld.w t2, a5, 0
li.w t3, 0
move s6, s1
.FILTERA:
/* s2 += src byte * coefficient at halfword index filterSize * t1 + t3 */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s2, s2, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERA
/* Output 1: filterPos[1], filter row t1 = 1, sum in s3. */
ld.w t2, a5, 4
li.w t3, 0
move s6, s1
addi.w t1, t1, 1
.FILTERB:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s3, s3, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERB
/* Output 2: filterPos[2], filter row t1 = 2, sum in s4. */
ld.w t2, a5, 8
addi.w t1, t1, 1
li.w t3, 0
move s6, s1
.FILTERC:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s4, s4, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERC
/* Output 3: filterPos[3], filter row t1 = 3, sum in s5. */
ld.w t2, a5, 12
addi.w t1, t1, 1
move s6, s1
li.w t3, 0
.FILTERD:
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s5, s5, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .FILTERD
.END_FILTERA:
/* Combine vector sums (vr22 words 0-3) with the scalar tail sums. */
vpickve2gr.w t1, vr22, 0
vpickve2gr.w t2, vr22, 1
vpickve2gr.w t3, vr22, 2
vpickve2gr.w t4, vr22, 3
add.w s2, s2, t1
add.w s3, s3, t2
add.w s4, s4, t3
add.w s5, s5, t4
/* Scale down by 3 bits. */
srai.w s2, s2, 3
srai.w s3, s3, 3
srai.w s4, s4, 3
srai.w s5, s5, 3
/* Branch-free min(value, t0): tN = (value < t0); keep value where tN != 0,
 * substitute t0 where tN == 0, then merge with or. */
slt t1, s2, t0
slt t2, s3, t0
slt t3, s4, t0
slt t4, s5, t0
maskeqz s2, s2, t1
maskeqz s3, s3, t2
maskeqz s4, s4, t3
maskeqz s5, s5, t4
masknez t1, t0, t1
masknez t2, t0, t2
masknez t3, t0, t3
masknez t4, t0, t4
or s2, s2, t1
or s3, s3, t2
or s4, s4, t3
or s5, s5, t4
/* Store four 32-bit outputs and step dst by 16 bytes. */
st.w s2, a1, 0
st.w s3, a1, 4
st.w s4, a1, 8
st.w s5, a1, 12
addi.d a1, a1, 16
/* Undo the vector loop's advance of src (s1 bytes). */
sub.d a3, a3, s1
/* Next four filterPos entries. */
addi.d a5, a5, 16
/* filter: add four rows (filterSize * 8 bytes) and undo the vector loop's
 * advance (2 * s1 bytes). */
slli.d t3, a6, 3
add.d a4, a4, t3
sub.d a4, a4, s1
sub.d a4, a4, s1
/* Four outputs done; repeat while at least four remain (t7 = 4). */
addi.d a2, a2, -4
bge a2, t7, .LOOP
blt zero, a2, .RESA
b .END
/* Scalar loop for the remaining a2 (1-3) outputs of the filterSize > 8 path.
 * t1 = output index, t3 = tap index, t8 = sum; t7 is reused as scratch here. */
.RESA:
li.w t1, 0
.DST:
/* t2 = filterPos[t1] */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTER:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTER
/* >> 3, min with t0, store word at byte offset 4 * t1. */
srai.w t8, t8, 3
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DST
b .END
/* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_DST8:
/* filterPos[0..3] and eight source bytes for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
fldx.d f1, a3, t1
fldx.d f2, a3, t2
fldx.d f3, a3, t3
fldx.d f4, a3, t4
/* filterPos[4..7] and eight source bytes for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
fldx.d f5, a3, t1
fldx.d f6, a3, t2
fldx.d f7, a3, t3
fldx.d f8, a3, t4
/* Eight filter rows of eight coefficients (16 bytes each). */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
vld vr13, a4, 64
vld vr14, a4, 80
vld vr15, a4, 96
vld vr16, a4, 112
/* Widen source bytes to halfwords (interleave with zero). */
vilvl.b vr1, vr0, vr1
vilvl.b vr2, vr0, vr2
vilvl.b vr3, vr0, vr3
vilvl.b vr4, vr0, vr4
vilvl.b vr5, vr0, vr5
vilvl.b vr6, vr0, vr6
vilvl.b vr7, vr0, vr7
vilvl.b vr8, vr0, vr8
/* Products: outputs 0-3 into vr17, vr18, vr19, vr21; outputs 4-7 into
 * vr1-vr4 (their source values are no longer needed). */
vdp2.w.h vr17, vr1, vr9
vdp2.w.h vr18, vr2, vr10
vdp2.w.h vr19, vr3, vr11
vdp2.w.h vr21, vr4, vr12
vdp2.w.h vr1, vr5, vr13
vdp2.w.h vr2, vr6, vr14
vdp2.w.h vr3, vr7, vr15
vdp2.w.h vr4, vr8, vr16
/* Horizontal adds; afterwards vr1-vr4 hold outputs 0-3 and vr5-vr8 hold
 * outputs 4-7. */
vhaddw.d.w vr5, vr1, vr1
vhaddw.d.w vr6, vr2, vr2
vhaddw.d.w vr7, vr3, vr3
vhaddw.d.w vr8, vr4, vr4
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
vhaddw.q.d vr5, vr5, vr5
vhaddw.q.d vr6, vr6, vr6
vhaddw.q.d vr7, vr7, vr7
vhaddw.q.d vr8, vr8, vr8
/* Gather totals: vr1 = outputs 0-3, vr5 = outputs 4-7 (one word each). */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr1, vr3, vr1
vilvl.d vr5, vr7, vr5
/* >> 3, min with vr20, store eight 32-bit outputs. */
vsrai.w vr1, vr1, 3
vsrai.w vr5, vr5, 3
vmin.w vr1, vr1, vr20
vmin.w vr5, vr5, vr20
vst vr1, a1, 0
vst vr5, a1, 16
/* Advance dst (32 bytes), filterPos (8 entries), filter (8 rows). */
addi.d a1, a1, 32
addi.d a5, a5, 32
addi.d a4, a4, 128
addi.d a2, a2, -8
bge a2, t8, .LOOP_DST8
blt zero, a2, .REST8
b .END
/* Scalar loop for the remaining a2 outputs of the filterSize == 8 path.
 * t8 (previously the constant 8) and t7 are reused as scratch. */
.REST8:
li.w t1, 0
.DST8:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTER8:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTER8
srai.w t8, t8, 3
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DST8
b .END
/* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_DST4:
/* filterPos[0..3] and four source bytes for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
fldx.s f1, a3, t1
fldx.s f2, a3, t2
fldx.s f3, a3, t3
fldx.s f4, a3, t4
/* filterPos[4..7] and four source bytes for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
fldx.s f5, a3, t1
fldx.s f6, a3, t2
fldx.s f7, a3, t3
fldx.s f8, a3, t4
/* Each 16-byte load covers two filter rows of four coefficients. */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
/* Pair up the 4-byte source groups of two neighbouring outputs so that the
 * low eight bytes match one two-row coefficient vector. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
/* Widen those eight bytes to halfwords. */
vilvl.b vr1, vr0, vr1
vilvl.b vr3, vr0, vr3
vilvl.b vr5, vr0, vr5
vilvl.b vr7, vr0, vr7
/* Products and per-output totals (two outputs per register). */
vdp2.w.h vr13, vr1, vr9
vdp2.w.h vr14, vr3, vr10
vdp2.w.h vr15, vr5, vr11
vdp2.w.h vr16, vr7, vr12
vhaddw.d.w vr13, vr13, vr13
vhaddw.d.w vr14, vr14, vr14
vhaddw.d.w vr15, vr15, vr15
vhaddw.d.w vr16, vr16, vr16
/* Keep the even words: vr13 = outputs 0-3, vr15 = outputs 4-7. */
vpickev.w vr13, vr14, vr13
vpickev.w vr15, vr16, vr15
/* >> 3, min with vr20, store eight 32-bit outputs. */
vsrai.w vr13, vr13, 3
vsrai.w vr15, vr15, 3
vmin.w vr13, vr13, vr20
vmin.w vr15, vr15, vr20
vst vr13, a1, 0
vst vr15, a1, 16
/* Advance dst (32 bytes), filterPos (8 entries), filter (8 rows of 8 bytes). */
addi.d a1, a1, 32
addi.d a5, a5, 32
addi.d a4, a4, 64
addi.d a2, a2, -8
bge a2, t8, .LOOP_DST4
blt zero, a2, .REST4
b .END
/* Scalar loop for the remaining a2 outputs of the filterSize == 4 path. */
.REST4:
li.w t1, 0
.DST4:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTER4:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTER4
srai.w t8, t8, 3
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .DST4
b .END
/* ---- any other filterSize: fully scalar, one output per iteration ---- */
.END_DST4:
li.w t1, 0
.LOOP_DST1:
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum. */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.FILTER1:
/* t8 += src[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
ldx.bu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .FILTER1
/* >> 3, min with t0, store word at byte offset 4 * t1. */
srai.w t8, t8, 3
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .LOOP_DST1
b .END
/* Common exit: restore s0-s8 and release the frame. No explicit return
 * instruction appears here; presumably endfunc (not defined in this file)
 * emits it - confirm in loongson_asm.S. */
.END:
ld.d s0, sp, 0
ld.d s1, sp, 8
ld.d s2, sp, 16
ld.d s3, sp, 24
ld.d s4, sp, 32
ld.d s5, sp, 40
ld.d s6, sp, 48
ld.d s7, sp, 56
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
/* void ff_hscale_16_to_15_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Horizontal FIR scaler, 16-bit input to 15-bit output. Although src is
 * declared as const uint8_t *, it is read as unsigned 16-bit samples
 * (element index * 2 is used as the byte offset). Output i is
 *     sum over j < filterSize of sample[filterPos[i] + j] * filter[filterSize * i + j]
 * shifted right arithmetically by sh, limited to at most 32767 and stored as
 * a halfword at dst[i].
 *
 * Arguments as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize, a7 = sh. a0 (c) is never read.
 *
 * Four code paths are selected by filterSize:
 *   == 4  .LOOP_HS15_DST4  eight outputs per iteration
 *   == 8  .LOOP_HS15_DST8  eight outputs per iteration
 *   >  8  .LOOP_HS15       four outputs per iteration, taps in groups of eight
 *   else  .END_HS15_DST4   one output per iteration, scalar
 * Each vector path is followed by a scalar loop for the leftover outputs.
 *
 * NOTE(review): every loop tests its condition at the bottom, so the chosen
 * path always produces one full batch (8, 4 or 1 outputs) even if dstW is
 * smaller. Presumably callers guarantee suitable sizes/padding - confirm.
 */
function ff_hscale_16_to_15_sub_lsx
/* Save s0-s8 (scratch for the filterSize > 8 path); restored at .HS15_END.
 * NOTE(review): the 72-byte frame is not a multiple of 16 - confirm this is
 * acceptable for the target ABI. */
addi.d sp, sp, -72
st.d s0, sp, 0
st.d s1, sp, 8
st.d s2, sp, 16
st.d s3, sp, 24
st.d s4, sp, 32
st.d s5, sp, 40
st.d s6, sp, 48
st.d s7, sp, 56
st.d s8, sp, 64
/* t0 = 32767 (upper clamp), t8 = 8, t7 = 4, vr20 = clamp value in every
 * word, vr0 = sh in every word (per-lane shift count for vsra.w). */
li.w t0, 32767
li.w t8, 8
li.w t7, 4
vreplgr2vr.w vr20, t0
vreplgr2vr.w vr0, a7
/* Dispatch on filterSize (a6). */
beq a6, t7, .LOOP_HS15_DST4
beq a6, t8, .LOOP_HS15_DST8
blt t8, a6, .LOOP_HS15
b .END_HS15_DST4
/* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP_HS15:
/* t1 = filter row index for the scalar tap tail, s1 = taps consumed by the
 * vector loop, s2-s5 = scalar tail sums for outputs 0-3, vr22 = vector sums
 * (one word per output). */
li.w t1, 0
li.w s1, 0
li.w s2, 0
li.w s3, 0
li.w s4, 0
li.w s5, 0
vldi vr22, 0
/* s0 = filterSize - 7 (vector loop bound); s7, s8, t6 = byte offsets of
 * filter rows 1, 2 and 3 (filterSize * 2, * 4 and * 6). */
addi.w s0, a6, -7
slli.w s7, a6, 1
slli.w s8, a6, 2
add.w t6, s7, s8
.LOOP_HS15_DST:
/* t2-t5 = filterPos[0..3], doubled to form byte offsets into src. */
ld.w t2, a5, 0
ld.w t3, a5, 4
ld.w t4, a5, 8
ld.w t5, a5, 12
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
slli.w t5, t5, 1
/* Eight 16-bit samples (16 bytes) per output. */
vldx vr1, a3, t2
vldx vr2, a3, t3
vldx vr3, a3, t4
vldx vr4, a3, t5
/* Eight coefficients from each of the four filter rows. */
vld vr9, a4, 0
vldx vr10, a4, s7
vldx vr11, a4, s8
vldx vr12, a4, t6
/* Unsigned sample * signed coefficient: even halfword lanes first ... */
vmulwev.w.hu.h vr17, vr1, vr9
vmulwev.w.hu.h vr18, vr2, vr10
vmulwev.w.hu.h vr19, vr3, vr11
vmulwev.w.hu.h vr21, vr4, vr12
/* ... then odd lanes accumulated on top. */
vmaddwod.w.hu.h vr17, vr1, vr9
vmaddwod.w.hu.h vr18, vr2, vr10
vmaddwod.w.hu.h vr19, vr3, vr11
vmaddwod.w.hu.h vr21, vr4, vr12
/* Horizontal adds: reduce each register to a single total. */
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
/* Gather the four totals into words 0-3 of vr1 and accumulate. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vadd.w vr22, vr22, vr1
/* Advance by eight taps: 16 source bytes, 16 coefficient bytes. */
addi.w s1, s1, 8
addi.d a3, a3, 16
addi.d a4, a4, 16
/* Repeat while s1 < filterSize - 7; then handle any remaining taps. */
blt s1, s0, .LOOP_HS15_DST
blt s1, a6, .HS15_DSTA
b .END_HS15_FILTERA
/* Scalar tap tail, output 0: t2 = filterPos[0], t3 = tap offset relative to
 * the advanced a3/a4, s6 counts taps from s1 up to filterSize. */
.HS15_DSTA:
ld.w t2, a5, 0
li.w t3, 0
move s6, s1
.HS15_FILTERA:
/* s2 += 16-bit sample * coefficient at halfword index filterSize * t1 + t3 */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s2, s2, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS15_FILTERA
/* Output 1: filterPos[1], filter row t1 = 1, sum in s3. */
ld.w t2, a5, 4
li.w t3, 0
move s6, s1
addi.w t1, t1, 1
.HS15_FILTERB:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s3, s3, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS15_FILTERB
/* Output 2: filterPos[2], filter row t1 = 2, sum in s4. */
ld.w t2, a5, 8
addi.w t1, t1, 1
li.w t3, 0
move s6, s1
.HS15_FILTERC:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s4, s4, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS15_FILTERC
/* Output 3: filterPos[3], filter row t1 = 3, sum in s5. */
ld.w t2, a5, 12
addi.w t1, t1, 1
move s6, s1
li.w t3, 0
.HS15_FILTERD:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s5, s5, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
/* Combine vector sums (vr22 words 0-3) with the scalar tail sums. */
vpickve2gr.w t1, vr22, 0
vpickve2gr.w t2, vr22, 1
vpickve2gr.w t3, vr22, 2
vpickve2gr.w t4, vr22, 3
add.w s2, s2, t1
add.w s3, s3, t2
add.w s4, s4, t3
add.w s5, s5, t4
/* Scale down by sh bits (a7). */
sra.w s2, s2, a7
sra.w s3, s3, a7
sra.w s4, s4, a7
sra.w s5, s5, a7
/* Branch-free min(value, t0): tN = (value < t0); keep value where tN != 0,
 * substitute t0 where tN == 0, then merge with or. */
slt t1, s2, t0
slt t2, s3, t0
slt t3, s4, t0
slt t4, s5, t0
maskeqz s2, s2, t1
maskeqz s3, s3, t2
maskeqz s4, s4, t3
maskeqz s5, s5, t4
masknez t1, t0, t1
masknez t2, t0, t2
masknez t3, t0, t3
masknez t4, t0, t4
or s2, s2, t1
or s3, s3, t2
or s4, s4, t3
or s5, s5, t4
/* Store four halfword outputs and step dst by 8 bytes. */
st.h s2, a1, 0
st.h s3, a1, 2
st.h s4, a1, 4
st.h s5, a1, 6
addi.d a1, a1, 8
/* Undo the vector loop's advance of src (2 * s1 bytes). */
sub.d a3, a3, s1
sub.d a3, a3, s1
/* Next four filterPos entries. */
addi.d a5, a5, 16
/* filter: add four rows (filterSize * 8 bytes) and undo the vector loop's
 * advance (2 * s1 bytes). */
slli.d t3, a6, 3
add.d a4, a4, t3
sub.d a4, a4, s1
sub.d a4, a4, s1
/* Four outputs done; repeat while at least four remain (t7 = 4). */
addi.d a2, a2, -4
bge a2, t7, .LOOP_HS15
blt zero, a2, .HS15_RESA
b .HS15_END
/* Scalar loop for the remaining a2 (1-3) outputs of the filterSize > 8 path.
 * t1 = output index, t3 = tap index, t8 = sum; t7 is reused as scratch here. */
.HS15_RESA:
li.w t1, 0
.HS15_DST:
/* t2 = filterPos[t1] */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS15_FILTER:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS15_FILTER
/* >> sh, min with t0, store halfword at dst[t1]. */
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS15_DST
b .HS15_END
/* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_HS15_DST8:
/* filterPos[0..3] as byte offsets; eight samples for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
vldx vr1, a3, t1
vldx vr2, a3, t2
vldx vr3, a3, t3
vldx vr4, a3, t4
/* filterPos[4..7] as byte offsets; eight samples for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
vldx vr5, a3, t1
vldx vr6, a3, t2
vldx vr7, a3, t3
vldx vr8, a3, t4
/* Eight filter rows of eight coefficients (16 bytes each). */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
vld vr13, a4, 64
vld vr14, a4, 80
vld vr15, a4, 96
vld vr16, a4, 112
/* Outputs 0-3: even-lane products, then odd lanes accumulated, into
 * vr17, vr18, vr19, vr21. */
vmulwev.w.hu.h vr17, vr1, vr9
vmulwev.w.hu.h vr18, vr2, vr10
vmulwev.w.hu.h vr19, vr3, vr11
vmulwev.w.hu.h vr21, vr4, vr12
vmaddwod.w.hu.h vr17, vr1, vr9
vmaddwod.w.hu.h vr18, vr2, vr10
vmaddwod.w.hu.h vr19, vr3, vr11
vmaddwod.w.hu.h vr21, vr4, vr12
/* Outputs 4-7: same, into vr1-vr4 (their samples are no longer needed). */
vmulwev.w.hu.h vr1, vr5, vr13
vmulwev.w.hu.h vr2, vr6, vr14
vmulwev.w.hu.h vr3, vr7, vr15
vmulwev.w.hu.h vr4, vr8, vr16
vmaddwod.w.hu.h vr1, vr5, vr13
vmaddwod.w.hu.h vr2, vr6, vr14
vmaddwod.w.hu.h vr3, vr7, vr15
vmaddwod.w.hu.h vr4, vr8, vr16
/* Horizontal adds; afterwards vr1-vr4 hold outputs 0-3 and vr5-vr8 hold
 * outputs 4-7. */
vhaddw.d.w vr5, vr1, vr1
vhaddw.d.w vr6, vr2, vr2
vhaddw.d.w vr7, vr3, vr3
vhaddw.d.w vr8, vr4, vr4
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
vhaddw.q.d vr5, vr5, vr5
vhaddw.q.d vr6, vr6, vr6
vhaddw.q.d vr7, vr7, vr7
vhaddw.q.d vr8, vr8, vr8
/* Gather totals: vr1 = outputs 0-3, vr5 = outputs 4-7 (one word each). */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr1, vr3, vr1
vilvl.d vr5, vr7, vr5
/* >> sh (vr0), min with vr20, narrow to halfwords, store eight outputs. */
vsra.w vr1, vr1, vr0
vsra.w vr5, vr5, vr0
vmin.w vr1, vr1, vr20
vmin.w vr5, vr5, vr20
vpickev.h vr1, vr5, vr1
vst vr1, a1, 0
/* Advance dst (16 bytes), filterPos (8 entries), filter (8 rows). */
addi.d a1, a1, 16
addi.d a5, a5, 32
addi.d a4, a4, 128
addi.d a2, a2, -8
bge a2, t8, .LOOP_HS15_DST8
blt zero, a2, .HS15_REST8
b .HS15_END
/* Scalar loop for the remaining a2 outputs of the filterSize == 8 path.
 * t8 (previously the constant 8) and t7 are reused as scratch. */
.HS15_REST8:
li.w t1, 0
.HS15_DST8:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS15_FILTER8:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS15_FILTER8
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS15_DST8
b .HS15_END
/* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_HS15_DST4:
/* filterPos[0..3] as byte offsets; four samples (8 bytes) per output.
 * f1-f4 are read back below as vr1-vr4 (the code relies on fN being the low
 * 64 bits of vrN). */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
fldx.d f1, a3, t1
fldx.d f2, a3, t2
fldx.d f3, a3, t3
fldx.d f4, a3, t4
/* filterPos[4..7] as byte offsets; four samples for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
fldx.d f5, a3, t1
fldx.d f6, a3, t2
fldx.d f7, a3, t3
fldx.d f8, a3, t4
/* Each 16-byte load covers two filter rows of four coefficients. */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
/* Join the 8-byte sample groups of two neighbouring outputs so that each
 * register matches one two-row coefficient vector. */
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
/* Even-lane products, then odd lanes accumulated. */
vmulwev.w.hu.h vr13, vr1, vr9
vmulwev.w.hu.h vr14, vr3, vr10
vmulwev.w.hu.h vr15, vr5, vr11
vmulwev.w.hu.h vr16, vr7, vr12
vmaddwod.w.hu.h vr13, vr1, vr9
vmaddwod.w.hu.h vr14, vr3, vr10
vmaddwod.w.hu.h vr15, vr5, vr11
vmaddwod.w.hu.h vr16, vr7, vr12
/* Per-output totals (two outputs per register). */
vhaddw.d.w vr13, vr13, vr13
vhaddw.d.w vr14, vr14, vr14
vhaddw.d.w vr15, vr15, vr15
vhaddw.d.w vr16, vr16, vr16
/* Keep the even words: vr13 = outputs 0-3, vr15 = outputs 4-7. */
vpickev.w vr13, vr14, vr13
vpickev.w vr15, vr16, vr15
/* >> sh (vr0), min with vr20, narrow to halfwords, store eight outputs. */
vsra.w vr13, vr13, vr0
vsra.w vr15, vr15, vr0
vmin.w vr13, vr13, vr20
vmin.w vr15, vr15, vr20
vpickev.h vr13, vr15, vr13
vst vr13, a1, 0
/* Advance dst (16 bytes), filterPos (8 entries), filter (8 rows of 8 bytes). */
addi.d a1, a1, 16
addi.d a5, a5, 32
addi.d a4, a4, 64
addi.d a2, a2, -8
bge a2, t8, .LOOP_HS15_DST4
blt zero, a2, .HS15_REST4
b .HS15_END
/* Scalar loop for the remaining a2 outputs of the filterSize == 4 path. */
.HS15_REST4:
li.w t1, 0
.HS15_DST4:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS15_FILTER4:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS15_FILTER4
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS15_DST4
b .HS15_END
/* ---- any other filterSize: fully scalar, one output per iteration ---- */
.END_HS15_DST4:
li.w t1, 0
.LOOP_HS15_DST1:
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum. */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS15_FILTER1:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS15_FILTER1
/* >> sh, min with t0, store halfword at dst[t1]. */
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 1
stx.h t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .LOOP_HS15_DST1
b .HS15_END
/* Common exit: restore s0-s8 and release the frame. No explicit return
 * instruction appears here; presumably endfunc (not defined in this file)
 * emits it - confirm in loongson_asm.S. */
.HS15_END:
ld.d s0, sp, 0
ld.d s1, sp, 8
ld.d s2, sp, 16
ld.d s3, sp, 24
ld.d s4, sp, 32
ld.d s5, sp, 40
ld.d s6, sp, 48
ld.d s7, sp, 56
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
/* void ff_hscale_16_to_19_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Horizontal FIR scaler, 16-bit input to 19-bit output. Although src is
 * declared as const uint8_t *, it is read as unsigned 16-bit samples
 * (element index * 2 is used as the byte offset). Output i is
 *     sum over j < filterSize of sample[filterPos[i] + j] * filter[filterSize * i + j]
 * shifted right arithmetically by sh and limited to at most 524287.
 * Although the prototype declares dst as int16_t *, every result is stored
 * as a 32-bit word at byte offset 4 * i from dst.
 *
 * Arguments as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize, a7 = sh. a0 (c) is never read.
 *
 * Four code paths are selected by filterSize:
 *   == 4  .LOOP_HS19_DST4  eight outputs per iteration
 *   == 8  .LOOP_HS19_DST8  eight outputs per iteration
 *   >  8  .LOOP_HS19       four outputs per iteration, taps in groups of eight
 *   else  .END_HS19_DST4   one output per iteration, scalar
 * Each vector path is followed by a scalar loop for the leftover outputs.
 *
 * NOTE(review): every loop tests its condition at the bottom, so the chosen
 * path always produces one full batch (8, 4 or 1 outputs) even if dstW is
 * smaller. Presumably callers guarantee suitable sizes/padding - confirm.
 */
function ff_hscale_16_to_19_sub_lsx
/* Save s0-s8 (scratch for the filterSize > 8 path); restored at .HS19_END.
 * NOTE(review): the 72-byte frame is not a multiple of 16 - confirm this is
 * acceptable for the target ABI. */
addi.d sp, sp, -72
st.d s0, sp, 0
st.d s1, sp, 8
st.d s2, sp, 16
st.d s3, sp, 24
st.d s4, sp, 32
st.d s5, sp, 40
st.d s6, sp, 48
st.d s7, sp, 56
st.d s8, sp, 64
/* t0 = 524287 (upper clamp), t8 = 8, t7 = 4, vr20 = clamp value in every
 * word, vr0 = sh in every word (per-lane shift count for vsra.w). */
li.w t0, 524287
li.w t8, 8
li.w t7, 4
vreplgr2vr.w vr20, t0
vreplgr2vr.w vr0, a7
/* Dispatch on filterSize (a6). */
beq a6, t7, .LOOP_HS19_DST4
beq a6, t8, .LOOP_HS19_DST8
blt t8, a6, .LOOP_HS19
b .END_HS19_DST4
/* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP_HS19:
/* t1 = filter row index for the scalar tap tail, s1 = taps consumed by the
 * vector loop, s2-s5 = scalar tail sums for outputs 0-3, vr22 = vector sums
 * (one word per output). */
li.w t1, 0
li.w s1, 0
li.w s2, 0
li.w s3, 0
li.w s4, 0
li.w s5, 0
vldi vr22, 0
/* s0 = filterSize - 7 (vector loop bound); s7, s8, t6 = byte offsets of
 * filter rows 1, 2 and 3 (filterSize * 2, * 4 and * 6). */
addi.w s0, a6, -7
slli.w s7, a6, 1
slli.w s8, a6, 2
add.w t6, s7, s8
.LOOP_HS19_DST:
/* t2-t5 = filterPos[0..3], doubled to form byte offsets into src. */
ld.w t2, a5, 0
ld.w t3, a5, 4
ld.w t4, a5, 8
ld.w t5, a5, 12
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
slli.w t5, t5, 1
/* Eight 16-bit samples (16 bytes) per output. */
vldx vr1, a3, t2
vldx vr2, a3, t3
vldx vr3, a3, t4
vldx vr4, a3, t5
/* Eight coefficients from each of the four filter rows. */
vld vr9, a4, 0
vldx vr10, a4, s7
vldx vr11, a4, s8
vldx vr12, a4, t6
/* Unsigned sample * signed coefficient: even halfword lanes first ... */
vmulwev.w.hu.h vr17, vr1, vr9
vmulwev.w.hu.h vr18, vr2, vr10
vmulwev.w.hu.h vr19, vr3, vr11
vmulwev.w.hu.h vr21, vr4, vr12
/* ... then odd lanes accumulated on top. */
vmaddwod.w.hu.h vr17, vr1, vr9
vmaddwod.w.hu.h vr18, vr2, vr10
vmaddwod.w.hu.h vr19, vr3, vr11
vmaddwod.w.hu.h vr21, vr4, vr12
/* Horizontal adds: reduce each register to a single total. */
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
/* Gather the four totals into words 0-3 of vr1 and accumulate. */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.d vr1, vr3, vr1
vadd.w vr22, vr22, vr1
/* Advance by eight taps: 16 source bytes, 16 coefficient bytes. */
addi.w s1, s1, 8
addi.d a3, a3, 16
addi.d a4, a4, 16
/* Repeat while s1 < filterSize - 7; then handle any remaining taps. */
blt s1, s0, .LOOP_HS19_DST
blt s1, a6, .HS19_DSTA
b .END_HS19_FILTERA
/* Scalar tap tail, output 0: t2 = filterPos[0], t3 = tap offset relative to
 * the advanced a3/a4, s6 counts taps from s1 up to filterSize. */
.HS19_DSTA:
ld.w t2, a5, 0
li.w t3, 0
move s6, s1
.HS19_FILTERA:
/* s2 += 16-bit sample * coefficient at halfword index filterSize * t1 + t3 */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s2, s2, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS19_FILTERA
/* Output 1: filterPos[1], filter row t1 = 1, sum in s3. */
ld.w t2, a5, 4
li.w t3, 0
move s6, s1
addi.w t1, t1, 1
.HS19_FILTERB:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s3, s3, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS19_FILTERB
/* Output 2: filterPos[2], filter row t1 = 2, sum in s4. */
ld.w t2, a5, 8
addi.w t1, t1, 1
li.w t3, 0
move s6, s1
.HS19_FILTERC:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s4, s4, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS19_FILTERC
/* Output 3: filterPos[3], filter row t1 = 3, sum in s5. */
ld.w t2, a5, 12
addi.w t1, t1, 1
move s6, s1
li.w t3, 0
.HS19_FILTERD:
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t6, t6, 1
ldx.h t6, a4, t6
mul.w t6, t5, t6
add.w s5, s5, t6
addi.w t3, t3, 1
addi.w s6, s6, 1
blt s6, a6, .HS19_FILTERD
.END_HS19_FILTERA:
/* Combine vector sums (vr22 words 0-3) with the scalar tail sums. */
vpickve2gr.w t1, vr22, 0
vpickve2gr.w t2, vr22, 1
vpickve2gr.w t3, vr22, 2
vpickve2gr.w t4, vr22, 3
add.w s2, s2, t1
add.w s3, s3, t2
add.w s4, s4, t3
add.w s5, s5, t4
/* Scale down by sh bits (a7). */
sra.w s2, s2, a7
sra.w s3, s3, a7
sra.w s4, s4, a7
sra.w s5, s5, a7
/* Branch-free min(value, t0): tN = (value < t0); keep value where tN != 0,
 * substitute t0 where tN == 0, then merge with or. */
slt t1, s2, t0
slt t2, s3, t0
slt t3, s4, t0
slt t4, s5, t0
maskeqz s2, s2, t1
maskeqz s3, s3, t2
maskeqz s4, s4, t3
maskeqz s5, s5, t4
masknez t1, t0, t1
masknez t2, t0, t2
masknez t3, t0, t3
masknez t4, t0, t4
or s2, s2, t1
or s3, s3, t2
or s4, s4, t3
or s5, s5, t4
/* Store four 32-bit outputs and step dst by 16 bytes. */
st.w s2, a1, 0
st.w s3, a1, 4
st.w s4, a1, 8
st.w s5, a1, 12
addi.d a1, a1, 16
/* Undo the vector loop's advance of src (2 * s1 bytes). */
sub.d a3, a3, s1
sub.d a3, a3, s1
/* Next four filterPos entries. */
addi.d a5, a5, 16
/* filter: add four rows (filterSize * 8 bytes) and undo the vector loop's
 * advance (2 * s1 bytes). */
slli.d t3, a6, 3
add.d a4, a4, t3
sub.d a4, a4, s1
sub.d a4, a4, s1
/* Four outputs done; repeat while at least four remain (t7 = 4). */
addi.d a2, a2, -4
bge a2, t7, .LOOP_HS19
blt zero, a2, .HS19_RESA
b .HS19_END
/* Scalar loop for the remaining a2 (1-3) outputs of the filterSize > 8 path.
 * t1 = output index, t3 = tap index, t8 = sum; t7 is reused as scratch here. */
.HS19_RESA:
li.w t1, 0
.HS19_DST:
/* t2 = filterPos[t1] */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS19_FILTER:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS19_FILTER
/* >> sh, min with t0, store word at byte offset 4 * t1. */
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS19_DST
b .HS19_END
/* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_HS19_DST8:
/* filterPos[0..3] as byte offsets; eight samples for each of outputs 0-3. */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
vldx vr1, a3, t1
vldx vr2, a3, t2
vldx vr3, a3, t3
vldx vr4, a3, t4
/* filterPos[4..7] as byte offsets; eight samples for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
vldx vr5, a3, t1
vldx vr6, a3, t2
vldx vr7, a3, t3
vldx vr8, a3, t4
/* Eight filter rows of eight coefficients (16 bytes each). */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
vld vr13, a4, 64
vld vr14, a4, 80
vld vr15, a4, 96
vld vr16, a4, 112
/* Outputs 0-3: even-lane products, then odd lanes accumulated, into
 * vr17, vr18, vr19, vr21. */
vmulwev.w.hu.h vr17, vr1, vr9
vmulwev.w.hu.h vr18, vr2, vr10
vmulwev.w.hu.h vr19, vr3, vr11
vmulwev.w.hu.h vr21, vr4, vr12
vmaddwod.w.hu.h vr17, vr1, vr9
vmaddwod.w.hu.h vr18, vr2, vr10
vmaddwod.w.hu.h vr19, vr3, vr11
vmaddwod.w.hu.h vr21, vr4, vr12
/* Outputs 4-7: same, into vr1-vr4 (their samples are no longer needed). */
vmulwev.w.hu.h vr1, vr5, vr13
vmulwev.w.hu.h vr2, vr6, vr14
vmulwev.w.hu.h vr3, vr7, vr15
vmulwev.w.hu.h vr4, vr8, vr16
vmaddwod.w.hu.h vr1, vr5, vr13
vmaddwod.w.hu.h vr2, vr6, vr14
vmaddwod.w.hu.h vr3, vr7, vr15
vmaddwod.w.hu.h vr4, vr8, vr16
/* Horizontal adds; afterwards vr1-vr4 hold outputs 0-3 and vr5-vr8 hold
 * outputs 4-7. */
vhaddw.d.w vr5, vr1, vr1
vhaddw.d.w vr6, vr2, vr2
vhaddw.d.w vr7, vr3, vr3
vhaddw.d.w vr8, vr4, vr4
vhaddw.d.w vr1, vr17, vr17
vhaddw.d.w vr2, vr18, vr18
vhaddw.d.w vr3, vr19, vr19
vhaddw.d.w vr4, vr21, vr21
vhaddw.q.d vr1, vr1, vr1
vhaddw.q.d vr2, vr2, vr2
vhaddw.q.d vr3, vr3, vr3
vhaddw.q.d vr4, vr4, vr4
vhaddw.q.d vr5, vr5, vr5
vhaddw.q.d vr6, vr6, vr6
vhaddw.q.d vr7, vr7, vr7
vhaddw.q.d vr8, vr8, vr8
/* Gather totals: vr1 = outputs 0-3, vr5 = outputs 4-7 (one word each). */
vilvl.w vr1, vr2, vr1
vilvl.w vr3, vr4, vr3
vilvl.w vr5, vr6, vr5
vilvl.w vr7, vr8, vr7
vilvl.d vr1, vr3, vr1
vilvl.d vr5, vr7, vr5
/* >> sh (vr0), min with vr20, store eight 32-bit outputs. */
vsra.w vr1, vr1, vr0
vsra.w vr5, vr5, vr0
vmin.w vr1, vr1, vr20
vmin.w vr5, vr5, vr20
vst vr1, a1, 0
vst vr5, a1, 16
/* Advance dst (32 bytes), filterPos (8 entries), filter (8 rows). */
addi.d a1, a1, 32
addi.d a5, a5, 32
addi.d a4, a4, 128
addi.d a2, a2, -8
bge a2, t8, .LOOP_HS19_DST8
blt zero, a2, .HS19_REST8
b .HS19_END
/* Scalar loop for the remaining a2 outputs of the filterSize == 8 path.
 * t8 (previously the constant 8) and t7 are reused as scratch. */
.HS19_REST8:
li.w t1, 0
.HS19_DST8:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS19_FILTER8:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS19_FILTER8
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS19_DST8
b .HS19_END
/* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_HS19_DST4:
/* filterPos[0..3] as byte offsets; four samples (8 bytes) per output.
 * f1-f4 are read back below as vr1-vr4 (the code relies on fN being the low
 * 64 bits of vrN). */
ld.w t1, a5, 0
ld.w t2, a5, 4
ld.w t3, a5, 8
ld.w t4, a5, 12
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
fldx.d f1, a3, t1
fldx.d f2, a3, t2
fldx.d f3, a3, t3
fldx.d f4, a3, t4
/* filterPos[4..7] as byte offsets; four samples for each of outputs 4-7. */
ld.w t1, a5, 16
ld.w t2, a5, 20
ld.w t3, a5, 24
ld.w t4, a5, 28
slli.w t1, t1, 1
slli.w t2, t2, 1
slli.w t3, t3, 1
slli.w t4, t4, 1
fldx.d f5, a3, t1
fldx.d f6, a3, t2
fldx.d f7, a3, t3
fldx.d f8, a3, t4
/* Each 16-byte load covers two filter rows of four coefficients. */
vld vr9, a4, 0
vld vr10, a4, 16
vld vr11, a4, 32
vld vr12, a4, 48
/* Join the 8-byte sample groups of two neighbouring outputs so that each
 * register matches one two-row coefficient vector. */
vilvl.d vr1, vr2, vr1
vilvl.d vr3, vr4, vr3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
/* Even-lane products, then odd lanes accumulated. */
vmulwev.w.hu.h vr13, vr1, vr9
vmulwev.w.hu.h vr14, vr3, vr10
vmulwev.w.hu.h vr15, vr5, vr11
vmulwev.w.hu.h vr16, vr7, vr12
vmaddwod.w.hu.h vr13, vr1, vr9
vmaddwod.w.hu.h vr14, vr3, vr10
vmaddwod.w.hu.h vr15, vr5, vr11
vmaddwod.w.hu.h vr16, vr7, vr12
/* Per-output totals (two outputs per register). */
vhaddw.d.w vr13, vr13, vr13
vhaddw.d.w vr14, vr14, vr14
vhaddw.d.w vr15, vr15, vr15
vhaddw.d.w vr16, vr16, vr16
/* Keep the even words: vr13 = outputs 0-3, vr15 = outputs 4-7. */
vpickev.w vr13, vr14, vr13
vpickev.w vr15, vr16, vr15
/* >> sh (vr0), min with vr20, store eight 32-bit outputs. */
vsra.w vr13, vr13, vr0
vsra.w vr15, vr15, vr0
vmin.w vr13, vr13, vr20
vmin.w vr15, vr15, vr20
vst vr13, a1, 0
vst vr15, a1, 16
/* Advance dst (32 bytes), filterPos (8 entries), filter (8 rows of 8 bytes). */
addi.d a1, a1, 32
addi.d a5, a5, 32
addi.d a4, a4, 64
addi.d a2, a2, -8
bge a2, t8, .LOOP_HS19_DST4
blt zero, a2, .HS19_REST4
b .HS19_END
/* Scalar loop for the remaining a2 outputs of the filterSize == 4 path. */
.HS19_REST4:
li.w t1, 0
.HS19_DST4:
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS19_FILTER4:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS19_FILTER4
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .HS19_DST4
b .HS19_END
/* ---- any other filterSize: fully scalar, one output per iteration ---- */
.END_HS19_DST4:
li.w t1, 0
.LOOP_HS19_DST1:
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum. */
slli.w t2, t1, 2
ldx.w t2, a5, t2
li.w t3, 0
li.w t8, 0
.HS19_FILTER1:
/* t8 += sample[t2 + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
slli.w t4, t4, 1
ldx.hu t5, a3, t4
mul.w t6, a6, t1
add.w t6, t6, t3
slli.w t7, t6, 1
ldx.h t7, a4, t7
mul.w t7, t5, t7
add.w t8, t8, t7
addi.w t3, t3, 1
blt t3, a6, .HS19_FILTER1
/* >> sh, min with t0, store word at byte offset 4 * t1. */
sra.w t8, t8, a7
slt t5, t8, t0
maskeqz t8, t8, t5
masknez t5, t0, t5
or t8, t8, t5
slli.w t4, t1, 2
stx.w t8, a1, t4
addi.w t1, t1, 1
blt t1, a2, .LOOP_HS19_DST1
b .HS19_END
/* Common exit: restore s0-s8 and release the frame. No explicit return
 * instruction appears here; presumably endfunc (not defined in this file)
 * emits it - confirm in loongson_asm.S. */
.HS19_END:
ld.d s0, sp, 0
ld.d s1, sp, 8
ld.d s2, sp, 16
ld.d s3, sp, 24
ld.d s4, sp, 32
ld.d s5, sp, 40
ld.d s6, sp, 48
ld.d s7, sp, 56
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
/*
 * lumRangeFromJpeg_lsx
 *
 * Rescales a run of signed 16-bit samples in place (LSX version):
 *     x = (x * 14071 + 33561947) >> 14
 *
 *   a0 = address of the samples (each one is read, then overwritten)
 *   a1 = number of samples
 *
 * a1 >> 3 vector steps handle 8 samples each; the remaining (a1 & 7)
 * samples are processed one at a time.
 * Scratch registers: t0-t4, vr0-vr3.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dst, int width) - confirm against the declaration.
 */
function lumRangeFromJpeg_lsx
    li.w            t1,     33561947            /* bias added before the shift   */
    li.w            t0,     14071               /* multiplier                    */
    andi            t3,     a1,     7           /* t3 = samples left for scalar  */
    srli.w          t2,     a1,     3           /* t2 = number of 8-sample steps */
    vreplgr2vr.h    vr0,    t0                  /* multiplier in every halfword  */
    beqz            t2,     2f
1:
    /* Vector step: 8 samples. a0 is advanced right after the load, so
     * the store below addresses the same 16 bytes through offset -16. */
    vld             vr1,    a0,     0
    addi.d          a0,     a0,     16
    addi.d          t2,     t2,     -1
    vreplgr2vr.w    vr3,    t1                  /* accumulators start at the bias */
    vreplgr2vr.w    vr2,    t1
    vmaddwod.w.h    vr3,    vr0,    vr1         /* odd  halfwords: bias + x * mul */
    vmaddwev.w.h    vr2,    vr0,    vr1         /* even halfwords: bias + x * mul */
    vsrai.w         vr3,    vr3,    14
    vsrai.w         vr2,    vr2,    14
    vpackev.h       vr1,    vr3,    vr2         /* low halves back in sample order */
    vst             vr1,    a0,     -16
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step: same formula, one sample per iteration. */
    ld.h            t4,     a0,     0
    addi.d          a0,     a0,     2
    addi.d          t3,     t3,     -1
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     14
    st.h            t4,     a0,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * lumRangeFromJpeg_lasx
 *
 * Rescales a run of signed 16-bit samples in place (LASX version):
 *     x = (x * 14071 + 33561947) >> 14
 *
 *   a0 = address of the samples (each one is read, then overwritten)
 *   a1 = number of samples
 *
 * a1 >> 4 vector steps handle 16 samples each; the remaining (a1 & 15)
 * samples are processed one at a time.
 * Scratch registers: t0-t4, xr0-xr3.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dst, int width) - confirm against the declaration.
 */
function lumRangeFromJpeg_lasx
    li.w            t1,     33561947            /* bias added before the shift    */
    li.w            t0,     14071               /* multiplier                     */
    andi            t3,     a1,     15          /* t3 = samples left for scalar   */
    srli.w          t2,     a1,     4           /* t2 = number of 16-sample steps */
    xvreplgr2vr.h   xr0,    t0                  /* multiplier in every halfword   */
    beqz            t2,     2f
1:
    /* Vector step: 16 samples. a0 is advanced right after the load, so
     * the store below addresses the same 32 bytes through offset -32. */
    xvld            xr1,    a0,     0
    addi.d          a0,     a0,     32
    addi.d          t2,     t2,     -1
    xvreplgr2vr.w   xr3,    t1                  /* accumulators start at the bias */
    xvreplgr2vr.w   xr2,    t1
    xvmaddwod.w.h   xr3,    xr0,    xr1         /* odd  halfwords: bias + x * mul */
    xvmaddwev.w.h   xr2,    xr0,    xr1         /* even halfwords: bias + x * mul */
    xvsrai.w        xr3,    xr3,    14
    xvsrai.w        xr2,    xr2,    14
    xvpackev.h      xr1,    xr3,    xr2         /* low halves back in sample order */
    xvst            xr1,    a0,     -32
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step: same formula, one sample per iteration. */
    ld.h            t4,     a0,     0
    addi.d          a0,     a0,     2
    addi.d          t3,     t3,     -1
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     14
    st.h            t4,     a0,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * lumRangeToJpeg_lsx
 *
 * Clamps and rescales a run of signed 16-bit samples in place (LSX):
 *     x = (min(x, 30189) * 19077 - 39057361) >> 14
 *
 *   a0 = address of the samples (each one is read, then overwritten)
 *   a1 = number of samples
 *
 * a1 >> 3 vector steps handle 8 samples each; the remaining (a1 & 7)
 * samples are processed one at a time.
 * Scratch registers: t0-t4, vr0-vr4.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dst, int width) - confirm against the declaration.
 */
function lumRangeToJpeg_lsx
    li.w            t3,     30189               /* upper clamp for the input     */
    li.w            t1,     -39057361           /* bias added before the shift   */
    li.w            t0,     19077               /* multiplier                    */
    vreplgr2vr.h    vr4,    t3                  /* clamp in every halfword       */
    vreplgr2vr.h    vr0,    t0                  /* multiplier in every halfword  */
    andi            t3,     a1,     7           /* t3 reused: scalar leftovers   */
    srli.w          t2,     a1,     3           /* t2 = number of 8-sample steps */
    beqz            t2,     2f
1:
    /* Vector step: 8 samples. a0 is advanced right after the load, so
     * the store below addresses the same 16 bytes through offset -16. */
    vld             vr1,    a0,     0
    addi.d          a0,     a0,     16
    addi.d          t2,     t2,     -1
    vmin.h          vr1,    vr1,    vr4         /* signed min against the clamp   */
    vreplgr2vr.w    vr3,    t1                  /* accumulators start at the bias */
    vreplgr2vr.w    vr2,    t1
    vmaddwod.w.h    vr3,    vr0,    vr1         /* odd  halfwords: bias + x * mul */
    vmaddwev.w.h    vr2,    vr0,    vr1         /* even halfwords: bias + x * mul */
    vsrai.w         vr3,    vr3,    14
    vsrai.w         vr2,    vr2,    14
    vpackev.h       vr1,    vr3,    vr2         /* low halves back in sample order */
    vst             vr1,    a0,     -16
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step. The clamp is still done with the vector unit: the
     * sample is broadcast, min'ed against vr4, and lane 0 is read back. */
    ld.h            t4,     a0,     0
    addi.d          a0,     a0,     2
    addi.d          t3,     t3,     -1
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr4
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     14
    st.h            t4,     a0,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * lumRangeToJpeg_lasx
 *
 * Clamps and rescales a run of signed 16-bit samples in place (LASX):
 *     x = (min(x, 30189) * 19077 - 39057361) >> 14
 *
 *   a0 = address of the samples (each one is read, then overwritten)
 *   a1 = number of samples
 *
 * a1 >> 4 vector steps handle 16 samples each; the remaining (a1 & 15)
 * samples are processed one at a time.
 * Scratch registers: t0-t4, xr0-xr4.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dst, int width) - confirm against the declaration.
 */
function lumRangeToJpeg_lasx
    li.w            t3,     30189               /* upper clamp for the input      */
    li.w            t1,     -39057361           /* bias added before the shift    */
    li.w            t0,     19077               /* multiplier                     */
    xvreplgr2vr.h   xr4,    t3                  /* clamp in every halfword        */
    xvreplgr2vr.h   xr0,    t0                  /* multiplier in every halfword   */
    andi            t3,     a1,     15          /* t3 reused: scalar leftovers    */
    srli.w          t2,     a1,     4           /* t2 = number of 16-sample steps */
    beqz            t2,     2f
1:
    /* Vector step: 16 samples. a0 is advanced right after the load, so
     * the store below addresses the same 32 bytes through offset -32. */
    xvld            xr1,    a0,     0
    addi.d          a0,     a0,     32
    addi.d          t2,     t2,     -1
    xvmin.h         xr1,    xr1,    xr4         /* signed min against the clamp   */
    xvreplgr2vr.w   xr3,    t1                  /* accumulators start at the bias */
    xvreplgr2vr.w   xr2,    t1
    xvmaddwod.w.h   xr3,    xr0,    xr1         /* odd  halfwords: bias + x * mul */
    xvmaddwev.w.h   xr2,    xr0,    xr1         /* even halfwords: bias + x * mul */
    xvsrai.w        xr3,    xr3,    14
    xvsrai.w        xr2,    xr2,    14
    xvpackev.h      xr1,    xr3,    xr2         /* low halves back in sample order */
    xvst            xr1,    a0,     -32
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step. The clamp uses 128-bit vector ops: the sample is
     * broadcast, min'ed against vr4 (the low half of xr4, which holds
     * the clamp value), and lane 0 is read back. */
    ld.h            t4,     a0,     0
    addi.d          a0,     a0,     2
    addi.d          t3,     t3,     -1
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr4
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     14
    st.h            t4,     a0,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * chrRangeFromJpeg_lsx
 *
 * Rescales two runs of signed 16-bit samples in place (LSX version),
 * applying the same formula to both:
 *     x = (x * 1799 + 4081085) >> 11
 *
 *   a0 = address of the first buffer  (read, then overwritten)
 *   a1 = address of the second buffer (read, then overwritten)
 *   a2 = number of samples in each buffer
 *
 * a2 >> 3 vector steps handle 8 samples per buffer; the remaining
 * (a2 & 7) samples are processed one at a time.
 * Scratch registers: t0-t5, vr0-vr6.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dstU, int16_t *dstV, int width) - confirm.
 */
function chrRangeFromJpeg_lsx
    li.w            t1,     4081085             /* bias added before the shift   */
    li.w            t0,     1799                /* multiplier                    */
    andi            t3,     a2,     7           /* t3 = samples left for scalar  */
    srli.w          t2,     a2,     3           /* t2 = number of 8-sample steps */
    vreplgr2vr.h    vr0,    t0                  /* multiplier in every halfword  */
    beqz            t2,     2f
1:
    /* Vector step: both loads are issued before any store, and both
     * pointers are advanced early, hence the -16 store offsets. */
    vld             vr1,    a0,     0
    vld             vr2,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    addi.d          t2,     t2,     -1
    vreplgr2vr.w    vr3,    t1                  /* accumulators start at the bias */
    vreplgr2vr.w    vr4,    t1
    vreplgr2vr.w    vr5,    t1
    vreplgr2vr.w    vr6,    t1
    /* first buffer */
    vmaddwev.w.h    vr3,    vr0,    vr1         /* even halfwords: bias + x * mul */
    vmaddwod.w.h    vr4,    vr0,    vr1         /* odd  halfwords: bias + x * mul */
    vsrai.w         vr3,    vr3,    11
    vsrai.w         vr4,    vr4,    11
    vpackev.h       vr1,    vr4,    vr3         /* low halves back in sample order */
    /* second buffer */
    vmaddwev.w.h    vr5,    vr0,    vr2
    vmaddwod.w.h    vr6,    vr0,    vr2
    vsrai.w         vr5,    vr5,    11
    vsrai.w         vr6,    vr6,    11
    vpackev.h       vr2,    vr6,    vr5
    vst             vr1,    a0,     -16
    vst             vr2,    a1,     -16
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step: one sample from each buffer per iteration. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    addi.d          t3,     t3,     -1
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     11
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     11
    st.h            t4,     a0,     -2
    st.h            t5,     a1,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * chrRangeFromJpeg_lasx
 *
 * Rescales two runs of signed 16-bit samples in place (LASX version),
 * applying the same formula to both:
 *     x = (x * 1799 + 4081085) >> 11
 *
 *   a0 = address of the first buffer  (read, then overwritten)
 *   a1 = address of the second buffer (read, then overwritten)
 *   a2 = number of samples in each buffer
 *
 * a2 >> 4 vector steps handle 16 samples per buffer; the remaining
 * (a2 & 15) samples are processed one at a time.
 * Scratch registers: t0-t5, xr0-xr6.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dstU, int16_t *dstV, int width) - confirm.
 */
function chrRangeFromJpeg_lasx
    li.w            t1,     4081085             /* bias added before the shift    */
    li.w            t0,     1799                /* multiplier                     */
    andi            t3,     a2,     15          /* t3 = samples left for scalar   */
    srli.w          t2,     a2,     4           /* t2 = number of 16-sample steps */
    xvreplgr2vr.h   xr0,    t0                  /* multiplier in every halfword   */
    beqz            t2,     2f
1:
    /* Vector step: both loads are issued before any store, and both
     * pointers are advanced early, hence the -32 store offsets. */
    xvld            xr1,    a0,     0
    xvld            xr2,    a1,     0
    addi.d          a0,     a0,     32
    addi.d          a1,     a1,     32
    addi.d          t2,     t2,     -1
    xvreplgr2vr.w   xr3,    t1                  /* accumulators start at the bias */
    xvreplgr2vr.w   xr4,    t1
    xvreplgr2vr.w   xr5,    t1
    xvreplgr2vr.w   xr6,    t1
    /* first buffer */
    xvmaddwev.w.h   xr3,    xr0,    xr1         /* even halfwords: bias + x * mul */
    xvmaddwod.w.h   xr4,    xr0,    xr1         /* odd  halfwords: bias + x * mul */
    xvsrai.w        xr3,    xr3,    11
    xvsrai.w        xr4,    xr4,    11
    xvpackev.h      xr1,    xr4,    xr3         /* low halves back in sample order */
    /* second buffer */
    xvmaddwev.w.h   xr5,    xr0,    xr2
    xvmaddwod.w.h   xr6,    xr0,    xr2
    xvsrai.w        xr5,    xr5,    11
    xvsrai.w        xr6,    xr6,    11
    xvpackev.h      xr2,    xr6,    xr5
    xvst            xr1,    a0,     -32
    xvst            xr2,    a1,     -32
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step: one sample from each buffer per iteration. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    addi.d          t3,     t3,     -1
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     11
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     11
    st.h            t4,     a0,     -2
    st.h            t5,     a1,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * chrRangeToJpeg_lsx
 *
 * Clamps and rescales two runs of signed 16-bit samples in place
 * (LSX version), applying the same formula to both:
 *     x = (min(x, 30775) * 4663 - 9289992) >> 12
 *
 *   a0 = address of the first buffer  (read, then overwritten)
 *   a1 = address of the second buffer (read, then overwritten)
 *   a2 = number of samples in each buffer
 *
 * a2 >> 3 vector steps handle 8 samples per buffer; the remaining
 * (a2 & 7) samples are processed one at a time.
 * Scratch registers: t0-t5, vr0-vr7.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dstU, int16_t *dstV, int width) - confirm.
 */
function chrRangeToJpeg_lsx
    li.w            t3,     30775               /* upper clamp for the input     */
    li.w            t1,     -9289992            /* bias added before the shift   */
    li.w            t0,     4663                /* multiplier                    */
    vreplgr2vr.h    vr7,    t3                  /* clamp in every halfword       */
    vreplgr2vr.h    vr0,    t0                  /* multiplier in every halfword  */
    andi            t3,     a2,     7           /* t3 reused: scalar leftovers   */
    srli.w          t2,     a2,     3           /* t2 = number of 8-sample steps */
    beqz            t2,     2f
1:
    /* Vector step: both loads are issued before any store, and both
     * pointers are advanced early, hence the -16 store offsets. */
    vld             vr1,    a0,     0
    vld             vr2,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    addi.d          t2,     t2,     -1
    vreplgr2vr.w    vr3,    t1                  /* accumulators start at the bias */
    vreplgr2vr.w    vr4,    t1
    vreplgr2vr.w    vr5,    t1
    vreplgr2vr.w    vr6,    t1
    /* first buffer */
    vmin.h          vr1,    vr1,    vr7         /* signed min against the clamp   */
    vmaddwev.w.h    vr3,    vr0,    vr1         /* even halfwords: bias + x * mul */
    vmaddwod.w.h    vr4,    vr0,    vr1         /* odd  halfwords: bias + x * mul */
    vsrai.w         vr3,    vr3,    12
    vsrai.w         vr4,    vr4,    12
    vpackev.h       vr1,    vr4,    vr3         /* low halves back in sample order */
    /* second buffer */
    vmin.h          vr2,    vr2,    vr7
    vmaddwev.w.h    vr5,    vr0,    vr2
    vmaddwod.w.h    vr6,    vr0,    vr2
    vsrai.w         vr5,    vr5,    12
    vsrai.w         vr6,    vr6,    12
    vpackev.h       vr2,    vr6,    vr5
    vst             vr1,    a0,     -16
    vst             vr2,    a1,     -16
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step. The clamp is still done with the vector unit: each
     * sample is broadcast, min'ed against vr7, and lane 0 is read back. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    addi.d          t3,     t3,     -1
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr7
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     12
    vreplgr2vr.h    vr2,    t5
    vmin.h          vr2,    vr2,    vr7
    vpickve2gr.h    t5,     vr2,    0
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     12
    st.h            t4,     a0,     -2
    st.h            t5,     a1,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc
/*
 * chrRangeToJpeg_lasx
 *
 * Clamps and rescales two runs of signed 16-bit samples in place
 * (LASX version), applying the same formula to both:
 *     x = (min(x, 30775) * 4663 - 9289992) >> 12
 *
 *   a0 = address of the first buffer  (read, then overwritten)
 *   a1 = address of the second buffer (read, then overwritten)
 *   a2 = number of samples in each buffer
 *
 * a2 >> 4 vector steps handle 16 samples per buffer; the remaining
 * (a2 & 15) samples are processed one at a time.
 * Scratch registers: t0-t5, xr0-xr7.
 * NOTE(review): presumably matches a C prototype of the form
 * void f(int16_t *dstU, int16_t *dstV, int width) - confirm.
 */
function chrRangeToJpeg_lasx
    li.w            t3,     30775               /* upper clamp for the input      */
    li.w            t1,     -9289992            /* bias added before the shift    */
    li.w            t0,     4663                /* multiplier                     */
    xvreplgr2vr.h   xr7,    t3                  /* clamp in every halfword        */
    xvreplgr2vr.h   xr0,    t0                  /* multiplier in every halfword   */
    andi            t3,     a2,     15          /* t3 reused: scalar leftovers    */
    srli.w          t2,     a2,     4           /* t2 = number of 16-sample steps */
    beqz            t2,     2f
1:
    /* Vector step: both loads are issued before any store, and both
     * pointers are advanced early, hence the -32 store offsets. */
    xvld            xr1,    a0,     0
    xvld            xr2,    a1,     0
    addi.d          a0,     a0,     32
    addi.d          a1,     a1,     32
    addi.d          t2,     t2,     -1
    xvreplgr2vr.w   xr3,    t1                  /* accumulators start at the bias */
    xvreplgr2vr.w   xr4,    t1
    xvreplgr2vr.w   xr5,    t1
    xvreplgr2vr.w   xr6,    t1
    /* first buffer */
    xvmin.h         xr1,    xr1,    xr7         /* signed min against the clamp   */
    xvmaddwev.w.h   xr3,    xr0,    xr1         /* even halfwords: bias + x * mul */
    xvmaddwod.w.h   xr4,    xr0,    xr1         /* odd  halfwords: bias + x * mul */
    xvsrai.w        xr3,    xr3,    12
    xvsrai.w        xr4,    xr4,    12
    xvpackev.h      xr1,    xr4,    xr3         /* low halves back in sample order */
    /* second buffer */
    xvmin.h         xr2,    xr2,    xr7
    xvmaddwev.w.h   xr5,    xr0,    xr2
    xvmaddwod.w.h   xr6,    xr0,    xr2
    xvsrai.w        xr5,    xr5,    12
    xvsrai.w        xr6,    xr6,    12
    xvpackev.h      xr2,    xr6,    xr5
    xvst            xr1,    a0,     -32
    xvst            xr2,    a1,     -32
    bnez            t2,     1b
2:
    beqz            t3,     4f
3:
    /* Scalar step. The clamp uses 128-bit vector ops: each sample is
     * broadcast, min'ed against vr7 (the low half of xr7, which holds
     * the clamp value), and lane 0 is read back. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    addi.d          t3,     t3,     -1
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr7
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     12
    vreplgr2vr.h    vr2,    t5
    vmin.h          vr2,    vr2,    vr7
    vpickve2gr.h    t5,     vr2,    0
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     12
    st.h            t4,     a0,     -2
    st.h            t5,     a1,     -2
    bnez            t3,     3b
4:
    /* No explicit return here; presumably emitted by endfunc - confirm. */
endfunc