// ffmpeg/libavcodec/loongarch/h264qpel.S
// yuanhecai f6077cc666
// avcodec/la: Add LSX optimization for h264 qpel.
// ./configure --disable-lasx
// ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
// before: 214fps
// after:  274fps
//
// Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
// Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
// 2023-05-25 21:05:01 +02:00
//
// 1687 lines
// 60 KiB
// ArmAsm

/*
* Loongson LSX optimized h264qpel
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Hecai Yuan <yuanhecai@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "loongson_asm.S"
// VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
// Loads the two rows at in4 and in4 + a2 into vr0/vr1, runs the horizontal
// 6-tap filter QPEL8_H_LSX on them (16-bit sums land in in0/in1) and then
// narrows with a saturating >> 5. After the narrowing each result register
// holds the bytes derived from in2/in3 in its low 64 bits and the bytes
// derived from the freshly filtered in0/in1 in its high 64 bits.
//   in0, in1: vector registers, results for the first / second row
//   in2, in3: vector registers holding 16-bit sums computed earlier
//   in4:      general register with the row base address
// Uses a2 as row stride; overwrites vr0-vr11 (vr2-vr11 via QPEL8_H_LSX).
.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
vld vr0, \in4, 0
vldx vr1, \in4, a2
QPEL8_H_LSX \in0, \in1
vssrani.bu.h \in0, \in2, 5
vssrani.bu.h \in1, \in3, 5
.endm
// VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
// Same operation as VLD_QPEL8_H_SSRANI_LSX, but the two rows are fetched
// from in4 + t1 and in4 + t2 (the callers in this file set t1 = 2 * stride
// and t2 = 3 * stride before using it).
//   in0, in1: vector registers, results for the two rows
//   in2, in3: vector registers holding 16-bit sums computed earlier
//   in4:      general register with the row base address
// Overwrites vr0-vr11 (vr2-vr11 via QPEL8_H_LSX).
.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
vldx vr0, \in4, t1
vldx vr1, \in4, t2
QPEL8_H_LSX \in0, \in1
vssrani.bu.h \in0, \in2, 5
vssrani.bu.h \in1, \in3, 5
.endm
// VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
// Four-row variant: filters the rows at in8 + {0, a2, t1, t2} horizontally
// and narrows them (saturating >> 5) together with four sets of previously
// computed 16-bit sums.
//   in0-in3: vector registers, results for rows 0-3
//   in4-in7: vector registers with earlier 16-bit sums for rows 0-3; their
//            narrowed bytes end up in the low 64 bits of in0-in3
//   in8:     general register with the row base address
// Uses a2, t1 and t2 as row offsets; overwrites vr0-vr11.
.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
// rows 0 and 1
vld vr0, \in8, 0
vldx vr1, \in8, a2
QPEL8_H_LSX \in0, \in1
vssrani.bu.h \in0, \in4, 5
vssrani.bu.h \in1, \in5, 5
// rows 2 and 3
vldx vr0, \in8, t1
vldx vr1, \in8, t2
QPEL8_H_LSX \in2, \in3
vssrani.bu.h \in2, \in6, 5
vssrani.bu.h \in3, \in7, 5
.endm
/*
 * ff_put_h264_qpel16_mc00_lsx: full-pel case, a plain 16x16 block copy.
 *   a0 = dst, a1 = src, a2 = stride (applied to both pointers)
 * Four passes, each moving four 16-byte rows.
 */
function ff_put_h264_qpel16_mc00_lsx
    slli.d        t0,     a2,     1          // t0 = 2 * stride
    add.d         t1,     a2,     t0         // t1 = 3 * stride
    slli.d        t2,     a2,     2          // t2 = 4 * stride
.rept 4
    vld           vr4,    a1,     0          // fetch four source rows
    vldx          vr5,    a1,     a2
    vldx          vr6,    a1,     t0
    vldx          vr7,    a1,     t1
    add.d         a1,     t2,     a1         // src += 4 * stride
    vst           vr4,    a0,     0          // and write them to dst
    vstx          vr5,    a0,     a2
    vstx          vr6,    a0,     t0
    vstx          vr7,    a0,     t1
    add.d         a0,     t2,     a0         // dst += 4 * stride
.endr
endfunc
// QPEL8_H_LSX out0, out1
// Horizontal 6-tap low-pass over the two rows held in vr0 and vr1 (bytes
// 0..12 of each register are consumed). For each of the 8 output columns x
// it computes, in 16-bit lanes,
//   vr20 * (p[x+2] + p[x+3]) - vr21 * (p[x+1] + p[x+4]) + (p[x] + p[x+5]) + vr22
// and leaves the unshifted sums in out0 (row from vr0) and out1 (row from
// vr1). The users in this file preload vr20/vr21/vr22 with
// vldi 0x414/0x405/0x410.
// Overwrites vr2-vr11; vr0 and vr1 are only read.
.macro QPEL8_H_LSX out0, out1
// byte-shifted copies of both rows: p[x+1] .. p[x+5]
vbsrl.v vr2, vr0, 1
vbsrl.v vr3, vr1, 1
vbsrl.v vr4, vr0, 2
vbsrl.v vr5, vr1, 2
vbsrl.v vr6, vr0, 3
vbsrl.v vr7, vr1, 3
vbsrl.v vr8, vr0, 4
vbsrl.v vr9, vr1, 4
vbsrl.v vr10, vr0, 5
vbsrl.v vr11, vr1, 5
// pair up the taps that share a coefficient ...
vilvl.b vr6, vr4, vr6
vilvl.b vr7, vr5, vr7
vilvl.b vr8, vr2, vr8
vilvl.b vr9, vr3, vr9
vilvl.b vr10, vr0, vr10
vilvl.b vr11, vr1, vr11
// ... and add each byte pair into an unsigned 16-bit lane
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
// weight the centre pair by vr20 and the next pair by vr21
vmul.h vr2, vr6, vr20
vmul.h vr3, vr7, vr20
vmul.h vr4, vr8, vr21
vmul.h vr5, vr9, vr21
vssub.h vr2, vr2, vr4
vssub.h vr3, vr3, vr5
vsadd.h vr2, vr2, vr10
vsadd.h vr3, vr3, vr11
// add the bias held in vr22
vsadd.h \out0, vr2, vr22
vsadd.h \out1, vr3, vr22
.endm
// VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
// Loads the four rows at in4 + {0, a2, t1, t2} and runs QPEL8_H_LSX on them,
// leaving the unshifted 16-bit filter sums in in0-in3 (one register per row).
//   in0-in3: vector registers receiving the sums for rows 0-3
//   in4:     general register with the row base address
// Uses a2, t1 and t2 as row offsets; overwrites vr0-vr11.
.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
vld vr0, \in4, 0
vldx vr1, \in4, a2
QPEL8_H_LSX \in0, \in1
vldx vr0, \in4, t1
vldx vr1, \in4, t2
QPEL8_H_LSX \in2, \in3
.endm
// put_h264_qpel16 in0
// Emits ff_put_h264_qpel16_mc<in0>_lsx (instantiated below for 10 and 30).
// The function writes a 16x16 block: the horizontal 6-tap filter result,
// averaged with rounding against the source pixels at src (in0 = 10) or at
// src + 1 (any other in0).
//   a0 = dst, a1 = src, a2 = stride
.macro put_h264_qpel16 in0
function ff_put_h264_qpel16_mc\in0\()_lsx
// t8 = pointer to the unfiltered pixels that are averaged in
.ifc \in0, 10
addi.d t8, a1, 0
.else
addi.d t8, a1, 1
.endif
// filter constants read by QPEL8_H_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, t1, a2 // t2 = 3 * stride
addi.d t0, a1, -2 // t0 = src - 2
addi.d a1, t0, 8 // a1 = t0 + 8
// each pass produces four 16-pixel rows
.rept 4
// 16-bit sums of the left 8 columns for rows 0-3
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
// right 8 columns of rows 0-1, packed together with the left halves
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
vld vr10, t8, 0
vldx vr11, t8, a2
vavgr.bu vr0, vr2, vr10
vavgr.bu vr1, vr3, vr11
vst vr0, a0, 0
vstx vr1, a0, a2
// same for rows 2-3
VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
vldx vr12, t8, t1
vldx vr13, t8, t2
vavgr.bu vr2, vr4, vr12
vavgr.bu vr3, vr5, vr13
vstx vr2, a0, t1
vstx vr3, a0, t2
// advance every pointer by 4 rows
alsl.d a0, a2, a0, 2
alsl.d t8, a2, t8, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
.endr
endfunc
.endm
put_h264_qpel16 10
put_h264_qpel16 30
// ff_put_h264_qpel16_mc20_lsx
// Writes a 16x16 block containing the horizontal 6-tap filter result
// (rounded, >> 5, saturated to bytes); no averaging with other samples.
//   a0 = dst, a1 = src, a2 = stride
function ff_put_h264_qpel16_mc20_lsx
// filter constants read by QPEL8_H_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, t1, a2 // t2 = 3 * stride
addi.d t0, a1, -2 // t0 = src - 2
addi.d a1, t0, 8 // a1 = t0 + 8
// each pass produces four 16-pixel rows
.rept 4
// 16-bit sums of the left 8 columns for rows 0-3
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
// rows 0-1: filter the right 8 columns and pack both halves
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
vst vr2, a0, 0
vstx vr3, a0, a2
// rows 2-3
VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
vstx vr4, a0, t1
vstx vr5, a0, t2
// advance every pointer by 4 rows
alsl.d a0, a2, a0, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
.endr
endfunc
// QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
// Vertical 6-tap low-pass on seven consecutive 16-byte rows in0..in6.
// Two output rows are produced, each rounded with vr22, shifted right by 5
// and saturated to unsigned bytes:
//   vr13 = vr20 * (in2 + in3) - vr21 * (in1 + in4) + (in0 + in5)
//   vr14 = vr20 * (in3 + in4) - vr21 * (in2 + in5) + (in1 + in6)
// The input registers are only read. Overwrites vr7-vr18.
.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
// ---- low 8 columns ----
vilvl.b vr7, \in3, \in2
vilvl.b vr8, \in4, \in3
vilvl.b vr9, \in4, \in1
vilvl.b vr10, \in5, \in2
vilvl.b vr11, \in5, \in0
vilvl.b vr12, \in6, \in1
// add each interleaved byte pair into an unsigned 16-bit lane
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vmul.h vr7, vr7, vr20
vmul.h vr8, vr8, vr20
vmul.h vr9, vr9, vr21
vmul.h vr10, vr10, vr21
vssub.h vr7, vr7, vr9
vssub.h vr8, vr8, vr10
vsadd.h vr7, vr7, vr11
vsadd.h vr8, vr8, vr12
vsadd.h vr7, vr7, vr22
vsadd.h vr8, vr8, vr22
// ---- high 8 columns ----
vilvh.b vr13, \in3, \in2
vilvh.b vr14, \in4, \in3
vilvh.b vr15, \in4, \in1
vilvh.b vr16, \in5, \in2
vilvh.b vr17, \in5, \in0
vilvh.b vr18, \in6, \in1
vhaddw.hu.bu vr13, vr13, vr13
vhaddw.hu.bu vr14, vr14, vr14
vhaddw.hu.bu vr15, vr15, vr15
vhaddw.hu.bu vr16, vr16, vr16
vhaddw.hu.bu vr17, vr17, vr17
vhaddw.hu.bu vr18, vr18, vr18
vmul.h vr13, vr13, vr20
vmul.h vr14, vr14, vr20
vmul.h vr15, vr15, vr21
vmul.h vr16, vr16, vr21
vssub.h vr13, vr13, vr15
vssub.h vr14, vr14, vr16
vsadd.h vr13, vr13, vr17
vsadd.h vr14, vr14, vr18
vsadd.h vr13, vr13, vr22
vsadd.h vr14, vr14, vr22
// >> 5 with unsigned saturation; low-column results go to the low 64 bits,
// high-column results stay in the high 64 bits
vssrani.bu.h vr13, vr7, 5
vssrani.bu.h vr14, vr8, 5
.endm
// put_h264_qpel16_mc1 in0
// Emits ff_put_h264_qpel16_mc<in0>_lsx (instantiated below for 01 and 03).
// The function writes a 16x16 block: the vertical 6-tap filter result from
// QPEL8_V_LSX, averaged with rounding against the source row at the same
// position (in0 = 01) or the row one stride further down (any other in0).
//   a0 = dst, a1 = src, a2 = stride
// vr0-vr6 form a rotating window of seven source rows; every step loads two
// new rows into the registers whose rows are no longer needed.
.macro put_h264_qpel16_mc1 in0
function ff_put_h264_qpel16_mc\in0\()_lsx
slli.d t0, a2, 1 // t0 = 2 * stride
add.d t1, t0, a2 // t1 = 3 * stride
sub.d t2, a1, t0 // t2 = src - 2 * stride
// filter constants read by QPEL8_V_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// source rows -2 .. 4
vld vr0, t2, 0
vldx vr1, t2, a2
vldx vr2, t2, t0
vldx vr3, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr4, t2, 0
vldx vr5, t2, a2
vldx vr6, t2, t0
// output rows 0-1
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \in0, 01
vavgr.bu vr13, vr2, vr13
vavgr.bu vr14, vr3, vr14
.else
vavgr.bu vr13, vr3, vr13
vavgr.bu vr14, vr4, vr14
.endif
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 2-3
vldx vr0, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 *stride
vld vr1, t2, 0
QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
.ifc \in0, 01
vavgr.bu vr13, vr4, vr13
vavgr.bu vr14, vr5, vr14
.else
vavgr.bu vr13, vr5, vr13
vavgr.bu vr14, vr6, vr14
.endif
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 4-5
vldx vr2, t2, a2
vldx vr3, t2, t0
QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
.ifc \in0, 01
vavgr.bu vr13, vr6, vr13
vavgr.bu vr14, vr0, vr14
.else
vavgr.bu vr13, vr0, vr13
vavgr.bu vr14, vr1, vr14
.endif
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 6-7
vldx vr4, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr5, t2, 0
QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
.ifc \in0, 01
vavgr.bu vr13, vr1, vr13
vavgr.bu vr14, vr2, vr14
.else
vavgr.bu vr13, vr2, vr13
vavgr.bu vr14, vr3, vr14
.endif
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 8-9
vldx vr6, t2, a2
vldx vr0, t2, t0
QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
.ifc \in0, 01
vavgr.bu vr13, vr3, vr13
vavgr.bu vr14, vr4, vr14
.else
vavgr.bu vr13, vr4, vr13
vavgr.bu vr14, vr5, vr14
.endif
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 10-11
vldx vr1, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr2, t2, 0
QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
.ifc \in0, 01
vavgr.bu vr13, vr5, vr13
vavgr.bu vr14, vr6, vr14
.else
vavgr.bu vr13, vr6, vr13
vavgr.bu vr14, vr0, vr14
.endif
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 12-13
vldx vr3, t2, a2
vldx vr4, t2, t0
QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
.ifc \in0, 01
vavgr.bu vr13, vr0, vr13
vavgr.bu vr14, vr1, vr14
.else
vavgr.bu vr13, vr1, vr13
vavgr.bu vr14, vr2, vr14
.endif
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 14-15
vldx vr5, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr6, t2, 0
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \in0, 01
vavgr.bu vr13, vr2, vr13
vavgr.bu vr14, vr3, vr14
.else
vavgr.bu vr13, vr3, vr13
vavgr.bu vr14, vr4, vr14
.endif
vstx vr13, a0, t0
vstx vr14, a0, t1
endfunc
.endm
put_h264_qpel16_mc1 01
put_h264_qpel16_mc1 03
// VST_QPEL8_V_LOWPASS_LSX in0, ..., in8
// Runs the vertical filter QPEL8_V_LSX on the seven rows in0..in6, averages
// its two output rows (vr13, vr14) with in7 and in8 (rounding) and stores
// them at a0 and a0 + a2.
//   in0-in6: vector registers with seven consecutive source rows
//   in7, in8: vector registers averaged with the first / second output row
// Overwrites vr7-vr18.
.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
vavgr.bu vr13, \in7, vr13
vavgr.bu vr14, \in8, vr14
vst vr13, a0, 0
vstx vr14, a0, a2
.endm
// VSTX_QPEL8_V_LOWPASS_LSX in0, ..., in8
// Same as VST_QPEL8_V_LOWPASS_LSX, except that the two averaged rows are
// stored at a0 + t1 and a0 + t2 (the callers in this file keep
// t1 = 2 * stride and t2 = 3 * stride).
//   in0-in6: vector registers with seven consecutive source rows
//   in7, in8: vector registers averaged with the first / second output row
// Overwrites vr7-vr18.
.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
vavgr.bu vr13, \in7, vr13
vavgr.bu vr14, \in8, vr14
vstx vr13, a0, t1
vstx vr14, a0, t2
.endm
// ff_put_h264_qpel16_mc11_lsx
// Writes a 16x16 block that is the rounding average of
//   - the horizontal 6-tap filter applied to the rows starting at src, and
//   - the vertical 6-tap filter applied to the columns starting at src.
//   a0 = dst, a1 = src, a2 = stride
// f24-f31 are spilled to the stack around the body because vr24-vr30 are
// used as scratch for the horizontal results.
function ff_put_h264_qpel16_mc11_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, t1, a2 // t2 = 3 * stride
slli.d t6, t1, 1 // t6 = 4 * stride
// filter constants read by QPEL8_H_LSX and QPEL8_V_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
sub.d t4, a1, t1 // t4 = src - 2 * stride
addi.d t0, a1, -2 // t0 = src - 2
addi.d a1, t0, 8 // a1 = t0 + 8
// each pass produces eight output rows
.rept 2
// horizontal pass: 16-bit sums of the left 8 columns for rows 0-7 ...
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
alsl.d t0, a2, t0, 2
VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t0
// ... then the right 8 columns, packed into full rows in vr23-vr30
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
vr14, vr15, a1
alsl.d a1, a2, a1, 2
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
vr18, vr19, a1
// vertical pass over a rotating seven-row window, averaged with the
// horizontal rows and stored two rows at a time
vld vr0, t4, 0 // t4 = src - 2 * stride
vldx vr1, t4, a2
vldx vr2, t4, t1
vldx vr3, t4, t2
alsl.d t4, a2, t4, 2 // src + 2 *stride
vld vr4, t4, 0
vldx vr5, t4, a2
vldx vr6, t4, t1
VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
vldx vr0, t4, t2
alsl.d t4, a2, t4, 2 // src + 6 *stride
vld vr1, t4, 0
VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
vldx vr2, t4, a2
vldx vr3, t4, t1
VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
vldx vr4, t4, t2
alsl.d t4, a2, t4, 2 // src + 10 *stride
vld vr5, t4, 0
VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
// set up the pointers for the next eight rows
alsl.d t0, a2, t0, 2
alsl.d a1, a2, a1, 2 // a1 = src + 8 * stride
alsl.d a0, a2, a0, 2 // dst = dst + 8 * stride
// t4 moved 12 rows during this pass; step back 4 for a net advance of 8
sub.d t4, t4, t6
.endr
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc
/*
 * ff_avg_h264_qpel16_mc00_lsx: full-pel case with destination averaging.
 * Every 16-byte row of the 16x16 block becomes the rounding average of the
 * source row and the row already present in dst.
 *   a0 = dst, a1 = src, a2 = stride (applied to both pointers)
 */
function ff_avg_h264_qpel16_mc00_lsx
    slli.d        t0,     a2,     1          // t0 = 2 * stride
    add.d         t1,     a2,     t0         // t1 = 3 * stride
    slli.d        t2,     a2,     2          // t2 = 4 * stride
    addi.d        t4,     a0,     0          // t4 = read cursor into dst
.rept 4
    vld           vr4,    a1,     0          // four source rows
    vldx          vr5,    a1,     a2
    vldx          vr6,    a1,     t0
    vldx          vr7,    a1,     t1
    add.d         a1,     t2,     a1         // src += 4 * stride
    vld           vr12,   t4,     0          // four current dst rows
    vldx          vr13,   t4,     a2
    vldx          vr14,   t4,     t0
    vldx          vr15,   t4,     t1
    add.d         t4,     t2,     t4
    vavgr.bu      vr4,    vr12,   vr4        // (dst + src + 1) >> 1 per byte
    vavgr.bu      vr5,    vr13,   vr5
    vavgr.bu      vr6,    vr14,   vr6
    vavgr.bu      vr7,    vr15,   vr7
    vst           vr4,    a0,     0
    vstx          vr5,    a0,     a2
    vstx          vr6,    a0,     t0
    vstx          vr7,    a0,     t1
    add.d         a0,     t2,     a0         // dst += 4 * stride
.endr
endfunc
// put_h264_qpel16_mc in0
// Emits ff_put_h264_qpel16_mc<in0>_lsx (instantiated below for 33 and 31).
// The function writes a 16x16 block that is the rounding average of
//   - the horizontal 6-tap filter applied to the rows starting at src
//     (in0 = 31) or at src + stride (in0 = 33), and
//   - the vertical 6-tap filter applied to the columns starting at src + 1.
//   a0 = dst, a1 = src, a2 = stride
// f24-f31 are spilled to the stack around the body because vr24-vr30 hold
// the horizontal results. The block is processed as two halves of 8 rows.
.macro put_h264_qpel16_mc in0
function ff_put_h264_qpel16_mc\in0\()_lsx
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
    slli.d t1, a2, 1 // t1 = 2 * stride
    add.d t2, t1, a2 // t2 = 3 * stride
    // filter constants read by QPEL8_H_LSX and QPEL8_V_LSX
    vldi vr20, 0x414
    vldi vr21, 0x405
    vldi vr22, 0x410
    addi.d t0, a1, -2 // t0 = src - 2
.ifc \in0, 33
    add.d t0, t0, a2 // t0 = src + stride - 2
.endif
    sub.d t4, a1, t1 // t4 = src - 2 * stride
    addi.d t4, t4, 1 // t4 = src - 2 * stride + 1

    // ---- rows 0-7: horizontal pass into vr23-vr30 ----
    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
    alsl.d a1, a2, t0, 2 // a1 = t0 + 4 * stride
    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
    addi.d a1, t0, 8 // a1 = t0 + 8
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
                                  vr14, vr15, a1
    alsl.d a1, a2, a1, 2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
                                  vr18, vr19, a1

    // ---- rows 0-7: vertical pass, average, store ----
    vld vr0, t4, 0 // t4 = src - 2 * stride + 1
    vldx vr1, t4, a2
    vldx vr2, t4, t1
    vldx vr3, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr4, t4, 0
    vldx vr5, t4, a2
    vldx vr6, t4, t1
    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx vr0, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr1, t4, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    add.d t6, t4, zero // t6 = src + 6 * stride + 1, window start for rows 8-15
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
    vldx vr2, t4, a2
    vldx vr3, t4, t1
    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx vr4, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr5, t4, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    // ---- rows 8-15: horizontal pass into vr23-vr30 ----
    alsl.d a1, a2, t0, 3 // a1 = t0 + 8 * stride
    addi.d t5, a1, 8 // t5 = a1 + 8
    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
    alsl.d a1, a2, a1, 2
    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
                                  vr14, vr15, t5
    alsl.d t5, a2, t5, 2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
                                  vr18, vr19, t5
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride

    // ---- rows 8-15: vertical pass, average, store ----
    // t6 = src + 6 * stride + 1
    vld vr0, t6, 0
    vldx vr1, t6, a2
    vldx vr2, t6, t1
    vldx vr3, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr4, t6, 0
    vldx vr5, t6, a2
    vldx vr6, t6, t1
    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx vr0, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr1, t6, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
    vldx vr2, t6, a2
    vldx vr3, t6, t1
    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx vr4, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr5, t6, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc
.endm
put_h264_qpel16_mc 33
put_h264_qpel16_mc 31
// ff_put_h264_qpel16_mc13_lsx
// Writes a 16x16 block that is the rounding average of
//   - the horizontal 6-tap filter applied to the rows starting at
//     src + stride, and
//   - the vertical 6-tap filter applied to the columns starting at src.
//   a0 = dst, a1 = src, a2 = stride
// f24-f31 are spilled to the stack around the body because vr24-vr30 hold
// the horizontal results. The block is processed as two halves of 8 rows.
function ff_put_h264_qpel16_mc13_lsx
    slli.d t1, a2, 1 // t1 = 2 * stride
    add.d t2, t1, a2 // t2 = 3 * stride
    // filter constants read by QPEL8_H_LSX and QPEL8_V_LSX
    vldi vr20, 0x414
    vldi vr21, 0x405
    vldi vr22, 0x410
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
    addi.d t0, a1, -2 // t0 = src - 2
    add.d t0, t0, a2 // t0 = src + stride - 2
    sub.d t4, a1, t1 // t4 = src - 2 * stride

    // ---- rows 0-7: horizontal pass into vr23-vr30 ----
    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
    alsl.d a1, a2, t0, 2 // a1 = t0 + 4 * stride
    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
    addi.d a1, t0, 8 // a1 = t0 + 8
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
                                  vr14, vr15, a1
    alsl.d a1, a2, a1, 2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
                                  vr18, vr19, a1

    // ---- rows 0-7: vertical pass, average, store ----
    vld vr0, t4, 0 // t4 = src - 2 * stride
    vldx vr1, t4, a2
    vldx vr2, t4, t1
    vldx vr3, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr4, t4, 0
    vldx vr5, t4, a2
    vldx vr6, t4, t1
    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx vr0, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr1, t4, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    add.d t6, t4, zero // t6 = src + 6 * stride, window start for rows 8-15
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
    vldx vr2, t4, a2
    vldx vr3, t4, t1
    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx vr4, t4, t2
    alsl.d t4, a2, t4, 2
    vld vr5, t4, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    // ---- rows 8-15: horizontal pass into vr23-vr30 ----
    alsl.d a1, a2, t0, 3 // a1 = t0 + 8 * stride
    addi.d t5, a1, 8 // t5 = a1 + 8
    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
    alsl.d a1, a2, a1, 2
    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
                                  vr14, vr15, t5
    alsl.d t5, a2, t5, 2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
                                  vr18, vr19, t5
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride

    // ---- rows 8-15: vertical pass, average, store ----
    vld vr0, t6, 0 // t6 = src + 6 * stride
    vldx vr1, t6, a2
    vldx vr2, t6, t1
    vldx vr3, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr4, t6, 0
    vldx vr5, t6, a2
    vldx vr6, t6, t1
    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx vr0, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr1, t6, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
    vldx vr2, t6, a2
    vldx vr3, t6, t1
    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx vr4, t6, t2
    alsl.d t6, a2, t6, 2
    vld vr5, t6, 0
    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
endfunc
// ff_avg_h264_qpel16_mc10_lsx
// 16x16 block: the horizontal 6-tap filter result is averaged (rounding)
// with the source pixels at src, and that value is averaged again with the
// pixels already in dst before being stored.
//   a0 = dst (write cursor), a1 = src, a2 = stride
//   t0 = dst read cursor, t4 = left-half filter input, t5 = right-half input
function ff_avg_h264_qpel16_mc10_lsx
addi.d t0, a0, 0 // t0 = dst
addi.d t4, a1, -2 // t4 = src - 2
addi.d t5, t4, 8
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, a2, t1 // t2 = 3 * stride
// filter constants read by QPEL8_H_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// each pass produces eight output rows
.rept 2
// 16-bit sums of the left 8 columns: rows 0-3 in vr12-vr15, 4-7 in vr16-vr19
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
alsl.d t4, a2, t4, 2
VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
// rows 0-1
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
vld vr0, a1, 0
vldx vr1, a1, a2
vld vr12, t0, 0
vldx vr13, t0, a2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vst vr0, a0, 0
vstx vr1, a0, a2
// rows 2-3
VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
vldx vr0, a1, t1
vldx vr1, a1, t2
vldx vr12, t0, t1
vldx vr13, t0, t2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vstx vr0, a0, t1
vstx vr1, a0, t2
alsl.d t5, a2, t5, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
alsl.d a0, a2, a0, 2
// rows 4-5
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
vld vr0, a1, 0
vldx vr1, a1, a2
vld vr12, t0, 0
vldx vr13, t0, a2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vst vr0, a0, 0
vstx vr1, a0, a2
// rows 6-7
VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
vldx vr0, a1, t1
vldx vr1, a1, t2
vldx vr12, t0, t1
vldx vr13, t0, t2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vstx vr0, a0, t1
vstx vr1, a0, t2
alsl.d t5, a2, t5, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
alsl.d a0, a2, a0, 2
alsl.d t4, a2, t4, 2 // src + 8 * stride -2
.endr
endfunc
// ff_avg_h264_qpel16_mc30_lsx
// 16x16 block: the horizontal 6-tap filter result is averaged (rounding)
// with the source pixels at src + 1, and that value is averaged again with
// the pixels already in dst before being stored.
//   a0 = dst (write cursor), a1 = src, a2 = stride
//   t0 = dst read cursor, t4 = left-half filter input, t5 = right-half input
function ff_avg_h264_qpel16_mc30_lsx
addi.d t0, a0, 0 // t0 = dst
addi.d t4, a1, -2 // t4 = src - 2
addi.d t5, t4, 8
addi.d a1, a1, 1 // a1 = a1 + 1
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, a2, t1 // t2 = 3 * stride
// filter constants read by QPEL8_H_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// each pass produces eight output rows
.rept 2
// 16-bit sums of the left 8 columns: rows 0-3 in vr12-vr15, 4-7 in vr16-vr19
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
alsl.d t4, a2, t4, 2
VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
// rows 0-1
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
vld vr0, a1, 0
vldx vr1, a1, a2
vld vr12, t0, 0
vldx vr13, t0, a2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vst vr0, a0, 0
vstx vr1, a0, a2
// rows 2-3
VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
vldx vr0, a1, t1
vldx vr1, a1, t2
vldx vr12, t0, t1
vldx vr13, t0, t2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vstx vr0, a0, t1
vstx vr1, a0, t2
alsl.d t5, a2, t5, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
alsl.d a0, a2, a0, 2
// rows 4-5
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
vld vr0, a1, 0
vldx vr1, a1, a2
vld vr12, t0, 0
vldx vr13, t0, a2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vst vr0, a0, 0
vstx vr1, a0, a2
// rows 6-7
VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
vldx vr0, a1, t1
vldx vr1, a1, t2
vldx vr12, t0, t1
vldx vr13, t0, t2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vavgr.bu vr0, vr0, vr12
vavgr.bu vr1, vr1, vr13
vstx vr0, a0, t1
vstx vr1, a0, t2
alsl.d t5, a2, t5, 2
alsl.d a1, a2, a1, 2
alsl.d t0, a2, t0, 2
alsl.d a0, a2, a0, 2
alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride -2
.endr
endfunc
// ff_put_h264_qpel16_mc02_lsx
// Writes a 16x16 block containing the vertical 6-tap filter result from
// QPEL8_V_LSX (rounded, >> 5, saturated to bytes); no averaging.
//   a0 = dst, a1 = src, a2 = stride
// vr0-vr6 form a rotating window of seven source rows; every step loads two
// new rows into the registers whose rows are no longer needed.
function ff_put_h264_qpel16_mc02_lsx
slli.d t0, a2, 1 // t0 = 2 * stride
add.d t1, t0, a2 // t1 = 3 * stride
sub.d t2, a1, t0 // t2 = src - 2 * stride
// filter constants read by QPEL8_V_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// source rows -2 .. 4
vld vr0, t2, 0
vldx vr1, t2, a2
vldx vr2, t2, t0
vldx vr3, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr4, t2, 0
vldx vr5, t2, a2
vldx vr6, t2, t0
// output rows 0-1
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 2-3
vldx vr0, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 *stride
vld vr1, t2, 0
QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 4-5
vldx vr2, t2, a2
vldx vr3, t2, t0
QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 6-7
vldx vr4, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr5, t2, 0
QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 8-9
vldx vr6, t2, a2
vldx vr0, t2, t0
QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 10-11
vldx vr1, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr2, t2, 0
QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
vstx vr13, a0, t0
vstx vr14, a0, t1
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
// output rows 12-13
vldx vr3, t2, a2
vldx vr4, t2, t0
QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
vst vr13, a0, 0
vstx vr14, a0, a2
// output rows 14-15
vldx vr5, t2, t1
alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
vld vr6, t2, 0
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
vstx vr13, a0, t0
vstx vr14, a0, t1
endfunc
// avc_luma_hv_qrt_and_aver_dst_16x16_lsx
// Shared body of the ff_avg_h264_qpel16_mc11/13/31/33_lsx functions.
// For a 16x16 block it averages the horizontal 6-tap filter result with the
// vertical 6-tap filter result, averages that with the pixels already in
// dst, and stores the outcome.
// Registers the including function must set up beforehand:
//   a0 = dst (write cursor)       a2 = stride
//   t0 = first row of the horizontal filter input (src - 2, optionally
//        + stride)
//   t1 = 2 * stride               t2 = 3 * stride
//   t4 = first row of the vertical filter input (src - 2 * stride,
//        optionally + 1)
//   t8 = dst (read cursor)
// Uses a1, t5 and t6 as scratch pointers and spills f24-f31 around the body.
.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
// filter constants read by QPEL8_H_LSX and QPEL8_V_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// ---- rows 0-7: horizontal pass, full rows end up in vr23-vr30 ----
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
alsl.d a1, a2, t0, 2
VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
addi.d a1, t0, 8
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
vr14, vr15, a1
alsl.d a1, a2, a1, 2
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
vr18, vr19, a1
// ---- rows 0-7: vertical pass over a rotating seven-row window ----
vld vr0, t4, 0 // t4 = vertical input pointer supplied by the caller
vldx vr1, t4, a2
vldx vr2, t4, t1
vldx vr3, t4, t2
alsl.d t4, a2, t4, 2
vld vr4, t4, 0
vldx vr5, t4, a2
vldx vr6, t4, t1
// rows 0-1 (vr0/vr1 are free again after the filter and reused for dst)
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
vld vr0, t8, 0
vldx vr1, t8, a2
vavgr.bu vr13, vr23, vr13
vavgr.bu vr14, vr24, vr14
vavgr.bu vr13, vr13, vr0
vavgr.bu vr14, vr14, vr1
vst vr13, a0, 0
vstx vr14, a0, a2
// rows 2-3
vldx vr0, t4, t2
alsl.d t4, a2, t4, 2
vld vr1, t4, 0
QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
vldx vr2, t8, t1
vldx vr3, t8, t2
vavgr.bu vr13, vr25, vr13
vavgr.bu vr14, vr26, vr14
vavgr.bu vr13, vr13, vr2
vavgr.bu vr14, vr14, vr3
add.d t6, t4, zero // t6 = initial t4 + 8 * stride, window start for rows 8-15
vstx vr13, a0, t1
vstx vr14, a0, t2
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
alsl.d t8, a2, t8, 2
// rows 4-5
vldx vr2, t4, a2
vldx vr3, t4, t1
QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
vld vr4, t8, 0
vldx vr5, t8, a2
vavgr.bu vr13, vr27, vr13
vavgr.bu vr14, vr28, vr14
vavgr.bu vr13, vr13, vr4
vavgr.bu vr14, vr14, vr5
vst vr13, a0, 0
vstx vr14, a0, a2
// rows 6-7
vldx vr4, t4, t2
alsl.d t4, a2, t4, 2
vld vr5, t4, 0
QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
vldx vr6, t8, t1
vldx vr0, t8, t2
vavgr.bu vr13, vr29, vr13
vavgr.bu vr14, vr30, vr14
vavgr.bu vr13, vr13, vr6
vavgr.bu vr14, vr14, vr0
vstx vr13, a0, t1
vstx vr14, a0, t2
// ---- rows 8-15: horizontal pass, full rows end up in vr23-vr30 ----
alsl.d a1, a2, t0, 3 // a1 = t0 + 8 * stride
addi.d t5, a1, 8 // t5 = a1 + 8
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
alsl.d a1, a2, a1, 2
VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
vr14, vr15, t5
alsl.d t5, a2, t5, 2
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
vr18, vr19, t5
alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
alsl.d t8, a2, t8, 2
// ---- rows 8-15: vertical pass, window restarts at t6 ----
vld vr0, t6, 0
vldx vr1, t6, a2
vldx vr2, t6, t1
vldx vr3, t6, t2
alsl.d t6, a2, t6, 2
vld vr4, t6, 0
vldx vr5, t6, a2
vldx vr6, t6, t1
// rows 8-9
QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
vld vr0, t8, 0
vldx vr1, t8, a2
vavgr.bu vr13, vr23, vr13
vavgr.bu vr14, vr24, vr14
vavgr.bu vr13, vr13, vr0
vavgr.bu vr14, vr14, vr1
vst vr13, a0, 0
vstx vr14, a0, a2
// rows 10-11
vldx vr0, t6, t2
alsl.d t6, a2, t6, 2
vld vr1, t6, 0
QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
vldx vr2, t8, t1
vldx vr3, t8, t2
vavgr.bu vr13, vr25, vr13
vavgr.bu vr14, vr26, vr14
vavgr.bu vr13, vr13, vr2
vavgr.bu vr14, vr14, vr3
vstx vr13, a0, t1
vstx vr14, a0, t2
alsl.d a0, a2, a0, 2 // dst = dst + 4 *stride
alsl.d t8, a2, t8, 2
// rows 12-13
vldx vr2, t6, a2
vldx vr3, t6, t1
QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
vld vr4, t8, 0
vldx vr5, t8, a2
vavgr.bu vr13, vr27, vr13
vavgr.bu vr14, vr28, vr14
vavgr.bu vr13, vr13, vr4
vavgr.bu vr14, vr14, vr5
vst vr13, a0, 0
vstx vr14, a0, a2
// rows 14-15
vldx vr4, t6, t2
alsl.d t6, a2, t6, 2
vld vr5, t6, 0
QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
vldx vr6, t8, t1
vldx vr0, t8, t2
vavgr.bu vr13, vr29, vr13
vavgr.bu vr14, vr30, vr14
vavgr.bu vr13, vr13, vr6
vavgr.bu vr14, vr14, vr0
vstx vr13, a0, t1
vstx vr14, a0, t2
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
.endm
// ff_avg_h264_qpel16_mc33_lsx
// Sets up the pointers for avc_luma_hv_qrt_and_aver_dst_16x16_lsx:
// horizontal filter on the rows starting at src + stride, vertical filter
// on the columns starting at src + 1, result averaged into dst.
//   a0 = dst, a1 = src, a2 = stride
function ff_avg_h264_qpel16_mc33_lsx
    slli.d        t1,     a2,     1          // t1 = 2 * stride
    add.d         t2,     t1,     a2         // t2 = 3 * stride
    addi.d        t0,     a1,     -2         // t0 = src - 2
    add.d         t0,     t0,     a2         // t0 = src + stride - 2
    sub.d         t4,     a1,     t1         // t4 = src - 2 * stride
    addi.d        t4,     t4,     1          // t4 = src - 2 * stride + 1
    addi.d        t8,     a0,     0          // t8 = dst (read cursor)
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
// ff_avg_h264_qpel16_mc11_lsx
// Sets up the pointers for avc_luma_hv_qrt_and_aver_dst_16x16_lsx:
// horizontal filter on the rows starting at src, vertical filter on the
// columns starting at src, result averaged into dst.
//   a0 = dst, a1 = src, a2 = stride
function ff_avg_h264_qpel16_mc11_lsx
    slli.d        t1,     a2,     1          // t1 = 2 * stride
    add.d         t2,     t1,     a2         // t2 = 3 * stride
    addi.d        t0,     a1,     -2         // t0 = src - 2
    sub.d         t4,     a1,     t1         // t4 = src - 2 * stride
    addi.d        t8,     a0,     0          // t8 = dst (read cursor)
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
// ff_avg_h264_qpel16_mc31_lsx
// Sets up the pointers for avc_luma_hv_qrt_and_aver_dst_16x16_lsx:
// horizontal filter on the rows starting at src, vertical filter on the
// columns starting at src + 1, result averaged into dst.
//   a0 = dst, a1 = src, a2 = stride
function ff_avg_h264_qpel16_mc31_lsx
    slli.d        t1,     a2,     1          // t1 = 2 * stride
    add.d         t2,     t1,     a2         // t2 = 3 * stride
    addi.d        t0,     a1,     -2         // t0 = src - 2
    sub.d         t4,     a1,     t1         // t4 = src - 2 * stride
    addi.d        t4,     t4,     1          // t4 = src - 2 * stride + 1
    addi.d        t8,     a0,     0          // t8 = dst (read cursor)
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
// ff_avg_h264_qpel16_mc13_lsx
// Sets up the pointers for avc_luma_hv_qrt_and_aver_dst_16x16_lsx:
// horizontal filter on the rows starting at src + stride, vertical filter
// on the columns starting at src, result averaged into dst.
//   a0 = dst, a1 = src, a2 = stride
function ff_avg_h264_qpel16_mc13_lsx
    slli.d        t1,     a2,     1          // t1 = 2 * stride
    add.d         t2,     t1,     a2         // t2 = 3 * stride
    addi.d        t0,     a1,     -2         // t0 = src - 2
    add.d         t0,     t0,     a2         // t0 = src + stride - 2
    sub.d         t4,     a1,     t1         // t4 = src - 2 * stride
    addi.d        t8,     a0,     0          // t8 = dst (read cursor)
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
// ff_avg_h264_qpel16_mc20_lsx
// 16x16 block: the horizontal 6-tap filter result (rounded, >> 5, saturated)
// is averaged with the pixels already in dst and stored.
//   a0 = dst (write cursor), a1 = src, a2 = stride
//   t0 = left-half filter input, a1 = right-half input, t5 = dst read cursor
function ff_avg_h264_qpel16_mc20_lsx
slli.d t1, a2, 1 // t1 = 2 * stride
add.d t2, t1, a2 // t2 = 3 * stride
// filter constants read by QPEL8_H_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
addi.d t0, a1, -2 // t0 = src - 2
addi.d t5, a0, 0
addi.d a1, t0, 8
// each pass produces four output rows
.rept 4
// 16-bit sums of the left 8 columns for rows 0-3
VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
// rows 0-1
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
vld vr0, t5, 0
vldx vr1, t5, a2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vst vr0, a0, 0
vstx vr1, a0, a2
// rows 2-3: step a1 down two rows so the non-indexed load macro can be used
add.d a1, a1, t1
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, a1
vldx vr0, t5, t1
vldx vr1, t5, t2
vavgr.bu vr0, vr0, vr2
vavgr.bu vr1, vr1, vr3
vstx vr0, a0, t1
vstx vr1, a0, t2
alsl.d t0, a2, t0, 2
alsl.d t5, a2, t5, 2
alsl.d a0, a2, a0, 2
// a1 already moved 2 rows above; 2 more make 4 per pass
alsl.d a1, a2, a1, 1
.endr
endfunc
// QPEL8_HV_H_LSX out0, out1
// Horizontal 6-tap stage of the two-dimensional (hv) filter. Same arithmetic
// as QPEL8_H_LSX but without the final bias add: for each of 8 columns x of
// the rows held in vr0 and vr1 it produces, in 16-bit lanes,
//   vr20 * (p[x+2] + p[x+3]) - vr21 * (p[x+1] + p[x+4]) + (p[x] + p[x+5])
// in out0 (row from vr0) and out1 (row from vr1).
// Overwrites vr2-vr11; vr0 and vr1 are only read.
.macro QPEL8_HV_H_LSX out0, out1
// byte-shifted copies of both rows: p[x+1] .. p[x+5]
vbsrl.v vr2, vr0, 1
vbsrl.v vr3, vr1, 1
vbsrl.v vr4, vr0, 2
vbsrl.v vr5, vr1, 2
vbsrl.v vr6, vr0, 3
vbsrl.v vr7, vr1, 3
vbsrl.v vr8, vr0, 4
vbsrl.v vr9, vr1, 4
vbsrl.v vr10, vr0, 5
vbsrl.v vr11, vr1, 5
// pair up the taps that share a coefficient
vilvl.b vr6, vr4, vr6
vilvl.b vr7, vr5, vr7
vilvl.b vr8, vr2, vr8
vilvl.b vr9, vr3, vr9
vilvl.b vr10, vr0, vr10
vilvl.b vr11, vr1, vr11
// add each byte pair into an unsigned 16-bit lane
vhaddw.hu.bu vr6, vr6, vr6
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vmul.h vr2, vr6, vr20
vmul.h vr3, vr7, vr20
vmul.h vr4, vr8, vr21
vmul.h vr5, vr9, vr21
vssub.h vr2, vr2, vr4
vssub.h vr3, vr3, vr5
vsadd.h \out0, vr2, vr10
vsadd.h \out1, vr3, vr11
.endm
// QPEL8_HV_V_LSX in0, ..., in6, out0, out1, out2, out3
// Vertical 6-tap stage of the two-dimensional (hv) filter, working on seven
// rows of 16-bit intermediates in0..in6 (8 columns each) in 32-bit lanes:
//   row A = vr22 * (in2 + in3) - vr23 * (in1 + in4) + (in0 + in5) + vr24
//   row B = vr22 * (in3 + in4) - vr23 * (in2 + in5) + (in1 + in6) + vr24
// Both rows are shifted right by 10 and saturated down to unsigned bytes.
// Final result: out3 holds row A in its low 64 bits and row B in its high
// 64 bits. out0-out2 are scratch. Overwrites vr0-vr5 as well.
.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
// ---- columns 0-3 ----
vilvl.h vr0, \in2, \in3
vilvl.h vr1, \in3, \in4 // tmp0
vilvl.h vr2, \in1, \in4
vilvl.h vr3, \in2, \in5 // tmp2
vilvl.h vr4, \in0, \in5
vilvl.h vr5, \in1, \in6 // tmp4
// add each interleaved halfword pair into a signed 32-bit lane
vhaddw.w.h vr0, vr0, vr0
vhaddw.w.h vr1, vr1, vr1
vhaddw.w.h vr2, vr2, vr2
vhaddw.w.h vr3, vr3, vr3
vhaddw.w.h vr4, vr4, vr4
vhaddw.w.h vr5, vr5, vr5
vmul.w vr0, vr0, vr22
vmul.w vr1, vr1, vr22
vmul.w vr2, vr2, vr23
vmul.w vr3, vr3, vr23
vssub.w vr0, vr0, vr2
vssub.w vr1, vr1, vr3
vsadd.w vr0, vr0, vr4
vsadd.w vr1, vr1, vr5
vsadd.w \out0, vr0, vr24
vsadd.w \out1, vr1, vr24
// ---- columns 4-7 ----
vilvh.h vr0, \in2, \in3
vilvh.h vr1, \in3, \in4 // tmp0
vilvh.h vr2, \in1, \in4
vilvh.h vr3, \in2, \in5 // tmp2
vilvh.h vr4, \in0, \in5
vilvh.h vr5, \in1, \in6 // tmp4
vhaddw.w.h vr0, vr0, vr0
vhaddw.w.h vr1, vr1, vr1
vhaddw.w.h vr2, vr2, vr2
vhaddw.w.h vr3, vr3, vr3
vhaddw.w.h vr4, vr4, vr4
vhaddw.w.h vr5, vr5, vr5
vmul.w vr0, vr0, vr22
vmul.w vr1, vr1, vr22
vmul.w vr2, vr2, vr23
vmul.w vr3, vr3, vr23
vssub.w vr0, vr0, vr2
vssub.w vr1, vr1, vr3
vsadd.w vr0, vr0, vr4
vsadd.w vr1, vr1, vr5
vsadd.w \out2, vr0, vr24
vsadd.w \out3, vr1, vr24
// >> 10 with saturation to unsigned 16 bit; each register now holds one row
vssrani.hu.w \out2, \out0, 10
vssrani.hu.w \out3, \out1, 10
// saturate to bytes and pack both rows into out3
vssrani.bu.h \out3, \out2, 0
.endm
// h264_qpel8_hv_lowpass_core_lsx in0, in1, type
// Two-dimensional 6-tap filter for an 8x8 block: every source row is first
// filtered horizontally (QPEL8_HV_H_LSX), then seven such rows at a time are
// filtered vertically (QPEL8_HV_V_LSX), giving two output rows per step.
//   in0:  general register, source cursor; it is advanced inside the macro.
//         13 rows are read starting at its initial value.
//   in1:  general register, destination cursor; advanced by a2 per row.
//   type: when it is the literal avg, each pair of output rows is averaged
//         with the existing destination rows read through t3; any other
//         value stores the filter output directly.
// Register contract: a2 = destination stride, a3 = source stride,
// t1 / t2 = offsets of source rows 2 / 3 (2 * a3 and 3 * a3 in the visible
// caller). For avg, t3 is the destination read cursor and t5 / t6 are the
// offsets of destination rows 2 / 3 -- presumably 2 * a2 and 3 * a2; the avg
// caller is not part of this section, TODO confirm.
// vr12-vr19 hold the rotating window of horizontally filtered rows.
.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type
vld vr0, \in0, 0
vldx vr1, \in0, a3
QPEL8_HV_H_LSX vr12, vr13 // rows a, b
vldx vr0, \in0, t1
vldx vr1, \in0, t2
QPEL8_HV_H_LSX vr14, vr15 // rows c, d
alsl.d \in0, a3, \in0, 2
vld vr0, \in0, 0
vldx vr1, \in0, a3
QPEL8_HV_H_LSX vr16, vr17 // rows e, f
vldx vr0, \in0, t1
vldx vr1, \in0, t2
QPEL8_HV_H_LSX vr18, vr19 // rows g, h
// output rows 0-1 (packed in vr1: row 0 low half, row 1 high half)
QPEL8_HV_V_LSX vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
.ifc \type, avg
fld.d f2, t3, 0
fldx.d f3, t3, a2
vilvl.d vr2, vr3, vr2
vavgr.bu vr1, vr2, vr1
.endif
vstelm.d vr1, \in1, 0, 0
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 1
alsl.d \in0, a3, \in0, 2
// tmp8
// output rows 2-3
vld vr0, \in0, 0
vldx vr1, \in0, a3
QPEL8_HV_H_LSX vr12, vr13
QPEL8_HV_V_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
.ifc \type, avg
fldx.d f2, t3, t5
fldx.d f3, t3, t6
vilvl.d vr2, vr3, vr2
vavgr.bu vr1, vr2, vr1
.endif
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 0
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 1
// tmp10
// output rows 4-5
vldx vr0, \in0, t1
vldx vr1, \in0, t2
QPEL8_HV_H_LSX vr14, vr15
QPEL8_HV_V_LSX vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
.ifc \type, avg
alsl.d t3, a2, t3, 2
fld.d f2, t3, 0
fldx.d f3, t3, a2
vilvl.d vr2, vr3, vr2
vavgr.bu vr1, vr2, vr1
.endif
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 0
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 1
// tmp12
// output rows 6-7
alsl.d \in0, a3, \in0, 2
vld vr0, \in0, 0
vldx vr1, \in0, a3
QPEL8_HV_H_LSX vr16, vr17
QPEL8_HV_V_LSX vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
.ifc \type, avg
fldx.d f2, t3, t5
fldx.d f3, t3, t6
vilvl.d vr2, vr3, vr2
vavgr.bu vr1, vr2, vr1
.endif
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 0
add.d \in1, \in1, a2
vstelm.d vr1, \in1, 0, 1
.endm
// put_h264_qpel8_hv_lowpass_lsx
// 8x8 two-dimensional (horizontal then vertical) 6-tap low-pass, stored
// straight to dst.
//   a0 = dst, a1 = src, a2 = dst stride, a3 = src stride
// f24 is spilled because vr24 carries the rounding constant. The frame is
// 16 bytes (only 8 are used) so that sp stays 16-byte aligned for the whole
// function, matching the stack alignment of the LP64 calling convention.
function put_h264_qpel8_hv_lowpass_lsx
    slli.d        t1,     a3,     1          // t1 = 2 * src stride
    add.d         t2,     t1,     a3         // t2 = 3 * src stride
    addi.d        sp,     sp,     -16
    fst.d         f24,    sp,     0
    addi.d        t0,     a1,     -2         // t0 = src - 2
    sub.d         t0,     t0,     t1         // t0 = t0 - 2 * stride
    vldi          vr20,   0x414              // h_20
    vldi          vr21,   0x405              // h_5
    vldi          vr22,   0x814              // w_20
    vldi          vr23,   0x805              // w_5
    addi.d        t4,     zero,   512
    vreplgr2vr.w  vr24,   t4                 // w_512
    h264_qpel8_hv_lowpass_core_lsx t0, a0, put
    fld.d         f24,    sp,     0
    addi.d        sp,     sp,     16
endfunc
// put_h264_qpel8_h_lowpass_lsx
// 8x8 horizontal 6-tap low-pass (rounded, >> 5, saturated to bytes), stored
// straight to dst, 8 bytes per row.
//   a0 = dst, a1 = src, a2 = dst stride, a3 = src stride
function put_h264_qpel8_h_lowpass_lsx
    slli.d        t1,     a3,     1          // t1 = 2 * src stride
    add.d         t2,     t1,     a3         // t2 = 3 * src stride
    // filter constants read by QPEL8_H_LSX
    vldi          vr20,   0x414
    vldi          vr21,   0x405
    vldi          vr22,   0x410
    addi.d        t0,     a1,     -2         // t0 = src - 2
    // each pass produces four output rows
.rept 2
    // rows 0-1
    vld           vr0,    t0,     0
    vldx          vr1,    t0,     a3
    QPEL8_H_LSX   vr12,   vr13
    vssrani.bu.h  vr13,   vr12,   5          // row 0 -> low 64 bits, row 1 -> high
    vstelm.d      vr13,   a0,     0,     0
    add.d         a0,     a0,     a2
    vstelm.d      vr13,   a0,     0,     1
    add.d         a0,     a0,     a2
    // rows 2-3
    vldx          vr0,    t0,     t1
    vldx          vr1,    t0,     t2
    QPEL8_H_LSX   vr12,   vr13
    vssrani.bu.h  vr13,   vr12,   5
    vstelm.d      vr13,   a0,     0,     0
    add.d         a0,     a0,     a2
    vstelm.d      vr13,   a0,     0,     1
    add.d         a0,     a0,     a2
    alsl.d        t0,     a3,     t0,    2   // t0 += 4 * src stride
.endr
endfunc
/*
 * put_pixels16_l2_8_lsx: rounding average of two 16-pixel-wide sources,
 * 16 rows, written to dst.
 *   a0 = dst            (row pitch a3)
 *   a1 = first source   (row pitch a4)
 *   a2 = second source  (rows stored back to back, 16 bytes apart)
 */
function put_pixels16_l2_8_lsx
    slli.d        t0,     a4,     1          // t0 = 2 * a4
    add.d         t1,     a4,     t0         // t1 = 3 * a4
    slli.d        t2,     a4,     2          // t2 = 4 * a4
    slli.d        t3,     a3,     1          // t3 = 2 * a3
    add.d         t4,     a3,     t3         // t4 = 3 * a3
    slli.d        t5,     a3,     2          // t5 = 4 * a3
.rept 4
    vld           vr4,    a1,     0          // four rows of the first source
    vldx          vr5,    a1,     a4
    vldx          vr6,    a1,     t0
    vldx          vr7,    a1,     t1
    add.d         a1,     t2,     a1
    vld           vr12,   a2,     0x00       // four rows of the second source
    vld           vr13,   a2,     0x10
    vld           vr14,   a2,     0x20
    vld           vr15,   a2,     0x30
    addi.d        a2,     a2,     0x40
    vavgr.bu      vr4,    vr12,   vr4
    vavgr.bu      vr5,    vr13,   vr5
    vavgr.bu      vr6,    vr14,   vr6
    vavgr.bu      vr7,    vr15,   vr7
    vst           vr4,    a0,     0
    vstx          vr5,    a0,     a3
    vstx          vr6,    a0,     t3
    vstx          vr7,    a0,     t4
    add.d         a0,     t5,     a0
.endr
endfunc
// QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
// 8-pixel-wide vertical 6-tap low-pass on seven consecutive rows in0..in6
// (only the low 8 bytes of each register are used). Two output rows are
// produced, rounded with vr22, shifted right by 5 and saturated to bytes:
//   row A = vr20 * (in2 + in3) - vr21 * (in1 + in4) + (in0 + in5)
//   row B = vr20 * (in3 + in4) - vr21 * (in2 + in5) + (in1 + in6)
// Result: vr8 holds row A in its low 64 bits and row B in its high 64 bits.
// Overwrites vr7-vr12; the input registers are only read.
.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
vilvl.b vr7, \in3, \in2
vilvl.b vr8, \in4, \in3
vilvl.b vr9, \in4, \in1
vilvl.b vr10, \in5, \in2
vilvl.b vr11, \in5, \in0
vilvl.b vr12, \in6, \in1
// add each interleaved byte pair into an unsigned 16-bit lane
vhaddw.hu.bu vr7, vr7, vr7
vhaddw.hu.bu vr8, vr8, vr8
vhaddw.hu.bu vr9, vr9, vr9
vhaddw.hu.bu vr10, vr10, vr10
vhaddw.hu.bu vr11, vr11, vr11
vhaddw.hu.bu vr12, vr12, vr12
vmul.h vr7, vr7, vr20
vmul.h vr8, vr8, vr20
vmul.h vr9, vr9, vr21
vmul.h vr10, vr10, vr21
vssub.h vr7, vr7, vr9
vssub.h vr8, vr8, vr10
vsadd.h vr7, vr7, vr11
vsadd.h vr8, vr8, vr12
vsadd.h vr7, vr7, vr22
vsadd.h vr8, vr8, vr22
// >> 5 with unsigned saturation, packing both rows into vr8
vssrani.bu.h vr8, vr7, 5
.endm
// h264_qpel8_v_lowpass_lsx type
// Emits <type>_h264_qpel8_v_lowpass_lsx (instantiated below for put and avg):
// an 8x8 vertical 6-tap low-pass built on QPEL8_V1_LSX. When type is the
// literal avg, every pair of output rows is additionally averaged (rounding)
// with the rows already in dst before being stored.
//   a0 = dst, a1 = src, a2 = dst stride, a3 = src stride
// f0-f6 (the low halves of vr0-vr6) form a rotating window of seven source
// rows. In the avg variant the registers whose rows are no longer needed
// are reused as temporaries for the dst rows and then reloaded from src.
.macro h264_qpel8_v_lowpass_lsx type
function \type\()_h264_qpel8_v_lowpass_lsx
slli.d t0, a3, 1 // t0 = 2 * src stride
add.d t1, t0, a3 // t1 = 3 * src stride
sub.d t2, a1, t0 // t2 = src - 2 * stride
.ifc \type, avg
addi.d t3, a0, 0 // t3 = dst read cursor
slli.d t4, a2, 1 // t4 = 2 * dst stride
add.d t5, t4, a2 // t5 = 3 * dst stride
.endif
// filter constants read by QPEL8_V1_LSX
vldi vr20, 0x414
vldi vr21, 0x405
vldi vr22, 0x410
// source rows -2 .. 4
fld.d f0, t2, 0
fldx.d f1, t2, a3
fldx.d f2, t2, t0
fldx.d f3, t2, t1
alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
fld.d f4, t2, 0
fldx.d f5, t2, a3
fldx.d f6, t2, t0
// output rows 0-1
QPEL8_V1_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \type, avg
fld.d f0, t3, 0
fldx.d f1, t3, a2
vilvl.d vr0, vr1, vr0
vavgr.bu vr8, vr8, vr0
.endif
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vstelm.d vr8, a0, 0, 1
add.d a0, a0, a2
// output rows 2-3
fldx.d f0, t2, t1
alsl.d t2, a3, t2, 2 // t2 = t2 + 4 *stride
fld.d f1, t2, 0
QPEL8_V1_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
.ifc \type, avg
fldx.d f2, t3, t4
fldx.d f3, t3, t5
vilvl.d vr2, vr3, vr2
vavgr.bu vr8, vr8, vr2
.endif
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vstelm.d vr8, a0, 0, 1
add.d a0, a0, a2
// NOTE(review): this advance sits outside the avg-only section, so the put
// variant executes it too although t3 is neither set nor read there.
alsl.d t3, a2, t3, 2
// output rows 4-5
fldx.d f2, t2, a3
fldx.d f3, t2, t0
QPEL8_V1_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
.ifc \type, avg
fld.d f4, t3, 0
fldx.d f5, t3, a2
vilvl.d vr4, vr5, vr4
vavgr.bu vr8, vr8, vr4
.endif
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vstelm.d vr8, a0, 0, 1
add.d a0, a0, a2
// output rows 6-7
fldx.d f4, t2, t1
alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
fld.d f5, t2, 0
QPEL8_V1_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
.ifc \type, avg
fldx.d f6, t3, t4
fldx.d f0, t3, t5
vilvl.d vr6, vr0, vr6
vavgr.bu vr8, vr8, vr6
.endif
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vstelm.d vr8, a0, 0, 1
endfunc
.endm
h264_qpel8_v_lowpass_lsx put
h264_qpel8_v_lowpass_lsx avg
/*
 * avg_pixels16_l2_8_lsx
 *
 * dst = avg(dst, avg(first source, second source)) over a 16x16 byte
 * block, both averages rounding up.
 *   a0 = dst, a1 = first source, a2 = second source (rows packed 16 bytes
 *   apart), a3 = dst stride, a4 = first-source stride
 *
 * Four passes of four rows each. a0, a1 and a2 are advanced;
 * t0-t6, vr0-vr3 and vr8-vr11 are clobbered.
 */
function avg_pixels16_l2_8_lsx
    addi.d        t6,     a0,     0           // t6 = dst read cursor
    // first-source row offsets
    slli.d        t0,     a4,     1           // t0 = 2 * a4
    slli.d        t2,     a4,     2           // t2 = 4 * a4
    add.d         t1,     t0,     a4          // t1 = 3 * a4
    // dst row offsets
    slli.d        t3,     a3,     1           // t3 = 2 * a3
    slli.d        t5,     a3,     2           // t5 = 4 * a3
    add.d         t4,     t3,     a3          // t4 = 3 * a3
.rept 4
    // four packed rows of the second source
    vld           vr8,    a2,     0x00
    vld           vr9,    a2,     0x10
    vld           vr10,   a2,     0x20
    vld           vr11,   a2,     0x30
    // four strided rows of the first source
    vld           vr0,    a1,     0
    vldx          vr1,    a1,     a4
    vldx          vr2,    a1,     t0
    vldx          vr3,    a1,     t1
    // stage 1: average the two sources
    vavgr.bu      vr0,    vr0,    vr8
    vavgr.bu      vr1,    vr1,    vr9
    vavgr.bu      vr2,    vr2,    vr10
    vavgr.bu      vr3,    vr3,    vr11
    // stage 2: average with what dst already holds
    vld           vr8,    t6,     0
    vldx          vr9,    t6,     a3
    vldx          vr10,   t6,     t3
    vldx          vr11,   t6,     t4
    vavgr.bu      vr0,    vr0,    vr8
    vavgr.bu      vr1,    vr1,    vr9
    vavgr.bu      vr2,    vr2,    vr10
    vavgr.bu      vr3,    vr3,    vr11
    vst           vr0,    a0,     0
    vstx          vr1,    a0,     a3
    vstx          vr2,    a0,     t3
    vstx          vr3,    a0,     t4
    // step all cursors by four rows
    addi.d        a2,     a2,     0x40
    add.d         a1,     a1,     t2
    add.d         t6,     t6,     t5
    add.d         a0,     a0,     t5
.endr
endfunc
/*
 * avg_h264_qpel8_hv_lowpass_lsx
 *
 * Sets up registers and constants, then expands
 * h264_qpel8_hv_lowpass_core_lsx (defined elsewhere in this file) in its
 * "avg" flavour.
 *   a0 = dst, a1 = src, a3 = src stride
 *   a2 is presumably the dst stride (t5 and t6 are 2x and 3x of it);
 *   confirm against the core macro.
 *
 * f24 is spilled to the stack and restored because vr24 is overwritten
 * here (vr24 shares its low 64 bits with f24).
 * NOTE(review): sp is moved by 8 bytes only; confirm that a stack pointer
 * that is not 16-byte aligned is acceptable for the duration of this leaf
 * routine.
 */
function avg_h264_qpel8_hv_lowpass_lsx
// t1 = 2 * src stride, t2 = 3 * src stride
slli.d t1, a3, 1
add.d t2, t1, a3
// t5 = 2 * a2, t6 = 3 * a2
slli.d t5, a2, 1
add.d t6, a2, t5
// Save f24 before vr24 is used as a constant register.
addi.d sp, sp, -8
fst.d f24, sp, 0
vldi vr20, 0x414 // h_20
vldi vr21, 0x405 // h_5
vldi vr22, 0x814 // w_20
vldi vr23, 0x805 // w_5
addi.d t4, zero, 512
vreplgr2vr.w vr24, t4 // w_512
// Start two columns to the left of and two rows above src.
addi.d t0, a1, -2 // t0 = src - 2
sub.d t0, t0, t1 // t0 = t0 - 2 * stride
addi.d t3, a0, 0 // t3 = dst
h264_qpel8_hv_lowpass_core_lsx t0, a0, avg
// Restore f24 and release the stack slot.
fld.d f24, sp, 0
addi.d sp, sp, 8
endfunc
/*
 * put_pixels8_l2_8_lsx
 *
 * dst = rounded average of two 8x8 byte sources.
 *   a0 = dst, a1 = first source, a2 = second source (rows packed 8 bytes
 *   apart), a3 = dst stride, a4 = first-source stride
 *
 * Two passes of four rows each. a0, a1 and a2 are advanced;
 * t0-t2, vr0-vr3, vr8 and vr10 are clobbered.
 *
 * Rows are only 8 bytes wide, so the strided source is read with 64-bit
 * loads (no access beyond the 8 bytes of a row), and the packed source is
 * read two rows per 128-bit load, which stays inside the 32 bytes consumed
 * per pass.
 */
function put_pixels8_l2_8_lsx
    slli.d        t0,     a4,     1           // t0 = 2 * a4
    add.d         t1,     t0,     a4          // t1 = 3 * a4
    slli.d        t2,     t0,     1           // t2 = 4 * a4
.rept 2
    // four 8-byte rows of the first source
    fld.d         f0,     a1,     0
    fldx.d        f1,     a1,     a4
    fldx.d        f2,     a1,     t0
    fldx.d        f3,     a1,     t1
    add.d         a1,     a1,     t2
    vilvl.d       vr0,    vr1,    vr0         // vr0 = { row 0 | row 1 }
    vilvl.d       vr2,    vr3,    vr2         // vr2 = { row 2 | row 3 }
    // packed rows are contiguous: one load covers two of them
    vld           vr8,    a2,     0x00        // vr8  = { row 0 | row 1 }
    vld           vr10,   a2,     0x10        // vr10 = { row 2 | row 3 }
    addi.d        a2,     a2,     32
    vavgr.bu      vr0,    vr8,    vr0
    vavgr.bu      vr2,    vr10,   vr2
    vstelm.d      vr0,    a0,     0,    0
    add.d         a0,     a0,     a3
    vstelm.d      vr0,    a0,     0,    1
    add.d         a0,     a0,     a3
    vstelm.d      vr2,    a0,     0,    0
    add.d         a0,     a0,     a3
    vstelm.d      vr2,    a0,     0,    1
    add.d         a0,     a0,     a3
.endr
endfunc
/*
 * ff_put_h264_qpel8_mc00_lsx
 *
 * Plain 8x8 byte copy through general-purpose registers.
 *   a0 = dst, a1 = src, a2 = stride (shared by src and dst)
 *
 * Rows are moved in two groups of four, each group fully loaded before it
 * is stored. On exit a0 and a1 have been advanced by four rows;
 * t0-t6 are clobbered.
 */
function ff_put_h264_qpel8_mc00_lsx
    slli.d        t0,     a2,     1           // t0 = 2 * stride
    slli.d        t2,     a2,     2           // t2 = 4 * stride
    add.d         t1,     t0,     a2          // t1 = 3 * stride

    // rows 0-3
    ld.d          t3,     a1,     0x0
    ldx.d         t4,     a1,     a2
    ldx.d         t5,     a1,     t0
    ldx.d         t6,     a1,     t1
    add.d         a1,     a1,     t2
    st.d          t3,     a0,     0x0
    stx.d         t4,     a0,     a2
    stx.d         t5,     a0,     t0
    stx.d         t6,     a0,     t1
    add.d         a0,     a0,     t2

    // rows 4-7
    ld.d          t3,     a1,     0x0
    ldx.d         t4,     a1,     a2
    ldx.d         t5,     a1,     t0
    ldx.d         t6,     a1,     t1
    st.d          t3,     a0,     0x0
    stx.d         t4,     a0,     a2
    stx.d         t5,     a0,     t0
    stx.d         t6,     a0,     t1
endfunc
/*
 * ff_avg_h264_qpel8_mc00_lsx
 *
 * dst = rounded average of dst and src over an 8x8 byte block.
 *   a0 = dst, a1 = src, a2 = stride (shared by src and dst)
 *
 * Two passes of four rows each. a0 and a1 are advanced;
 * t0-t3 and vr0-vr7 are clobbered.
 *
 * Rows are only 8 bytes wide, so both src and dst are read with 64-bit
 * loads: nothing beyond the 8 bytes of each row is touched.
 */
function ff_avg_h264_qpel8_mc00_lsx
    slli.d        t0,     a2,     1           // t0 = 2 * stride
    add.d         t1,     t0,     a2          // t1 = 3 * stride
    slli.d        t2,     t0,     1           // t2 = 4 * stride
    addi.d        t3,     a0,     0           // t3 = dst read cursor
.rept 2
    // four 8-byte rows of src
    fld.d         f0,     a1,     0
    fldx.d        f1,     a1,     a2
    fldx.d        f2,     a1,     t0
    fldx.d        f3,     a1,     t1
    add.d         a1,     a1,     t2
    vilvl.d       vr0,    vr1,    vr0         // vr0 = { row 0 | row 1 }
    vilvl.d       vr2,    vr3,    vr2         // vr2 = { row 2 | row 3 }
    // the matching four rows already in dst
    fld.d         f4,     t3,     0
    fldx.d        f5,     t3,     a2
    fldx.d        f6,     t3,     t0
    fldx.d        f7,     t3,     t1
    add.d         t3,     t3,     t2
    vilvl.d       vr4,    vr5,    vr4
    vilvl.d       vr6,    vr7,    vr6
    vavgr.bu      vr0,    vr4,    vr0
    vavgr.bu      vr2,    vr6,    vr2
    vstelm.d      vr0,    a0,     0,    0
    add.d         a0,     a0,     a2
    vstelm.d      vr0,    a0,     0,    1
    add.d         a0,     a0,     a2
    vstelm.d      vr2,    a0,     0,    0
    add.d         a0,     a0,     a2
    vstelm.d      vr2,    a0,     0,    1
    add.d         a0,     a0,     a2
.endr
endfunc
/*
 * avg_pixels8_l2_8_lsx
 *
 * dst = avg(dst, avg(first source, second source)) over an 8x8 byte
 * block, both averages rounding up.
 *   a0 = dst, a1 = first source, a2 = second source (rows packed 8 bytes
 *   apart), a3 = dst stride, a4 = first-source stride
 *
 * Two passes of four rows each. a0, a1 and a2 are advanced;
 * t0-t6, vr0-vr7, vr8 and vr10 are clobbered.
 *
 * Rows are only 8 bytes wide, so the strided buffers are read with 64-bit
 * loads (no access beyond the 8 bytes of a row), and the packed source is
 * read two rows per 128-bit load, which stays inside the 32 bytes consumed
 * per pass.
 */
function avg_pixels8_l2_8_lsx
    slli.d        t0,     a4,     1           // t0 = 2 * a4
    add.d         t1,     t0,     a4          // t1 = 3 * a4
    slli.d        t2,     t0,     1           // t2 = 4 * a4
    addi.d        t3,     a0,     0           // t3 = dst read cursor
    slli.d        t4,     a3,     1           // t4 = 2 * a3
    add.d         t5,     t4,     a3          // t5 = 3 * a3
    slli.d        t6,     t4,     1           // t6 = 4 * a3
.rept 2
    // four 8-byte rows of the first source
    fld.d         f0,     a1,     0
    fldx.d        f1,     a1,     a4
    fldx.d        f2,     a1,     t0
    fldx.d        f3,     a1,     t1
    add.d         a1,     a1,     t2
    vilvl.d       vr0,    vr1,    vr0         // vr0 = { row 0 | row 1 }
    vilvl.d       vr2,    vr3,    vr2         // vr2 = { row 2 | row 3 }
    // packed rows are contiguous: one load covers two of them
    vld           vr8,    a2,     0x00        // vr8  = { row 0 | row 1 }
    vld           vr10,   a2,     0x10        // vr10 = { row 2 | row 3 }
    addi.d        a2,     a2,     0x20
    // stage 1: average the two sources
    vavgr.bu      vr0,    vr8,    vr0
    vavgr.bu      vr2,    vr10,   vr2
    // stage 2: average with what dst already holds
    fld.d         f4,     t3,     0
    fldx.d        f5,     t3,     a3
    fldx.d        f6,     t3,     t4
    fldx.d        f7,     t3,     t5
    add.d         t3,     t3,     t6
    vilvl.d       vr4,    vr5,    vr4
    vilvl.d       vr6,    vr7,    vr6
    vavgr.bu      vr0,    vr4,    vr0
    vavgr.bu      vr2,    vr6,    vr2
    vstelm.d      vr0,    a0,     0,    0
    add.d         a0,     a0,     a3
    vstelm.d      vr0,    a0,     0,    1
    add.d         a0,     a0,     a3
    vstelm.d      vr2,    a0,     0,    0
    add.d         a0,     a0,     a3
    vstelm.d      vr2,    a0,     0,    1
    add.d         a0,     a0,     a3
.endr
endfunc
/*
 * avg_h264_qpel8_h_lowpass_lsx
 *
 * Horizontal lowpass over 8 rows of 8 pixels, averaged (rounding up) with
 * the bytes already in dst.
 *   a0 = dst, a1 = src, a2 = dst stride, a3 = src stride
 *
 * Each pass loads two source rows (starting at src - 2) into vr0/vr1,
 * runs QPEL8_H_LSX (defined elsewhere in this file; presumably the 6-tap
 * filter using the constants in vr20-vr22), narrows both halfword results
 * to bytes with >> 5 and unsigned saturation, averages them with two dst
 * rows and stores two rows.
 * a0 is advanced; t0-t6 and the vector scratch registers are clobbered.
 */
function avg_h264_qpel8_h_lowpass_lsx
    slli.d        t1,     a3,     1           // t1 = 2 * src stride
    add.d         t2,     t1,     a3          // t2 = 3 * src stride
    slli.d        t5,     a2,     1           // t5 = 2 * dst stride
    add.d         t6,     t5,     a2          // t6 = 3 * dst stride
    vldi          vr20,   0x414
    vldi          vr21,   0x405
    vldi          vr22,   0x410
    addi.d        t0,     a1,     -2          // t0 = src - 2
    add.d         t3,     a1,     zero        // t3 = src
    addi.d        t4,     a0,     0           // t4 = dst (read cursor)
.rept 4
    vld           vr0,    t0,     0
    vldx          vr1,    t0,     a3
    QPEL8_H_LSX   vr12,   vr13
    vssrani.bu.h  vr13,   vr12,   5           // vr13 = { row n | row n+1 }
    // vr0/vr1 are free again: fetch the two dst rows to blend with
    fld.d         f0,     t4,     0
    fldx.d        f1,     t4,     a2
    vilvl.d       vr0,    vr1,    vr0
    vavgr.bu      vr13,   vr13,   vr0
    vstelm.d      vr13,   a0,     0,    0
    add.d         a0,     a0,     a2
    vstelm.d      vr13,   a0,     0,    1
    add.d         a0,     a0,     a2
    add.d         t0,     t0,     t1          // src cursor += 2 * src stride
    // The dst read cursor must follow the dst stride so that it stays in
    // step with a0 even when the src and dst strides differ.
    add.d         t4,     t4,     t5          // dst cursor += 2 * dst stride
.endr
endfunc