/*
 * Loongson LSX/LASX optimized h264chroma
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

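/* Note: all functions in this file evaluate the H.264 chroma interpolation
 *     dst[i] = (A * src[i] + B * src[i + 1] +
 *               C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y, using one of
 * three paths: the full bilinear filter when D != 0, a two-tap filter with
 * step = (y ? stride : 1) when only E = B + C is non-zero, and a weighted
 * copy (A = 64, i.e. dst = src) when x == y == 0. */
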
/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_put_h264_chroma_mc8_lsx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    vreplgr2vr.b   vr0, t3
    vreplgr2vr.b   vr1, t4
    vreplgr2vr.b   vr2, t5
    vreplgr2vr.b   vr3, t6
    vreplgr2vr.b   vr4, t0
    slli.d         t2, a2, 1
    add.d          t3, t2, a2
    slli.d         t4, a2, 2

    bge            zero, t6, .ENDLOOP_D
    move           t1, a3
    vilvl.b        vr9, vr1, vr0
    vilvl.b        vr10, vr3, vr2
.LOOP_D:
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    add.d          a1, a1, a2
    vld            vr7, a1, 0
    vld            vr8, a1, 1
    vilvl.b        vr11, vr6, vr5
    vilvl.b        vr12, vr8, vr7
    vmulwev.h.bu   vr13, vr9, vr11
    vmaddwod.h.bu  vr13, vr9, vr11
    vmulwev.h.bu   vr14, vr10, vr12
    vmaddwod.h.bu  vr14, vr10, vr12
    vadd.h         vr13, vr13, vr14
    vsrarni.b.h    vr13, vr13, 6
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    vilvl.b        vr11, vr8, vr7
    vilvl.b        vr12, vr6, vr5
    vmulwev.h.bu   vr13, vr9, vr11
    vmaddwod.h.bu  vr13, vr9, vr11
    vmulwev.h.bu   vr14, vr10, vr12
    vmaddwod.h.bu  vr14, vr10, vr12
    vadd.h         vr13, vr13, vr14
    vsrarni.b.h    vr13, vr13, 6
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr7, a1, 0
    vld            vr8, a1, 1
    vilvl.b        vr11, vr6, vr5
    vilvl.b        vr12, vr8, vr7
    vmulwev.h.bu   vr13, vr9, vr11
    vmaddwod.h.bu  vr13, vr9, vr11
    vmulwev.h.bu   vr14, vr10, vr12
    vmaddwod.h.bu  vr14, vr10, vr12
    vadd.h         vr13, vr13, vr14
    vsrarni.b.h    vr13, vr13, 6
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    vilvl.b        vr11, vr8, vr7
    vilvl.b        vr12, vr6, vr5
    vmulwev.h.bu   vr13, vr9, vr11
    vmaddwod.h.bu  vr13, vr9, vr11
    vmulwev.h.bu   vr14, vr10, vr12
    vmaddwod.h.bu  vr14, vr10, vr12
    vadd.h         vr13, vr13, vr14
    vsrarni.b.h    vr13, vr13, 6
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOP_D
    b              .ENDLOOP
.ENDLOOP_D:

    bge            zero, t0, .ENDLOOP_E
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    vilvl.b        vr7, vr4, vr0
.LOOP_E:
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOP_E
    b              .ENDLOOP
.ENDLOOP_E:

    move           t1, a3
.LOOP:
    vld            vr5, a1, 0
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vsrarni.b.h    vr6, vr6, 6
    vsrarni.b.h    vr7, vr7, 6
    vilvl.b        vr6, vr7, vr6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, a2
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vsrarni.b.h    vr6, vr6, 6
    vsrarni.b.h    vr7, vr7, 6
    vilvl.b        vr6, vr7, vr6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, t2
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vsrarni.b.h    vr6, vr6, 6
    vsrarni.b.h    vr7, vr7, 6
    vilvl.b        vr6, vr7, vr6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, t3
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vsrarni.b.h    vr6, vr6, 6
    vsrarni.b.h    vr7, vr7, 6
    vilvl.b        vr6, vr7, vr6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, t4

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOP
.ENDLOOP:
endfunc

/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
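/* Note: the avg variants first round the interpolated prediction to 8 bits
 * (the "+ 32) >> 6" step, here vsrari.h by 6), widen the bytes already in
 * dst, and combine them as (dst + pred + 1) >> 1 via a rounding narrowing
 * shift by 1. */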
function ff_avg_h264_chroma_mc8_lsx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    vreplgr2vr.b   vr0, t3
    vreplgr2vr.b   vr1, t4
    vreplgr2vr.b   vr2, t5
    vreplgr2vr.b   vr3, t6
    vreplgr2vr.b   vr4, t0
    slli.d         t2, a2, 1
    add.d          t3, t2, a2
    slli.d         t4, a2, 2

    bge            zero, t6, .ENDLOOPD
    move           t1, a3
    vilvl.b        vr9, vr1, vr0
    vilvl.b        vr10, vr3, vr2
.LOOPD:
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    add.d          a1, a1, a2
    vld            vr7, a1, 0
    vld            vr8, a1, 1
    vld            vr11, a0, 0
    vilvl.b        vr12, vr6, vr5
    vilvl.b        vr13, vr8, vr7
    vmulwev.h.bu   vr14, vr9, vr12
    vmaddwod.h.bu  vr14, vr9, vr12
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    vadd.h         vr14, vr14, vr15
    vsrari.h       vr14, vr14, 6
    vsllwil.hu.bu  vr11, vr11, 0
    vadd.h         vr11, vr14, vr11
    vsrarni.b.h    vr11, vr11, 1
    vstelm.d       vr11, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    vld            vr11, a0, 0
    vilvl.b        vr12, vr8, vr7
    vilvl.b        vr13, vr6, vr5
    vmulwev.h.bu   vr14, vr9, vr12
    vmaddwod.h.bu  vr14, vr9, vr12
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    vadd.h         vr14, vr14, vr15
    vsrari.h       vr14, vr14, 6
    vsllwil.hu.bu  vr11, vr11, 0
    vadd.h         vr11, vr14, vr11
    vsrarni.b.h    vr11, vr11, 1
    vstelm.d       vr11, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr7, a1, 0
    vld            vr8, a1, 1
    vld            vr11, a0, 0
    vilvl.b        vr12, vr6, vr5
    vilvl.b        vr13, vr8, vr7
    vmulwev.h.bu   vr14, vr9, vr12
    vmaddwod.h.bu  vr14, vr9, vr12
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    vadd.h         vr14, vr14, vr15
    vsrari.h       vr14, vr14, 6
    vsllwil.hu.bu  vr11, vr11, 0
    vadd.h         vr11, vr14, vr11
    vsrarni.b.h    vr11, vr11, 1
    vstelm.d       vr11, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    vld            vr11, a0, 0
    vilvl.b        vr12, vr8, vr7
    vilvl.b        vr13, vr6, vr5
    vmulwev.h.bu   vr14, vr9, vr12
    vmaddwod.h.bu  vr14, vr9, vr12
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    vadd.h         vr14, vr14, vr15
    vsrari.h       vr14, vr14, 6
    vsllwil.hu.bu  vr11, vr11, 0
    vadd.h         vr11, vr14, vr11
    vsrarni.b.h    vr11, vr11, 1
    vstelm.d       vr11, a0, 0, 0
    add.d          a0, a0, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPD
    b              .ENDLOOPELSE
.ENDLOOPD:

    bge            zero, t0, .ENDLOOPE
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    vilvl.b        vr7, vr4, vr0
.LOOPE:
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vld            vr8, a0, 0
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vld            vr8, a0, 0
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vld            vr8, a0, 0
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vld            vr8, a0, 0
    vilvl.b        vr5, vr6, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPE
    b              .ENDLOOPELSE
.ENDLOOPE:

    move           t1, a3
.LOOPELSE:
    vld            vr5, a1, 0
    vld            vr8, a0, 0
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vilvl.h        vr6, vr7, vr6
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, a2
    vld            vr8, a0, 0
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vilvl.h        vr6, vr7, vr6
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, t2
    vld            vr8, a0, 0
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vilvl.h        vr6, vr7, vr6
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    vldx           vr5, a1, t3
    vld            vr8, a0, 0
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vilvl.h        vr6, vr7, vr6
    vsrari.h       vr6, vr6, 6
    vsllwil.hu.bu  vr8, vr8, 0
    vadd.h         vr8, vr6, vr8
    vsrarni.b.h    vr8, vr8, 1
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    add.d          a1, a1, t4

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPELSE
.ENDLOOPELSE:
endfunc

/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
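/* Note: the mc4 versions work on 4-pixel rows; two consecutive rows are
 * packed into one 128-bit vector, so every loop iteration below produces
 * two output rows and the row counter steps by 2. */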
function ff_put_h264_chroma_mc4_lsx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    slli.d         t8, a2, 1
    vreplgr2vr.b   vr0, t3
    vreplgr2vr.b   vr1, t4
    vreplgr2vr.b   vr2, t5
    vreplgr2vr.b   vr3, t6
    vreplgr2vr.b   vr4, t0

    bge            zero, t6, .ENDPUT_D
    move           t1, a3
    vilvl.b        vr9, vr1, vr0
    vilvl.b        vr10, vr3, vr2
.PUT_D:
    vld            vr5, a1, 0
    vld            vr6, a1, 1
    add.d          a1, a1, a2
    vld            vr7, a1, 0
    vld            vr8, a1, 1
    add.d          a1, a1, a2
    vld            vr11, a1, 0
    vld            vr12, a1, 1
    vilvl.b        vr5, vr6, vr5
    vilvl.b        vr7, vr8, vr7
    vilvl.b        vr13, vr12, vr11
    vilvl.d        vr5, vr7, vr5
    vilvl.d        vr13, vr13, vr7
    vmulwev.h.bu   vr14, vr9, vr5
    vmaddwod.h.bu  vr14, vr9, vr5
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    vadd.h         vr14, vr14, vr15
    vsrarni.b.h    vr14, vr14, 6
    vstelm.w       vr14, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr14, a0, 0, 1
    add.d          a0, a0, a2
    addi.d         t1, t1, -2
    blt            zero, t1, .PUT_D
    b              .ENDPUT
.ENDPUT_D:

    bge            zero, t0, .ENDPUT_E
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    vilvl.b        vr7, vr4, vr0
.PUT_E:
    vld            vr5, a1, 0
    vldx           vr6, a1, t7
    vilvl.b        vr5, vr6, vr5
    add.d          a1, a1, a2
    vld            vr8, a1, 0
    vldx           vr9, a1, t7
    vilvl.b        vr8, vr9, vr8
    vilvl.d        vr5, vr8, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.w       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr6, a0, 0, 1
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    addi.d         t1, t1, -2
    blt            zero, t1, .PUT_E
    b              .ENDPUT
.ENDPUT_E:

    move           t1, a3
.PUT:
    vld            vr5, a1, 0
    vldx           vr8, a1, a2
    vilvl.w        vr5, vr8, vr5
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vsrarni.b.h    vr6, vr6, 6
    vsrarni.b.h    vr7, vr7, 6
    vilvl.b        vr6, vr7, vr6
    vstelm.w       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr6, a0, 0, 1
    add.d          a0, a0, a2
    add.d          a1, a1, t8
    addi.d         t1, t1, -2
    blt            zero, t1, .PUT
.ENDPUT:
endfunc

/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
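/* Note: the LASX version follows the same three code paths as the LSX mc8
 * routines above, but packs rows in pairs into 256-bit registers with
 * xvpermi.q, so one pass of each main loop below filters four output rows. */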
function ff_put_h264_chroma_mc8_lasx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    xvreplgr2vr.b  xr0, t3
    xvreplgr2vr.b  xr1, t4
    xvreplgr2vr.b  xr2, t5
    xvreplgr2vr.b  xr3, t6
    xvreplgr2vr.b  xr4, t0
    slli.d         t2, a2, 1
    add.d          t3, t2, a2
    slli.d         t4, a2, 2

    bge            zero, t6, .ENDLOOP_DA
    move           t1, a3
    xvilvl.b       xr9, xr1, xr0
    xvilvl.b       xr10, xr3, xr2
.LOOP_DA:
    fld.d          f5, a1, 0
    fld.d          f6, a1, 1
    add.d          a1, a1, a2
    fld.d          f7, a1, 0
    fld.d          f8, a1, 1
    add.d          a1, a1, a2
    fld.d          f13, a1, 0
    fld.d          f14, a1, 1
    add.d          a1, a1, a2
    fld.d          f15, a1, 0
    fld.d          f16, a1, 1
    add.d          a1, a1, a2
    fld.d          f17, a1, 0
    fld.d          f18, a1, 1
    vilvl.b        vr11, vr6, vr5
    vilvl.b        vr12, vr8, vr7
    vilvl.b        vr14, vr14, vr13
    vilvl.b        vr15, vr16, vr15
    vilvl.b        vr16, vr18, vr17
    xvpermi.q      xr11, xr12, 0x02
    xvpermi.q      xr12, xr14, 0x02
    xvpermi.q      xr14, xr15, 0x02
    xvpermi.q      xr15, xr16, 0x02

    xvmulwev.h.bu  xr19, xr9, xr11
    xvmaddwod.h.bu xr19, xr9, xr11
    xvmulwev.h.bu  xr20, xr10, xr12
    xvmaddwod.h.bu xr20, xr10, xr12
    xvadd.h        xr21, xr19, xr20
    xvsrarni.b.h   xr21, xr21, 6
    vstelm.d       vr21, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr21, a0, 0, 2
    add.d          a0, a0, a2
    xvmulwev.h.bu  xr13, xr9, xr14
    xvmaddwod.h.bu xr13, xr9, xr14
    xvmulwev.h.bu  xr14, xr10, xr15
    xvmaddwod.h.bu xr14, xr10, xr15
    xvadd.h        xr13, xr13, xr14
    xvsrarni.b.h   xr13, xr13, 6
    vstelm.d       vr13, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr13, a0, 0, 2
    add.d          a0, a0, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOP_DA
    b              .ENDLOOPA
.ENDLOOP_DA:

    bge            zero, t0, .ENDLOOP_EA
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    xvilvl.b       xr7, xr4, xr0
.LOOP_EA:
    fld.d          f5, a1, 0
    fldx.d         f6, a1, t7
    add.d          a1, a1, a2
    fld.d          f9, a1, 0
    fldx.d         f10, a1, t7
    add.d          a1, a1, a2
    fld.d          f11, a1, 0
    fldx.d         f12, a1, t7
    add.d          a1, a1, a2
    fld.d          f13, a1, 0
    fldx.d         f14, a1, t7
    vilvl.b        vr5, vr6, vr5
    vilvl.b        vr9, vr10, vr9
    vilvl.b        vr11, vr12, vr11
    vilvl.b        vr13, vr14, vr13
    xvpermi.q      xr5, xr9, 0x02
    xvpermi.q      xr11, xr13, 0x02

    xvmulwev.h.bu  xr8, xr7, xr5
    xvmaddwod.h.bu xr8, xr7, xr5
    xvmulwev.h.bu  xr6, xr7, xr11
    xvmaddwod.h.bu xr6, xr7, xr11
    xvsrarni.b.h   xr8, xr8, 6
    vstelm.d       vr8, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr8, a0, 0, 2
    add.d          a0, a0, a2
    xvsrarni.b.h   xr6, xr6, 6
    vstelm.d       vr6, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr6, a0, 0, 2
    add.d          a0, a0, a2
    add.d          a1, a1, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOP_EA
    b              .ENDLOOPA
.ENDLOOP_EA:

    move           t1, a3
.LOOPA:
    fld.d          f5, a1, 0
    fldx.d         f6, a1, a2
    fldx.d         f7, a1, t2
    fldx.d         f8, a1, t3
    vilvl.d        vr5, vr6, vr5
    vilvl.d        vr7, vr8, vr7
    xvpermi.q      xr5, xr7, 0x02
    xvmulwev.h.bu  xr6, xr0, xr5
    xvmulwod.h.bu  xr7, xr0, xr5
    xvilvl.h       xr8, xr7, xr6
    xvilvh.h       xr9, xr7, xr6
    xvsrarni.b.h   xr9, xr8, 6
    vstelm.d       vr9, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.d       vr9, a0, 0, 1
    add.d          a0, a0, a2
    xvstelm.d      xr9, a0, 0, 2
    add.d          a0, a0, a2
    xvstelm.d      xr9, a0, 0, 3
    add.d          a0, a0, a2
    add.d          a1, a1, t4

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPA
.ENDLOOPA:
endfunc

/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
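/* Note: same 256-bit row packing as ff_put_h264_chroma_mc8_lasx above,
 * combined with the (dst + pred + 1) >> 1 averaging used by the LSX avg
 * variant. */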
function ff_avg_h264_chroma_mc8_lasx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    xvreplgr2vr.b  xr0, t3
    xvreplgr2vr.b  xr1, t4
    xvreplgr2vr.b  xr2, t5
    xvreplgr2vr.b  xr3, t6
    xvreplgr2vr.b  xr4, t0
    slli.d         t2, a2, 1
    add.d          t3, t2, a2
    slli.d         t4, a2, 2

    bge            zero, t6, .ENDLOOPDA
    move           t1, a3
    xvilvl.b       xr9, xr1, xr0
    xvilvl.b       xr10, xr3, xr2
.LOOPDA:
    fld.d          f5, a1, 0
    fld.d          f6, a1, 1
    add.d          a1, a1, a2
    fld.d          f7, a1, 0
    fld.d          f8, a1, 1
    add.d          a1, a1, a2
    fld.d          f11, a1, 0
    fld.d          f12, a1, 1
    add.d          a1, a1, a2
    fld.d          f13, a1, 0
    fld.d          f14, a1, 1
    add.d          a1, a1, a2
    fld.d          f15, a1, 0
    fld.d          f16, a1, 1
    fld.d          f17, a0, 0
    fldx.d         f18, a0, a2
    fldx.d         f19, a0, t2
    fldx.d         f20, a0, t3
    vilvl.b        vr5, vr6, vr5
    vilvl.b        vr7, vr8, vr7
    vilvl.b        vr11, vr12, vr11
    vilvl.b        vr13, vr14, vr13
    vilvl.b        vr16, vr16, vr15
    xvpermi.q      xr5, xr7, 0x02
    xvpermi.q      xr7, xr11, 0x02
    xvpermi.q      xr11, xr13, 0x02
    xvpermi.q      xr13, xr16, 0x02
    xvpermi.q      xr17, xr18, 0x02
    xvpermi.q      xr19, xr20, 0x02

    xvmulwev.h.bu  xr14, xr9, xr5
    xvmaddwod.h.bu xr14, xr9, xr5
    xvmulwev.h.bu  xr15, xr10, xr7
    xvmaddwod.h.bu xr15, xr10, xr7
    xvadd.h        xr14, xr14, xr15
    xvsrari.h      xr14, xr14, 6
    xvsllwil.hu.bu xr17, xr17, 0
    xvadd.h        xr20, xr14, xr17
    xvsrarni.b.h   xr20, xr20, 1
    xvstelm.d      xr20, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr20, a0, 0, 2
    add.d          a0, a0, a2
    xvmulwev.h.bu  xr14, xr9, xr11
    xvmaddwod.h.bu xr14, xr9, xr11
    xvmulwev.h.bu  xr15, xr10, xr13
    xvmaddwod.h.bu xr15, xr10, xr13
    xvadd.h        xr14, xr14, xr15
    xvsrari.h      xr14, xr14, 6
    xvsllwil.hu.bu xr19, xr19, 0
    xvadd.h        xr21, xr14, xr19
    xvsrarni.b.h   xr21, xr21, 1
    xvstelm.d      xr21, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr21, a0, 0, 2
    add.d          a0, a0, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPDA
    b              .ENDLOOPELSEA
.ENDLOOPDA:

    bge            zero, t0, .ENDLOOPEA
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    xvilvl.b       xr7, xr4, xr0
.LOOPEA:
    fld.d          f5, a1, 0
    fldx.d         f6, a1, t7
    add.d          a1, a1, a2
    fld.d          f8, a1, 0
    fldx.d         f9, a1, t7
    add.d          a1, a1, a2
    fld.d          f10, a1, 0
    fldx.d         f11, a1, t7
    add.d          a1, a1, a2
    fld.d          f12, a1, 0
    fldx.d         f13, a1, t7
    add.d          a1, a1, a2
    fld.d          f14, a0, 0
    fldx.d         f15, a0, a2
    fldx.d         f16, a0, t2
    fldx.d         f17, a0, t3
    vilvl.b        vr5, vr6, vr5
    vilvl.b        vr8, vr9, vr8
    vilvl.b        vr10, vr11, vr10
    vilvl.b        vr12, vr13, vr12
    xvpermi.q      xr5, xr8, 0x02
    xvpermi.q      xr10, xr12, 0x02
    xvpermi.q      xr14, xr15, 0x02
    xvpermi.q      xr16, xr17, 0x02

    xvmulwev.h.bu  xr6, xr7, xr5
    xvmaddwod.h.bu xr6, xr7, xr5
    xvsrari.h      xr6, xr6, 6
    xvsllwil.hu.bu xr14, xr14, 0
    xvadd.h        xr8, xr6, xr14
    xvsrarni.b.h   xr8, xr8, 1
    xvstelm.d      xr8, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr8, a0, 0, 2
    add.d          a0, a0, a2
    xvmulwev.h.bu  xr6, xr7, xr10
    xvmaddwod.h.bu xr6, xr7, xr10
    xvsrari.h      xr6, xr6, 6
    xvsllwil.hu.bu xr16, xr16, 0
    xvadd.h        xr8, xr6, xr16
    xvsrarni.b.h   xr8, xr8, 1
    xvstelm.d      xr8, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr8, a0, 0, 2
    add.d          a0, a0, a2

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPEA
    b              .ENDLOOPELSEA
.ENDLOOPEA:

    move           t1, a3
.LOOPELSEA:
    fld.d          f5, a1, 0
    fldx.d         f6, a1, a2
    fldx.d         f7, a1, t2
    fldx.d         f8, a1, t3
    fld.d          f9, a0, 0
    fldx.d         f10, a0, a2
    fldx.d         f11, a0, t2
    fldx.d         f12, a0, t3
    xvpermi.q      xr5, xr6, 0x02
    xvpermi.q      xr7, xr8, 0x02
    xvpermi.q      xr9, xr10, 0x02
    xvpermi.q      xr11, xr12, 0x02

    xvmulwev.h.bu  xr12, xr0, xr5
    xvmulwod.h.bu  xr13, xr0, xr5
    xvilvl.h       xr12, xr13, xr12
    xvsrari.h      xr12, xr12, 6
    xvsllwil.hu.bu xr9, xr9, 0
    xvadd.h        xr9, xr12, xr9
    xvsrarni.b.h   xr9, xr9, 1
    xvstelm.d      xr9, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr9, a0, 0, 2
    add.d          a0, a0, a2
    xvmulwev.h.bu  xr12, xr0, xr7
    xvmulwod.h.bu  xr13, xr0, xr7
    xvilvl.h       xr12, xr13, xr12
    xvsrari.h      xr12, xr12, 6
    xvsllwil.hu.bu xr11, xr11, 0
    xvadd.h        xr13, xr12, xr11
    xvsrarni.b.h   xr13, xr13, 1
    xvstelm.d      xr13, a0, 0, 0
    add.d          a0, a0, a2
    xvstelm.d      xr13, a0, 0, 2
    add.d          a0, a0, a2
    add.d          a1, a1, t4

    addi.d         t1, t1, -4
    blt            zero, t1, .LOOPELSEA
.ENDLOOPELSEA:
endfunc

/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
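/* Note: this variant closely mirrors ff_put_h264_chroma_mc4_lsx above; the
 * 4-pixel rows are handled two at a time and the work stays almost entirely
 * in the low 128 bits of the vector registers. */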
function ff_put_h264_chroma_mc4_lasx
    li.d           t8, 8
    sub.d          t1, t8, a4    // 8-x
    sub.d          t2, t8, a5    // 8-y
    mul.d          t3, t1, t2    // A
    mul.d          t4, a4, t2    // B
    mul.d          t5, t1, a5    // C
    mul.d          t6, a4, a5    // D
    add.d          t0, t4, t5    // E
    slli.d         t8, a2, 1
    vreplgr2vr.b   vr0, t3
    vreplgr2vr.b   vr1, t4
    vreplgr2vr.b   vr2, t5
    vreplgr2vr.b   vr3, t6
    vreplgr2vr.b   vr4, t0

    bge            zero, t6, .ENDPUT_DA
    move           t1, a3
    vilvl.b        vr9, vr1, vr0
    vilvl.b        vr10, vr3, vr2
.PUT_DA:
    fld.d          f5, a1, 0
    fld.d          f6, a1, 1
    add.d          a1, a1, a2
    fld.d          f7, a1, 0
    fld.d          f8, a1, 1
    add.d          a1, a1, a2
    fld.d          f11, a1, 0
    fld.d          f12, a1, 1
    vilvl.b        vr5, vr6, vr5
    vilvl.b        vr7, vr8, vr7
    vilvl.b        vr13, vr12, vr11
    vilvl.d        vr5, vr7, vr5
    vilvl.d        vr13, vr13, vr7
    vmulwev.h.bu   vr14, vr9, vr5
    vmaddwod.h.bu  vr14, vr9, vr5
    vmulwev.h.bu   vr15, vr10, vr13
    vmaddwod.h.bu  vr15, vr10, vr13
    xvadd.h        xr14, xr14, xr15
    vsrarni.b.h    vr16, vr14, 6
    vstelm.w       vr16, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr16, a0, 0, 1
    add.d          a0, a0, a2
    addi.d         t1, t1, -2
    blt            zero, t1, .PUT_DA
    b              .ENDPUTA
.ENDPUT_DA:

    bge            zero, t0, .ENDPUT_EA
    move           t1, a3
    li.d           t7, 1
    slt            t8, zero, t5
    maskeqz        t5, a2, t8
    masknez        t7, t7, t8
    or             t7, t7, t5
    vilvl.b        vr7, vr4, vr0
.PUT_EA:
    fld.d          f5, a1, 0
    fldx.d         f6, a1, t7
    vilvl.b        vr5, vr6, vr5
    add.d          a1, a1, a2
    fld.d          f8, a1, 0
    fldx.d         f9, a1, t7
    vilvl.b        vr8, vr9, vr8
    vilvl.d        vr5, vr8, vr5
    vmulwev.h.bu   vr6, vr7, vr5
    vmaddwod.h.bu  vr6, vr7, vr5
    vsrarni.b.h    vr6, vr6, 6
    vstelm.w       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr6, a0, 0, 1
    add.d          a0, a0, a2
    add.d          a1, a1, a2
    addi.d         t1, t1, -2
    blt            zero, t1, .PUT_EA
    b              .ENDPUTA
.ENDPUT_EA:

    move           t1, a3
.PUTA:
    fld.d          f5, a1, 0
    fldx.d         f8, a1, a2
    vilvl.w        vr5, vr8, vr5
    vmulwev.h.bu   vr6, vr0, vr5
    vmulwod.h.bu   vr7, vr0, vr5
    vilvl.h        vr6, vr7, vr6
    vsrarni.b.h    vr6, vr6, 6
    vstelm.w       vr6, a0, 0, 0
    add.d          a0, a0, a2
    vstelm.w       vr6, a0, 0, 1
    add.d          a0, a0, a2
    add.d          a1, a1, t8
    addi.d         t1, t1, -2
    blt            zero, t1, .PUTA
.ENDPUTA:
endfunc