/*
 * Loongson LSX optimized h264intrapred
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

/* Byte-reverse control used when gathering top[5] .. top[-1]. */
const shufa
.byte 6, 5, 4, 3, 2, 1, 0
endconst

/* Halfword multipliers {2 .. 8} for the H gradient terms. */
const mulk
.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
endconst

/* Halfword ramp {0 .. 15} used to step b across one row. */
const mulh
.byte 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
.byte 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
endconst

/*
 * Compute the raw plane gradients from the neighbouring pixels:
 *     t3 = H = sum(x * (top[7 + x]  - top[7 - x]),  x = 1..8)
 *     t4 = V = sum(y * (left[7 + y] - left[7 - y]), y = 1..8)
 * where top[] is the row above the block and left[] the column to its left.
 * On exit t1 points at left[15] and t2 at top[-1] (src - stride - 1).
 */
.macro PRED16X16_PLANE
    slli.d        t6, a1, 1
    slli.d        t4, a1, 3
    addi.d        t0, a0, 7
    sub.d         t0, t0, a1          // t0 -> top[7]
    add.d         t1, a0, t4
    addi.d        t1, t1, -1          // t1 -> left[8]
    sub.d         t2, t1, t6          // t2 -> left[6]

    // x = y = 1 terms
    ld.bu         t3, t0, 1           // top[8]
    ld.bu         t4, t0, -1          // top[6]
    ld.bu         t5, t1, 0           // left[8]
    ld.bu         t7, t2, 0           // left[6]
    sub.d         t3, t3, t4
    sub.d         t4, t5, t7

    // x = 2..8 terms of H: mulk * (top[9..15] - top[5..-1])
    la.local      t5, mulk
    vld           vr0, t5, 0
    fld.d         f1, t0, 2           // top[9..16]
    fld.d         f2, t0, -8          // top[-1..6]
    la.local      t5, shufa
    fld.d         f3, t5, 0
    vshuf.b       vr2, vr2, vr2, vr3
    vilvl.b       vr1, vr1, vr2
    vhsubw.hu.bu  vr1, vr1, vr1
    vmul.h        vr0, vr0, vr1
    vhaddw.w.h    vr1, vr0, vr0
    vhaddw.d.w    vr0, vr1, vr1
    vhaddw.q.d    vr1, vr0, vr0
    vpickve2gr.w  t5, vr1, 0
    add.d         t3, t3, t5          // t3 = H

    // y = 2
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t5, t7, t8
    slli.d        t5, t5, 1

    // y = 3 & 4
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 1
    add.d         t7, t7, t8
    add.d         t5, t5, t7

    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t7, t7, 2
    add.d         t5, t5, t7

    // y = 5 & 6
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 2
    add.d         t7, t7, t8
    add.d         t5, t5, t7

    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t8, t7, 1
    slli.d        t7, t7, 2
    add.d         t7, t7, t8
    add.d         t5, t5, t7

    // y = 7 & 8
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    add.d         t5, t5, t7

    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t7, t7, 3
    add.d         t5, t5, t7
    add.d         t4, t4, t5          // t4 = V
    add.d         t1, t1, a1          // t1 -> left[15]
.endm

/*
 * Expects the scaled gradients in t3 = b and t4 = c, t1 -> left[15] and
 * t2 -> top[-1].  Writes
 *     pred[y][x] = clip_uint8((a + b * (x - 7) + c * (y - 7) + 16) >> 5)
 * with a = 16 * (left[15] + top[15]), one 16-byte row per iteration.
 */
.macro PRED16X16_PLANE_END
    ld.bu         t7, t1, 0           // left[15]
    ld.bu         t8, t2, 16          // top[15]
    add.d         t5, t7, t8
    addi.d        t5, t5, 1
    slli.d        t5, t5, 4           // a + 16
    add.d         t7, t3, t4
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    sub.d         t5, t5, t7          // t5 = a + 16 - 7 * (b + c)

    la.local      t8, mulh
    vld           vr3, t8, 0
    slli.d        t8, t3, 3
    vreplgr2vr.h  vr4, t3
    vreplgr2vr.h  vr9, t8             // 8 * b
    vmul.h        vr5, vr3, vr4       // {0..7} * b

.rept 16
    move          t7, t5
    add.d         t5, t5, t4          // next row base (+c)
    vreplgr2vr.h  vr6, t7
    vadd.h        vr7, vr6, vr5       // columns 0..7
    vadd.h        vr8, vr9, vr7       // columns 8..15
    vssrani.bu.h  vr8, vr7, 5
    vst           vr8, a0, 0
    add.d         a0, a0, a1
.endr
.endm

/*
 * Same as PRED16X16_PLANE_END, but using 256-bit LASX vectors and
 * producing two rows per iteration.
 */
.macro PRED16X16_PLANE_END_LASX
    ld.bu         t7, t1, 0           // left[15]
    ld.bu         t8, t2, 16          // top[15]
    add.d         t5, t7, t8
    addi.d        t5, t5, 1
    slli.d        t5, t5, 4           // a + 16
    add.d         t7, t3, t4
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    sub.d         t5, t5, t7          // t5 = a + 16 - 7 * (b + c)

    la.local      t8, mulh
    xvld          xr3, t8, 0
    xvreplgr2vr.h xr4, t3
    xvmul.h       xr5, xr3, xr4       // {0..15} * b

.rept 8
    move          t7, t5
    add.d         t5, t5, t4
    xvreplgr2vr.h xr6, t7             // even row base
    xvreplgr2vr.h xr8, t5             // odd row base
    add.d         t5, t5, t4
    xvadd.h       xr7, xr6, xr5
    xvadd.h       xr9, xr8, xr5
    xvssrani.bu.h xr9, xr7, 5
    vstelm.d      vr9, a0, 0, 0
    xvstelm.d     xr9, a0, 8, 2
    add.d         a0, a0, a1
    vstelm.d      vr9, a0, 0, 1
    xvstelm.d     xr9, a0, 8, 3
    add.d         a0, a0, a1
.endr
.endm

/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_h264_8_lsx
    PRED16X16_PLANE

    // b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6
    slli.d        t7, t3, 2
    add.d         t3, t3, t7
    addi.d        t3, t3, 32
    srai.d        t3, t3, 6
    slli.d        t7, t4, 2
    add.d         t4, t4, t7
    addi.d        t4, t4, 32
    srai.d        t4, t4, 6

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_rv40_8_lsx
    PRED16X16_PLANE

    // b = (H + (H >> 2)) >> 4, c = (V + (V >> 2)) >> 4
    srai.d        t7, t3, 2
    add.d         t3, t3, t7
    srai.d        t3, t3, 4
    srai.d        t7, t4, 2
    add.d         t4, t4, t7
    srai.d        t4, t4, 4

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_svq3_8_lsx
    PRED16X16_PLANE

    // b = ((V / 4) * 5) / 16, c = ((H / 4) * 5) / 16 (H and V are swapped)
    li.d          t6, 4
    li.d          t7, 5
    li.d          t8, 16
    div.d         t3, t3, t6
    mul.d         t3, t3, t7
    div.d         t3, t3, t8
    div.d         t4, t4, t6
    mul.d         t4, t4, t7
    div.d         t4, t4, t8
    move          t7, t3
    move          t3, t4
    move          t4, t7

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_h264_8_lasx
    PRED16X16_PLANE

    // b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6
    slli.d        t7, t3, 2
    add.d         t3, t3, t7
    addi.d        t3, t3, 32
    srai.d        t3, t3, 6
    slli.d        t7, t4, 2
    add.d         t4, t4, t7
    addi.d        t4, t4, 32
    srai.d        t4, t4, 6

    PRED16X16_PLANE_END_LASX
endfunc

/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_rv40_8_lasx
    PRED16X16_PLANE

    // b = (H + (H >> 2)) >> 4, c = (V + (V >> 2)) >> 4
    srai.d        t7, t3, 2
    add.d         t3, t3, t7
    srai.d        t3, t3, 4
    srai.d        t7, t4, 2
    add.d         t4, t4, t7
    srai.d        t4, t4, 4

    PRED16X16_PLANE_END_LASX
endfunc

/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride) */
function ff_h264_pred16x16_plane_svq3_8_lasx
    PRED16X16_PLANE

    // b = ((V / 4) * 5) / 16, c = ((H / 4) * 5) / 16 (H and V are swapped)
    li.d          t5, 4
    li.d          t7, 5
    li.d          t8, 16
    div.d         t3, t3, t5
    mul.d         t3, t3, t7
    div.d         t3, t3, t8
    div.d         t4, t4, t5
    mul.d         t4, t4, t7
    div.d         t4, t4, t8
    move          t7, t3
    move          t3, t4
    move          t4, t7

    PRED16X16_PLANE_END_LASX
endfunc
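
/*
 * For readability only (never assembled): a rough scalar sketch of the plane
 * prediction computed by the routines above.  The helper names below
 * (pred16x16_plane_ref, clip_u8) are illustrative and not part of FFmpeg's
 * API; the rv40 and svq3 variants differ only in how b and c are derived
 * from H and V, as noted in the function bodies above.
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   static uint8_t clip_u8(int v)
 *   {
 *       return v < 0 ? 0 : v > 255 ? 255 : v;
 *   }
 *
 *   static void pred16x16_plane_ref(uint8_t *src, ptrdiff_t stride)
 *   {
 *       const uint8_t *top  = src - stride;   // row above the block
 *       const uint8_t *left = src - 1;        // column left of the block
 *       int H = 0, V = 0;
 *
 *       for (int i = 1; i <= 8; i++) {
 *           H += i * (top[7 + i] - top[7 - i]);
 *           V += i * (left[(7 + i) * stride] - left[(7 - i) * stride]);
 *       }
 *
 *       int a = 16 * (left[15 * stride] + top[15]);
 *       int b = (5 * H + 32) >> 6;            // h264 scaling of H
 *       int c = (5 * V + 32) >> 6;            // h264 scaling of V
 *
 *       for (int y = 0; y < 16; y++)
 *           for (int x = 0; x < 16; x++)
 *               src[y * stride + x] =
 *                   clip_u8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
 *   }
 */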