swscale: [LA] Optimize yuv2plane1_8_c.

Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Shiyou Yin 2024-03-16 11:03:32 +08:00 committed by Michael Niedermayer
parent f3fe2cb5f7
commit 8b76df9142
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
5 changed files with 323 additions and 15 deletions

View File

@ -23,11 +23,11 @@
#include "libavcodec/loongarch/loongson_asm.S"
/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
function ff_yuv2planeX_8_lsx
function yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
blt zero, a4, .DEST
.END:
endfunc
/*
 * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
 *                       const uint8_t *dither, int offset)
 *
 * LSX (128-bit) routine. For each output pixel i the code below computes
 *     dest[i] = clip_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
 * handling eight pixels per vector iteration.
 *
 * Register use, as read from the code (arguments in prototype order):
 *   a0 = src, a1 = dest, a2 = dstW (later: number of full 8-pixel blocks),
 *   a3 = dither, a4 = offset
 *   t0-t7 = dither indices, then the dither bytes themselves
 *   t8    = dstW & 7 (pixels left over after the full blocks)
 *   vr2/vr3 = dither values widened to 32 bit (lanes 0-3 / lanes 4-7)
 */
function yuv2plane1_8_lsx
/* t0..t7 = (offset + k) & 7 for k = 0..7: the dither index of each lane. */
addi.w t1, a4, 1
addi.w t2, a4, 2
addi.w t3, a4, 3
addi.w t4, a4, 4
addi.w t5, a4, 5
addi.w t6, a4, 6
addi.w t7, a4, 7
andi t0, a4, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
/* Replace each index by the unsigned dither byte it selects. */
ldx.bu t0, a3, t0
ldx.bu t1, a3, t1
ldx.bu t2, a3, t2
ldx.bu t3, a3, t3
ldx.bu t4, a3, t4
ldx.bu t5, a3, t5
ldx.bu t6, a3, t6
ldx.bu t7, a3, t7
/* Pack the eight dither values into the 16-bit lanes of vr1. */
vinsgr2vr.h vr1, t0, 0
vinsgr2vr.h vr1, t1, 1
vinsgr2vr.h vr1, t2, 2
vinsgr2vr.h vr1, t3, 3
vinsgr2vr.h vr1, t4, 4
vinsgr2vr.h vr1, t5, 5
vinsgr2vr.h vr1, t6, 6
vinsgr2vr.h vr1, t7, 7
/* vr0 = 0; interleaving vr1 with zero widens the dither values to 32 bit:
 * vr2 = dither lanes 0-3, vr3 = dither lanes 4-7. */
vsub.h vr0, vr0, vr0
vilvl.h vr2, vr0, vr1
vilvh.h vr3, vr0, vr1
/* Split dstW into a remainder (t8) and a count of full 8-pixel blocks (a2).
 * With no full block, skip straight to the remainder handling. */
andi t8, a2, 7
srli.d a2, a2, 3
beqz a2, 2f
1:
/* Main loop: load 8 int16 samples (16 bytes) and advance src. */
vld vr1, a0, 0
addi.d a0, a0, 16
/* Move the low 64 bits of vr1 into the high 64 bits of vr0 so that the
 * "widen high half" instruction below can be applied to both halves:
 * vr4 = src[0..3], vr5 = src[4..7], each sign-extended to 32 bit. */
vshuf4i.d vr0, vr1, 8
vexth.w.h vr4, vr0
vexth.w.h vr5, vr1
/* Add dither, arithmetic shift right by 7, clamp to 0..255. */
vadd.w vr4, vr2, vr4
vadd.w vr5, vr3, vr5
vsrai.w vr4, vr4, 7
vsrai.w vr5, vr5, 7
vclip255.w vr4, vr4
vclip255.w vr5, vr5
/* Narrow 32 -> 16 -> 8 bit; the eight result bytes end up in the low
 * 64 bits of vr1, which are stored through the aliasing FP register f1. */
vpickev.h vr1, vr5, vr4
vpickev.b vr1, vr1, vr1
fst.d f1, a1, 0
addi.d a1, a1, 8
addi.d a2, a2, -1
bnez a2, 1b
2:
/* Nothing left over: done. */
beqz t8, 4f
/* Label 3 is not the target of any branch visible in this function. */
3:
/* Remainder: redo one full 8-pixel vector that ends exactly at dstW,
 * overlapping pixels already written by the main loop. That vector starts
 * at pixel dstW - 8, whose dither phase is (offset + t8) & 7, so the
 * dither vector is rebuilt from offset + t8. */
add.w a4, a4, t8
addi.w t1, a4, 1
addi.w t2, a4, 2
addi.w t3, a4, 3
addi.w t4, a4, 4
addi.w t5, a4, 5
addi.w t6, a4, 6
addi.w t7, a4, 7
andi t0, a4, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
ldx.bu t0, a3, t0
ldx.bu t1, a3, t1
ldx.bu t2, a3, t2
ldx.bu t3, a3, t3
ldx.bu t4, a3, t4
ldx.bu t5, a3, t5
ldx.bu t6, a3, t6
ldx.bu t7, a3, t7
vinsgr2vr.h vr1, t0, 0
vinsgr2vr.h vr1, t1, 1
vinsgr2vr.h vr1, t2, 2
vinsgr2vr.h vr1, t3, 3
vinsgr2vr.h vr1, t4, 4
vinsgr2vr.h vr1, t5, 5
vinsgr2vr.h vr1, t6, 6
vinsgr2vr.h vr1, t7, 7
vsub.h vr0, vr0, vr0
vilvl.h vr2, vr0, vr1
vilvh.h vr3, vr0, vr1
/* Reposition the pointers on pixel dstW - 8:
 * src  += 2 * t8 - 16 bytes (int16 samples), dest += t8 - 8 bytes.
 * NOTE(review): if dstW < 8 the main loop never ran, so these adjustments
 * move a0/a1 to before the src/dest pointers passed in, and the load and
 * store below touch memory in front of both buffers. Confirm that callers
 * guarantee dstW >= 8 or that the buffers are padded accordingly. */
addi.d a0, a0, -16
add.d a0, a0, t8
add.d a0, a0, t8
addi.d a1, a1, -8
add.d a1, a1, t8
/* Same widen / add dither / shift / clamp / narrow / store sequence as in
 * the main loop, executed once. */
vld vr1, a0, 0
vshuf4i.d vr0, vr1, 8
vexth.w.h vr4, vr0
vexth.w.h vr5, vr1
vadd.w vr4, vr2, vr4
vadd.w vr5, vr3, vr5
vsrai.w vr4, vr4, 7
vsrai.w vr5, vr5, 7
vclip255.w vr4, vr4
vclip255.w vr5, vr5
vpickev.h vr1, vr5, vr4
vpickev.b vr1, vr1, vr1
fst.d f1, a1, 0
4:
/* No explicit return instruction here; presumably the endfunc macro from
 * loongson_asm.S emits it - TODO confirm. */
endfunc
/*
 * void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
 *                        const uint8_t *dither, int offset)
 *
 * LASX (256-bit) routine. For each output pixel i the code below computes
 *     dest[i] = clip_0_255((src[i] + dither[(i + offset) & 7]) >> 7)
 * handling sixteen pixels per vector iteration.
 *
 * Register use, as read from the code (arguments in prototype order):
 *   a0 = src, a1 = dest, a2 = dstW (later: number of full 16-pixel blocks),
 *   a3 = dither, a4 = offset
 *   t0-t7 = dither indices, then the dither bytes themselves
 *   t8    = dstW & 15 (pixels left over after the full blocks)
 *   xr2/xr3 = dither values widened to 32 bit (lanes 0-3 / lanes 4-7),
 *             repeated in both 128-bit halves
 */
function yuv2plane1_8_lasx
/* t0..t7 = (offset + k) & 7 for k = 0..7: the dither index of each lane. */
addi.w t1, a4, 1
addi.w t2, a4, 2
addi.w t3, a4, 3
addi.w t4, a4, 4
addi.w t5, a4, 5
addi.w t6, a4, 6
addi.w t7, a4, 7
andi t0, a4, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
/* Replace each index by the unsigned dither byte it selects. */
ldx.bu t0, a3, t0
ldx.bu t1, a3, t1
ldx.bu t2, a3, t2
ldx.bu t3, a3, t3
ldx.bu t4, a3, t4
ldx.bu t5, a3, t5
ldx.bu t6, a3, t6
ldx.bu t7, a3, t7
/* Pack the eight dither values into the 16-bit lanes of vr1, i.e. the low
 * 128 bits of xr1. */
vinsgr2vr.h vr1, t0, 0
vinsgr2vr.h vr1, t1, 1
vinsgr2vr.h vr1, t2, 2
vinsgr2vr.h vr1, t3, 3
vinsgr2vr.h vr1, t4, 4
vinsgr2vr.h vr1, t5, 5
vinsgr2vr.h vr1, t6, 6
vinsgr2vr.h vr1, t7, 7
/* Copy the low 128 bits of xr1 into its high 128 bits so both halves hold
 * the same eight dither values (the dither pattern repeats every 8 pixels). */
xvpermi.q xr1, xr1, 0
/* xr0 = 0; interleaving with zero widens the dither values to 32 bit in
 * each 128-bit half: xr2 = dither lanes 0-3, xr3 = dither lanes 4-7. */
xvsub.h xr0, xr0, xr0
xvilvl.h xr2, xr0, xr1
xvilvh.h xr3, xr0, xr1
/* Split dstW into a remainder (t8) and a count of full 16-pixel blocks (a2).
 * With no full block, skip straight to the remainder handling. */
andi t8, a2, 15
srli.d a2, a2, 4
beqz a2, 2f
1:
/* Main loop: load 16 int16 samples (32 bytes) and advance src. */
xvld xr1, a0, 0
addi.d a0, a0, 32
/* Selector 0xa0 picks 64-bit elements {0, 0, 2, 2} of xr1, so the upper
 * half of each 128-bit lane of xr0 holds src[0..3] / src[8..11]. The
 * "widen high half" instruction then yields, per 128-bit lane:
 * xr4 = src[0..3] | src[8..11], xr5 = src[4..7] | src[12..15]. */
xvpermi.d xr0, xr1, 0xa0
xvexth.w.h xr4, xr0
xvexth.w.h xr5, xr1
/* Add dither, arithmetic shift right by 7, clamp to 0..255. */
xvadd.w xr4, xr2, xr4
xvadd.w xr5, xr3, xr5
xvsrai.w xr4, xr4, 7
xvsrai.w xr5, xr5, 7
xvclip255.w xr4, xr4
xvclip255.w xr5, xr5
/* Narrow 32 -> 16 -> 8 bit within each 128-bit lane: the low 64 bits of
 * xr0 hold pixels 0-7, the low 64 bits of its upper lane hold pixels 8-15. */
xvpickev.h xr1, xr5, xr4
xvpickev.b xr0, xr1, xr1
/* Bring the upper lane of xr0 down into the low 128 bits of xr1 so both
 * 8-byte groups can be stored through the aliasing FP registers f0/f1. */
xvpermi.q xr1, xr0, 1
fst.d f0, a1, 0
fst.d f1, a1, 8
addi.d a1, a1, 16
addi.d a2, a2, -1
bnez a2, 1b
2:
/* Nothing left over: done. */
beqz t8, 4f
/* Label 3 is not the target of any branch visible in this function. */
3:
/* Remainder: redo one full 16-pixel vector that ends exactly at dstW,
 * overlapping pixels already written by the main loop. That vector starts
 * at pixel dstW - 16, whose dither phase is (offset + t8) & 7, so the
 * dither vector is rebuilt from offset + t8. */
add.w a4, a4, t8
addi.w t1, a4, 1
addi.w t2, a4, 2
addi.w t3, a4, 3
addi.w t4, a4, 4
addi.w t5, a4, 5
addi.w t6, a4, 6
addi.w t7, a4, 7
andi t0, a4, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
ldx.bu t0, a3, t0
ldx.bu t1, a3, t1
ldx.bu t2, a3, t2
ldx.bu t3, a3, t3
ldx.bu t4, a3, t4
ldx.bu t5, a3, t5
ldx.bu t6, a3, t6
ldx.bu t7, a3, t7
vinsgr2vr.h vr1, t0, 0
vinsgr2vr.h vr1, t1, 1
vinsgr2vr.h vr1, t2, 2
vinsgr2vr.h vr1, t3, 3
vinsgr2vr.h vr1, t4, 4
vinsgr2vr.h vr1, t5, 5
vinsgr2vr.h vr1, t6, 6
vinsgr2vr.h vr1, t7, 7
xvpermi.q xr1, xr1, 0
xvsub.h xr0, xr0, xr0
xvilvl.h xr2, xr0, xr1
xvilvh.h xr3, xr0, xr1
/* Reposition the pointers on pixel dstW - 16:
 * src  += 2 * t8 - 32 bytes (int16 samples), dest += t8 - 16 bytes.
 * NOTE(review): if dstW < 16 the main loop never ran, so these adjustments
 * move a0/a1 to before the src/dest pointers passed in, and the load and
 * stores below touch memory in front of both buffers. Confirm that callers
 * guarantee dstW >= 16 or that the buffers are padded accordingly. */
addi.d a0, a0, -32
add.d a0, a0, t8
add.d a0, a0, t8
addi.d a1, a1, -16
add.d a1, a1, t8
/* Same widen / add dither / shift / clamp / narrow / store sequence as in
 * the main loop, executed once. */
xvld xr1, a0, 0
xvpermi.d xr0, xr1, 0xa0
xvexth.w.h xr4, xr0
xvexth.w.h xr5, xr1
xvadd.w xr4, xr2, xr4
xvadd.w xr5, xr3, xr5
xvsrai.w xr4, xr4, 7
xvsrai.w xr5, xr5, 7
xvclip255.w xr4, xr4
xvclip255.w xr5, xr5
xvpickev.h xr1, xr5, xr4
xvpickev.b xr0, xr1, xr1
xvpermi.q xr1, xr0, 1
fst.d f0, a1, 0
fst.d f1, a1, 8
4:
/* No explicit return instruction here; presumably the endfunc macro from
 * loongson_asm.S emits it - TODO confirm. */
endfunc

View File

@ -22,7 +22,7 @@
#include "swscale_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
av_cold void ff_sws_init_output_lasx(SwsContext *c)
av_cold void ff_sws_init_output_lasx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX)
{
enum AVPixelFormat dstFormat = c->dstFormat;
/* Add initialization once optimized */
if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
} else if (is16BPS(dstFormat)) {
} else if (isNBPS(dstFormat)) {
} else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
} else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
} else {
*yuv2plane1 = yuv2plane1_8_lasx;
*yuv2planeX = yuv2planeX_8_lasx;
}
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {

View File

@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
av_cold void ff_sws_init_output_lsx(SwsContext *c)
av_cold void ff_sws_init_output_lsx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX)
{
enum AVPixelFormat dstFormat = c->dstFormat;
/* Add initialization once optimized */
if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
} else if (is16BPS(dstFormat)) {
} else if (isNBPS(dstFormat)) {
} else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
} else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
} else {
*yuv2plane1 = yuv2plane1_8_lsx;
*yuv2planeX = yuv2planeX_8_lsx;
}
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGBA:

View File

@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
ff_sws_init_output_lsx(c);
ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lsx;
}
#if HAVE_LASX
if (have_lasx(cpu_flags)) {
ff_sws_init_output_lasx(c);
ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
ff_sws_init_range_convert_loongarch(c);

View File

@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
int32_t *rgb2yuv, void *opq);
void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lsx(SwsContext *c);
void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lsx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */