swscale: [LA] Optimize range convert for yuvj420p.

Reviewed-by: 陈昊 <chenhao@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Shiyou Yin 2024-03-16 11:03:31 +08:00 committed by Michael Niedermayer
parent dd5f665b40
commit f3fe2cb5f7
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
5 changed files with 418 additions and 1 deletions

View File

@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
/*
 * void lumRangeFromJpeg_lsx(int16_t *dst, int width)
 *
 * In-place rescale of `width` 16-bit samples:
 *     dst[i] = (dst[i] * 14071 + 33561947) >> 14
 * a0 = dst, a1 = width. Full groups of 8 samples are handled by the LSX
 * loop at label 1; the remaining (width & 7) samples are handled by the
 * scalar loop at label 3.
 */
function lumRangeFromJpeg_lsx
/* t0 = multiplier, t1 = offset that is added before the shift. */
li.w t0, 14071
li.w t1, 33561947
/* vr0 = multiplier replicated into every 16-bit lane. */
vreplgr2vr.h vr0, t0
/* t2 = number of full 8-sample vectors, t3 = leftover sample count. */
srli.w t2, a1, 3
andi t3, a1, 7
beqz t2, 2f
1:
vld vr1, a0, 0
/* Seed both 32-bit accumulators with the offset. The multiply-adds below
 * accumulate into them, so they are re-seeded on every iteration. */
vreplgr2vr.w vr2, t1
vreplgr2vr.w vr3, t1
/* Widening 16x16->32 multiply-add: even-indexed samples accumulate into
 * vr2, odd-indexed samples into vr3. */
vmaddwev.w.h vr2, vr0, vr1
vmaddwod.w.h vr3, vr0, vr1
vsrai.w vr2, vr2, 14
vsrai.w vr3, vr3, 14
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
vpackev.h vr1, vr3, vr2
vst vr1, a0, 0
addi.d a0, a0, 16
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Scalar tail: same formula, one sample per iteration. */
ld.h t4, a0, 0
mul.w t4, t4, t0
add.w t4, t4, t1
srai.w t4, t4, 14
st.h t4, a0, 0
addi.d a0, a0, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void lumRangeFromJpeg_lasx(int16_t *dst, int width)
 *
 * In-place rescale of `width` 16-bit samples:
 *     dst[i] = (dst[i] * 14071 + 33561947) >> 14
 * a0 = dst, a1 = width. Full groups of 16 samples are handled by the LASX
 * loop at label 1; the remaining (width & 15) samples are handled by the
 * scalar loop at label 3.
 */
function lumRangeFromJpeg_lasx
/* t0 = multiplier, t1 = offset that is added before the shift. */
li.w t0, 14071
li.w t1, 33561947
/* xr0 = multiplier replicated into every 16-bit lane. */
xvreplgr2vr.h xr0, t0
/* t2 = number of full 16-sample vectors, t3 = leftover sample count. */
srli.w t2, a1, 4
andi t3, a1, 15
beqz t2, 2f
1:
xvld xr1, a0, 0
/* Seed both 32-bit accumulators with the offset. The multiply-adds below
 * accumulate into them, so they are re-seeded on every iteration. */
xvreplgr2vr.w xr2, t1
xvreplgr2vr.w xr3, t1
/* Widening 16x16->32 multiply-add: even-indexed samples accumulate into
 * xr2, odd-indexed samples into xr3. */
xvmaddwev.w.h xr2, xr0, xr1
xvmaddwod.w.h xr3, xr0, xr1
xvsrai.w xr2, xr2, 14
xvsrai.w xr3, xr3, 14
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
xvpackev.h xr1, xr3, xr2
xvst xr1, a0, 0
addi.d a0, a0, 32
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Scalar tail: same formula, one sample per iteration. */
ld.h t4, a0, 0
mul.w t4, t4, t0
add.w t4, t4, t1
srai.w t4, t4, 14
st.h t4, a0, 0
addi.d a0, a0, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void lumRangeToJpeg_lsx(int16_t *dst, int width)
 *
 * In-place rescale of `width` 16-bit samples, with the input clamped from
 * above before scaling:
 *     dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
 * a0 = dst, a1 = width. Full groups of 8 samples are handled by the LSX
 * loop at label 1; the remaining (width & 7) samples are handled by the
 * loop at label 3.
 */
function lumRangeToJpeg_lsx
/* t0 = multiplier, t1 = (negative) offset, t2 = upper clamp value. */
li.w t0, 19077
li.w t1, -39057361
li.w t2, 30189
/* vr0 = multiplier and vr4 = clamp value, each in every 16-bit lane. */
vreplgr2vr.h vr0, t0
vreplgr2vr.h vr4, t2
/* t2 is reused from here on: t2 = number of full 8-sample vectors,
 * t3 = leftover sample count. */
srli.w t2, a1, 3
andi t3, a1, 7
beqz t2, 2f
1:
vld vr1, a0, 0
/* Seed both 32-bit accumulators with the offset. The multiply-adds below
 * accumulate into them, so they are re-seeded on every iteration. */
vreplgr2vr.w vr2, t1
vreplgr2vr.w vr3, t1
/* Clamp the samples to at most 30189 before multiplying. */
vmin.h vr1, vr1, vr4
/* Widening 16x16->32 multiply-add: even-indexed samples accumulate into
 * vr2, odd-indexed samples into vr3. */
vmaddwev.w.h vr2, vr0, vr1
vmaddwod.w.h vr3, vr0, vr1
vsrai.w vr2, vr2, 14
vsrai.w vr3, vr3, 14
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
vpackev.h vr1, vr3, vr2
vst vr1, a0, 0
addi.d a0, a0, 16
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Tail: one sample per iteration. The clamp reuses the vector constant in
 * vr4: broadcast the sample, take the lane-wise minimum, then read lane 0
 * back into t4. */
ld.h t4, a0, 0
vreplgr2vr.h vr1, t4
vmin.h vr1, vr1, vr4
vpickve2gr.h t4, vr1, 0
mul.w t4, t4, t0
add.w t4, t4, t1
srai.w t4, t4, 14
st.h t4, a0, 0
addi.d a0, a0, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void lumRangeToJpeg_lasx(int16_t *dst, int width)
 *
 * In-place rescale of `width` 16-bit samples, with the input clamped from
 * above before scaling:
 *     dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
 * a0 = dst, a1 = width. Full groups of 16 samples are handled by the LASX
 * loop at label 1; the remaining (width & 15) samples are handled by the
 * loop at label 3.
 */
function lumRangeToJpeg_lasx
/* t0 = multiplier, t1 = (negative) offset, t2 = upper clamp value. */
li.w t0, 19077
li.w t1, -39057361
li.w t2, 30189
/* xr0 = multiplier and xr4 = clamp value, each in every 16-bit lane. */
xvreplgr2vr.h xr0, t0
xvreplgr2vr.h xr4, t2
/* t2 is reused from here on: t2 = number of full 16-sample vectors,
 * t3 = leftover sample count. */
srli.w t2, a1, 4
andi t3, a1, 15
beqz t2, 2f
1:
xvld xr1, a0, 0
/* Seed both 32-bit accumulators with the offset. The multiply-adds below
 * accumulate into them, so they are re-seeded on every iteration. */
xvreplgr2vr.w xr2, t1
xvreplgr2vr.w xr3, t1
/* Clamp the samples to at most 30189 before multiplying. */
xvmin.h xr1, xr1, xr4
/* Widening 16x16->32 multiply-add: even-indexed samples accumulate into
 * xr2, odd-indexed samples into xr3. */
xvmaddwev.w.h xr2, xr0, xr1
xvmaddwod.w.h xr3, xr0, xr1
xvsrai.w xr2, xr2, 14
xvsrai.w xr3, xr3, 14
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
xvpackev.h xr1, xr3, xr2
xvst xr1, a0, 0
addi.d a0, a0, 32
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Tail: one sample per iteration, clamped with 128-bit LSX instructions.
 * vr4 is never written directly in this function; the clamp value was
 * broadcast into xr4 above, so this relies on vr4 being the low 128 bits
 * of xr4. */
ld.h t4, a0, 0
vreplgr2vr.h vr1, t4
vmin.h vr1, vr1, vr4
vpickve2gr.h t4, vr1, 0
mul.w t4, t4, t0
add.w t4, t4, t1
srai.w t4, t4, 14
st.h t4, a0, 0
addi.d a0, a0, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of `width` 16-bit samples in each of two buffers:
 *     dstU[i] = (dstU[i] * 1799 + 4081085) >> 11
 *     dstV[i] = (dstV[i] * 1799 + 4081085) >> 11
 * a0 = dstU, a1 = dstV, a2 = width. Full groups of 8 samples per buffer
 * are handled by the LSX loop at label 1; the remaining (width & 7)
 * samples are handled by the scalar loop at label 3.
 */
function chrRangeFromJpeg_lsx
/* t0 = multiplier, t1 = offset that is added before the shift. */
li.w t0, 1799
li.w t1, 4081085
/* vr0 = multiplier replicated into every 16-bit lane. */
vreplgr2vr.h vr0, t0
/* t2 = number of full 8-sample vectors, t3 = leftover sample count. */
srli.w t2, a2, 3
andi t3, a2, 7
beqz t2, 2f
1:
/* vr1 = 8 samples from dstU, vr2 = 8 samples from dstV. */
vld vr1, a0, 0
vld vr2, a1, 0
/* Seed the four 32-bit accumulators with the offset. The multiply-adds
 * below accumulate into them, so they are re-seeded on every iteration. */
vreplgr2vr.w vr3, t1
vreplgr2vr.w vr4, t1
vreplgr2vr.w vr5, t1
vreplgr2vr.w vr6, t1
/* Widening 16x16->32 multiply-add. vr3/vr4 take the even/odd-indexed
 * dstU samples, vr5/vr6 the even/odd-indexed dstV samples. */
vmaddwev.w.h vr3, vr0, vr1
vmaddwod.w.h vr4, vr0, vr1
vmaddwev.w.h vr5, vr0, vr2
vmaddwod.w.h vr6, vr0, vr2
vsrai.w vr3, vr3, 11
vsrai.w vr4, vr4, 11
vsrai.w vr5, vr5, 11
vsrai.w vr6, vr6, 11
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
vpackev.h vr1, vr4, vr3
vpackev.h vr2, vr6, vr5
vst vr1, a0, 0
vst vr2, a1, 0
addi.d a0, a0, 16
addi.d a1, a1, 16
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Scalar tail: same formula, one dstU and one dstV sample per iteration. */
ld.h t4, a0, 0
ld.h t5, a1, 0
mul.w t4, t4, t0
mul.w t5, t5, t0
add.w t4, t4, t1
add.w t5, t5, t1
srai.w t4, t4, 11
srai.w t5, t5, 11
st.h t4, a0, 0
st.h t5, a1, 0
addi.d a0, a0, 2
addi.d a1, a1, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of `width` 16-bit samples in each of two buffers:
 *     dstU[i] = (dstU[i] * 1799 + 4081085) >> 11
 *     dstV[i] = (dstV[i] * 1799 + 4081085) >> 11
 * a0 = dstU, a1 = dstV, a2 = width. Full groups of 16 samples per buffer
 * are handled by the LASX loop at label 1; the remaining (width & 15)
 * samples are handled by the scalar loop at label 3.
 */
function chrRangeFromJpeg_lasx
/* t0 = multiplier, t1 = offset that is added before the shift. */
li.w t0, 1799
li.w t1, 4081085
/* xr0 = multiplier replicated into every 16-bit lane. */
xvreplgr2vr.h xr0, t0
/* t2 = number of full 16-sample vectors, t3 = leftover sample count. */
srli.w t2, a2, 4
andi t3, a2, 15
beqz t2, 2f
1:
/* xr1 = 16 samples from dstU, xr2 = 16 samples from dstV. */
xvld xr1, a0, 0
xvld xr2, a1, 0
/* Seed the four 32-bit accumulators with the offset. The multiply-adds
 * below accumulate into them, so they are re-seeded on every iteration. */
xvreplgr2vr.w xr3, t1
xvreplgr2vr.w xr4, t1
xvreplgr2vr.w xr5, t1
xvreplgr2vr.w xr6, t1
/* Widening 16x16->32 multiply-add. xr3/xr4 take the even/odd-indexed
 * dstU samples, xr5/xr6 the even/odd-indexed dstV samples. */
xvmaddwev.w.h xr3, xr0, xr1
xvmaddwod.w.h xr4, xr0, xr1
xvmaddwev.w.h xr5, xr0, xr2
xvmaddwod.w.h xr6, xr0, xr2
xvsrai.w xr3, xr3, 11
xvsrai.w xr4, xr4, 11
xvsrai.w xr5, xr5, 11
xvsrai.w xr6, xr6, 11
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
xvpackev.h xr1, xr4, xr3
xvpackev.h xr2, xr6, xr5
xvst xr1, a0, 0
xvst xr2, a1, 0
addi.d a0, a0, 32
addi.d a1, a1, 32
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Scalar tail: same formula, one dstU and one dstV sample per iteration. */
ld.h t4, a0, 0
ld.h t5, a1, 0
mul.w t4, t4, t0
mul.w t5, t5, t0
add.w t4, t4, t1
add.w t5, t5, t1
srai.w t4, t4, 11
srai.w t5, t5, 11
st.h t4, a0, 0
st.h t5, a1, 0
addi.d a0, a0, 2
addi.d a1, a1, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of `width` 16-bit samples in each of two buffers, with
 * the input clamped from above before scaling:
 *     dstU[i] = (min(dstU[i], 30775) * 4663 - 9289992) >> 12
 *     dstV[i] = (min(dstV[i], 30775) * 4663 - 9289992) >> 12
 * a0 = dstU, a1 = dstV, a2 = width. Full groups of 8 samples per buffer
 * are handled by the LSX loop at label 1; the remaining (width & 7)
 * samples are handled by the loop at label 3.
 */
function chrRangeToJpeg_lsx
/* t0 = multiplier, t1 = (negative) offset, t2 = upper clamp value. */
li.w t0, 4663
li.w t1, -9289992
li.w t2, 30775
/* vr0 = multiplier and vr7 = clamp value, each in every 16-bit lane. */
vreplgr2vr.h vr0, t0
vreplgr2vr.h vr7, t2
/* t2 is reused from here on: t2 = number of full 8-sample vectors,
 * t3 = leftover sample count. */
srli.w t2, a2, 3
andi t3, a2, 7
beqz t2, 2f
1:
/* vr1 = 8 samples from dstU, vr2 = 8 samples from dstV. */
vld vr1, a0, 0
vld vr2, a1, 0
/* Seed the four 32-bit accumulators with the offset. The multiply-adds
 * below accumulate into them, so they are re-seeded on every iteration. */
vreplgr2vr.w vr3, t1
vreplgr2vr.w vr4, t1
vreplgr2vr.w vr5, t1
vreplgr2vr.w vr6, t1
/* Clamp the samples to at most 30775 before multiplying. */
vmin.h vr1, vr1, vr7
vmin.h vr2, vr2, vr7
/* Widening 16x16->32 multiply-add. vr3/vr4 take the even/odd-indexed
 * dstU samples, vr5/vr6 the even/odd-indexed dstV samples. */
vmaddwev.w.h vr3, vr0, vr1
vmaddwod.w.h vr4, vr0, vr1
vmaddwev.w.h vr5, vr0, vr2
vmaddwod.w.h vr6, vr0, vr2
vsrai.w vr3, vr3, 12
vsrai.w vr4, vr4, 12
vsrai.w vr5, vr5, 12
vsrai.w vr6, vr6, 12
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
vpackev.h vr1, vr4, vr3
vpackev.h vr2, vr6, vr5
vst vr1, a0, 0
vst vr2, a1, 0
addi.d a0, a0, 16
addi.d a1, a1, 16
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Tail: one dstU and one dstV sample per iteration. The clamp reuses the
 * vector constant in vr7: broadcast each sample, take the lane-wise
 * minimum, then read lane 0 back into the scalar register. */
ld.h t4, a0, 0
ld.h t5, a1, 0
vreplgr2vr.h vr1, t4
vreplgr2vr.h vr2, t5
vmin.h vr1, vr1, vr7
vmin.h vr2, vr2, vr7
vpickve2gr.h t4, vr1, 0
vpickve2gr.h t5, vr2, 0
mul.w t4, t4, t0
mul.w t5, t5, t0
add.w t4, t4, t1
add.w t5, t5, t1
srai.w t4, t4, 12
srai.w t5, t5, 12
st.h t4, a0, 0
st.h t5, a1, 0
addi.d a0, a0, 2
addi.d a1, a1, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc
/*
 * void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of `width` 16-bit samples in each of two buffers, with
 * the input clamped from above before scaling:
 *     dstU[i] = (min(dstU[i], 30775) * 4663 - 9289992) >> 12
 *     dstV[i] = (min(dstV[i], 30775) * 4663 - 9289992) >> 12
 * a0 = dstU, a1 = dstV, a2 = width. Full groups of 16 samples per buffer
 * are handled by the LASX loop at label 1; the remaining (width & 15)
 * samples are handled by the loop at label 3.
 */
function chrRangeToJpeg_lasx
/* t0 = multiplier, t1 = (negative) offset, t2 = upper clamp value. */
li.w t0, 4663
li.w t1, -9289992
li.w t2, 30775
/* xr0 = multiplier and xr7 = clamp value, each in every 16-bit lane. */
xvreplgr2vr.h xr0, t0
xvreplgr2vr.h xr7, t2
/* t2 is reused from here on: t2 = number of full 16-sample vectors,
 * t3 = leftover sample count. */
srli.w t2, a2, 4
andi t3, a2, 15
beqz t2, 2f
1:
/* xr1 = 16 samples from dstU, xr2 = 16 samples from dstV. */
xvld xr1, a0, 0
xvld xr2, a1, 0
/* Seed the four 32-bit accumulators with the offset. The multiply-adds
 * below accumulate into them, so they are re-seeded on every iteration. */
xvreplgr2vr.w xr3, t1
xvreplgr2vr.w xr4, t1
xvreplgr2vr.w xr5, t1
xvreplgr2vr.w xr6, t1
/* Clamp the samples to at most 30775 before multiplying. */
xvmin.h xr1, xr1, xr7
xvmin.h xr2, xr2, xr7
/* Widening 16x16->32 multiply-add. xr3/xr4 take the even/odd-indexed
 * dstU samples, xr5/xr6 the even/odd-indexed dstV samples. */
xvmaddwev.w.h xr3, xr0, xr1
xvmaddwod.w.h xr4, xr0, xr1
xvmaddwev.w.h xr5, xr0, xr2
xvmaddwod.w.h xr6, xr0, xr2
xvsrai.w xr3, xr3, 12
xvsrai.w xr4, xr4, 12
xvsrai.w xr5, xr5, 12
xvsrai.w xr6, xr6, 12
/* Interleave the low 16 bits of each 32-bit result back into sample order. */
xvpackev.h xr1, xr4, xr3
xvpackev.h xr2, xr6, xr5
xvst xr1, a0, 0
xvst xr2, a1, 0
addi.d a0, a0, 32
addi.d a1, a1, 32
addi.d t2, t2, -1
bnez t2, 1b
2:
beqz t3, 4f
3:
/* Tail: one dstU and one dstV sample per iteration, clamped with 128-bit
 * LSX instructions. vr7 is never written directly in this function; the
 * clamp value was broadcast into xr7 above, so this relies on vr7 being
 * the low 128 bits of xr7. */
ld.h t4, a0, 0
ld.h t5, a1, 0
vreplgr2vr.h vr1, t4
vreplgr2vr.h vr2, t5
vmin.h vr1, vr1, vr7
vmin.h vr2, vr2, vr7
vpickve2gr.h t4, vr1, 0
vpickve2gr.h t5, vr2, 0
mul.w t4, t4, t0
mul.w t5, t5, t0
add.w t4, t4, t1
add.w t5, t5, t1
srai.w t4, t4, 12
srai.w t5, t5, 12
st.h t4, a0, 0
st.h t5, a1, 0
addi.d a0, a0, 2
addi.d a1, a1, 2
addi.d t3, t3, -1
bnez t3, 3b
4:
/* NOTE(review): no explicit return instruction here; presumably the
 * endfunc macro emits it -- confirm against the macro definition. */
endfunc

View File

@ -24,6 +24,38 @@
#include "libswscale/rgb2rgb.h"
#include "libavutil/loongarch/cpu.h"
/**
 * Install the LoongArch SIMD range-conversion callbacks on a scaler context.
 *
 * The context is left untouched unless srcRange differs from dstRange, the
 * destination format is not an RGB format, and dstBpc is at most 14. When
 * those conditions hold, lumConvertRange/chrConvertRange are set to the
 * "FromJpeg" routines if srcRange is non-zero and to the "ToJpeg" routines
 * otherwise. The LSX routines are assigned first; if LASX is also reported
 * by the CPU flags, the LASX routines replace them.
 *
 * @param c scaler context whose range-conversion callbacks may be updated
 */
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
{
    int flags = av_get_cpu_flags();
    int from_jpeg;

    /* Nothing to install when no conversion is needed or supported here. */
    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) || c->dstBpc > 14)
        return;

    from_jpeg = c->srcRange;

    if (have_lsx(flags)) {
        c->lumConvertRange = from_jpeg ? lumRangeFromJpeg_lsx
                                       : lumRangeToJpeg_lsx;
        c->chrConvertRange = from_jpeg ? chrRangeFromJpeg_lsx
                                       : chrRangeToJpeg_lsx;
    }

    /* Checked after LSX so that the wider LASX routines take precedence. */
    if (have_lasx(flags)) {
        c->lumConvertRange = from_jpeg ? lumRangeFromJpeg_lasx
                                       : lumRangeToJpeg_lasx;
        c->chrConvertRange = from_jpeg ? chrRangeFromJpeg_lasx
                                       : chrRangeToJpeg_lasx;
    }
}
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
ff_sws_init_range_convert_loongarch(c);
}
av_cold void rgb2rgb_init_loongarch(void)

View File

@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
/*
 * LSX range-conversion routines. Each one rewrites `width` int16_t samples
 * in place. The lum* variants take a single buffer; the chr* variants take
 * two buffers (dstU, dstV) and apply the same formula to both.
 */
void lumRangeFromJpeg_lsx(int16_t *dst, int width);
void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
void lumRangeToJpeg_lsx(int16_t *dst, int width);
void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
/*
 * LASX range-conversion routines. Each one rewrites `width` int16_t samples
 * in place. The lum* variants take a single buffer; the chr* variants take
 * two buffers (dstU, dstV) and apply the same formula to both.
 */
void lumRangeFromJpeg_lasx(int16_t *dst, int width);
void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
void lumRangeToJpeg_lasx(int16_t *dst, int width);
void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */

View File

@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);

View File

@ -1078,8 +1078,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
c->srcRange = srcRange;
c->dstRange = dstRange;
if (need_reinit)
if (need_reinit) {
ff_sws_init_range_convert(c);
#if ARCH_LOONGARCH64
ff_sws_init_range_convert_loongarch(c);
#endif
}
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);