From f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded Mon Sep 17 00:00:00 2001
From: Shiyou Yin
Date: Sat, 16 Mar 2024 11:03:31 +0800
Subject: [PATCH] swscale: [LA] Optimize range convert for yuvj420p.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: 陈昊
Signed-off-by: Michael Niedermayer
---
 libswscale/loongarch/swscale.S                | 408 ++++++++++++++++++
 libswscale/loongarch/swscale_init_loongarch.c |  39 ++
 libswscale/loongarch/swscale_loongarch.h      |  11 +
 libswscale/swscale_internal.h                 |   1 +
 libswscale/utils.c                            |   6 +-
 5 files changed, 464 insertions(+), 1 deletion(-)

diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,411 @@ function ff_hscale_16_to_19_sub_lsx
     ld.d            s8,     sp,     64
     addi.d          sp,     sp,     72
 endfunc
+
+/*
+ * void lumRangeFromJpeg_lsx(int16_t *dst, int width)
+ * In place: dst[i] = (dst[i] * 14071 + 33561947) >> 14.
+ * The vector loop covers 8 samples per pass, the scalar loop width & 7.
+ */
+function lumRangeFromJpeg_lsx
+    li.w            t0,     14071
+    li.w            t1,     33561947
+    vreplgr2vr.h    vr0,    t0
+    srli.w          t2,     a1,     3
+    andi            t3,     a1,     7
+    beqz            t2,     2f
+1:
+    vld             vr1,    a0,     0
+    vreplgr2vr.w    vr2,    t1
+    vreplgr2vr.w    vr3,    t1
+    vmaddwev.w.h    vr2,    vr0,    vr1
+    vmaddwod.w.h    vr3,    vr0,    vr1
+    vsrai.w         vr2,    vr2,    14
+    vsrai.w         vr3,    vr3,    14
+    vpackev.h       vr1,    vr3,    vr2
+    vst             vr1,    a0,     0
+    addi.d          a0,     a0,     16
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    mul.w           t4,     t4,     t0
+    add.w           t4,     t4,     t1
+    srai.w          t4,     t4,     14
+    st.h            t4,     a0,     0
+    addi.d          a0,     a0,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void lumRangeFromJpeg_lasx(int16_t *dst, int width)
+ * In place: dst[i] = (dst[i] * 14071 + 33561947) >> 14.
+ * The vector loop covers 16 samples per pass, the scalar loop width & 15.
+ */
+function lumRangeFromJpeg_lasx
+    li.w            t0,     14071
+    li.w            t1,     33561947
+    xvreplgr2vr.h   xr0,    t0
+    srli.w          t2,     a1,     4
+    andi            t3,     a1,     15
+    beqz            t2,     2f
+1:
+    xvld            xr1,    a0,     0
+    xvreplgr2vr.w   xr2,    t1
+    xvreplgr2vr.w   xr3,    t1
+    xvmaddwev.w.h   xr2,    xr0,    xr1
+    xvmaddwod.w.h   xr3,    xr0,    xr1
+    xvsrai.w        xr2,    xr2,    14
+    xvsrai.w        xr3,    xr3,    14
+    xvpackev.h      xr1,    xr3,    xr2
+    xvst            xr1,    a0,     0
+    addi.d          a0,     a0,     32
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    mul.w           t4,     t4,     t0
+    add.w           t4,     t4,     t1
+    srai.w          t4,     t4,     14
+    st.h            t4,     a0,     0
+    addi.d          a0,     a0,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void lumRangeToJpeg_lsx(int16_t *dst, int width)
+ * In place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14.
+ * The vector loop covers 8 samples per pass, the scalar loop width & 7.
+ */
+function lumRangeToJpeg_lsx
+    li.w            t0,     19077
+    li.w            t1,     -39057361
+    li.w            t2,     30189
+    vreplgr2vr.h    vr0,    t0
+    vreplgr2vr.h    vr4,    t2
+    srli.w          t2,     a1,     3
+    andi            t3,     a1,     7
+    beqz            t2,     2f
+1:
+    vld             vr1,    a0,     0
+    vreplgr2vr.w    vr2,    t1
+    vreplgr2vr.w    vr3,    t1
+    vmin.h          vr1,    vr1,    vr4
+    vmaddwev.w.h    vr2,    vr0,    vr1
+    vmaddwod.w.h    vr3,    vr0,    vr1
+    vsrai.w         vr2,    vr2,    14
+    vsrai.w         vr3,    vr3,    14
+    vpackev.h       vr1,    vr3,    vr2
+    vst             vr1,    a0,     0
+    addi.d          a0,     a0,     16
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    vreplgr2vr.h    vr1,    t4
+    vmin.h          vr1,    vr1,    vr4
+    vpickve2gr.h    t4,     vr1,    0
+    mul.w           t4,     t4,     t0
+    add.w           t4,     t4,     t1
+    srai.w          t4,     t4,     14
+    st.h            t4,     a0,     0
+    addi.d          a0,     a0,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void lumRangeToJpeg_lasx(int16_t *dst, int width)
+ * In place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14.
+ * The vector loop covers 16 samples per pass, the scalar loop width & 15.
+ */
+function lumRangeToJpeg_lasx
+    li.w            t0,     19077
+    li.w            t1,     -39057361
+    li.w            t2,     30189
+    xvreplgr2vr.h   xr0,    t0
+    xvreplgr2vr.h   xr4,    t2
+    srli.w          t2,     a1,     4
+    andi            t3,     a1,     15
+    beqz            t2,     2f
+1:
+    xvld            xr1,    a0,     0
+    xvreplgr2vr.w   xr2,    t1
+    xvreplgr2vr.w   xr3,    t1
+    xvmin.h         xr1,    xr1,    xr4
+    xvmaddwev.w.h   xr2,    xr0,    xr1
+    xvmaddwod.w.h   xr3,    xr0,    xr1
+    xvsrai.w        xr2,    xr2,    14
+    xvsrai.w        xr3,    xr3,    14
+    xvpackev.h      xr1,    xr3,    xr2
+    xvst            xr1,    a0,     0
+    addi.d          a0,     a0,     32
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    vreplgr2vr.h    vr1,    t4
+    vmin.h          vr1,    vr1,    vr4
+    vpickve2gr.h    t4,     vr1,    0
+    mul.w           t4,     t4,     t0
+    add.w           t4,     t4,     t1
+    srai.w          t4,     t4,     14
+    st.h            t4,     a0,     0
+    addi.d          a0,     a0,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
+ * In place, on both planes: x = (x * 1799 + 4081085) >> 11.
+ * The vector loop covers 8 samples per pass, the scalar loop width & 7.
+ */
+function chrRangeFromJpeg_lsx
+    li.w            t0,     1799
+    li.w            t1,     4081085
+    vreplgr2vr.h    vr0,    t0
+    srli.w          t2,     a2,     3
+    andi            t3,     a2,     7
+    beqz            t2,     2f
+1:
+    vld             vr1,    a0,     0
+    vld             vr2,    a1,     0
+    vreplgr2vr.w    vr3,    t1
+    vreplgr2vr.w    vr4,    t1
+    vreplgr2vr.w    vr5,    t1
+    vreplgr2vr.w    vr6,    t1
+    vmaddwev.w.h    vr3,    vr0,    vr1
+    vmaddwod.w.h    vr4,    vr0,    vr1
+    vmaddwev.w.h    vr5,    vr0,    vr2
+    vmaddwod.w.h    vr6,    vr0,    vr2
+    vsrai.w         vr3,    vr3,    11
+    vsrai.w         vr4,    vr4,    11
+    vsrai.w         vr5,    vr5,    11
+    vsrai.w         vr6,    vr6,    11
+    vpackev.h       vr1,    vr4,    vr3
+    vpackev.h       vr2,    vr6,    vr5
+    vst             vr1,    a0,     0
+    vst             vr2,    a1,     0
+    addi.d          a0,     a0,     16
+    addi.d          a1,     a1,     16
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    ld.h            t5,     a1,     0
+    mul.w           t4,     t4,     t0
+    mul.w           t5,     t5,     t0
+    add.w           t4,     t4,     t1
+    add.w           t5,     t5,     t1
+    srai.w          t4,     t4,     11
+    srai.w          t5,     t5,     11
+    st.h            t4,     a0,     0
+    st.h            t5,     a1,     0
+    addi.d          a0,     a0,     2
+    addi.d          a1,     a1,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
+ * In place, on both planes: x = (x * 1799 + 4081085) >> 11.
+ * The vector loop covers 16 samples per pass, the scalar loop width & 15.
+ */
+function chrRangeFromJpeg_lasx
+    li.w            t0,     1799
+    li.w            t1,     4081085
+    xvreplgr2vr.h   xr0,    t0
+    srli.w          t2,     a2,     4
+    andi            t3,     a2,     15
+    beqz            t2,     2f
+1:
+    xvld            xr1,    a0,     0
+    xvld            xr2,    a1,     0
+    xvreplgr2vr.w   xr3,    t1
+    xvreplgr2vr.w   xr4,    t1
+    xvreplgr2vr.w   xr5,    t1
+    xvreplgr2vr.w   xr6,    t1
+    xvmaddwev.w.h   xr3,    xr0,    xr1
+    xvmaddwod.w.h   xr4,    xr0,    xr1
+    xvmaddwev.w.h   xr5,    xr0,    xr2
+    xvmaddwod.w.h   xr6,    xr0,    xr2
+    xvsrai.w        xr3,    xr3,    11
+    xvsrai.w        xr4,    xr4,    11
+    xvsrai.w        xr5,    xr5,    11
+    xvsrai.w        xr6,    xr6,    11
+    xvpackev.h      xr1,    xr4,    xr3
+    xvpackev.h      xr2,    xr6,    xr5
+    xvst            xr1,    a0,     0
+    xvst            xr2,    a1,     0
+    addi.d          a0,     a0,     32
+    addi.d          a1,     a1,     32
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    ld.h            t5,     a1,     0
+    mul.w           t4,     t4,     t0
+    mul.w           t5,     t5,     t0
+    add.w           t4,     t4,     t1
+    add.w           t5,     t5,     t1
+    srai.w          t4,     t4,     11
+    srai.w          t5,     t5,     11
+    st.h            t4,     a0,     0
+    st.h            t5,     a1,     0
+    addi.d          a0,     a0,     2
+    addi.d          a1,     a1,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
+ * In place, on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12.
+ * The vector loop covers 8 samples per pass, the scalar loop width & 7.
+ */
+function chrRangeToJpeg_lsx
+    li.w            t0,     4663
+    li.w            t1,     -9289992
+    li.w            t2,     30775
+    vreplgr2vr.h    vr0,    t0
+    vreplgr2vr.h    vr7,    t2
+    srli.w          t2,     a2,     3
+    andi            t3,     a2,     7
+    beqz            t2,     2f
+1:
+    vld             vr1,    a0,     0
+    vld             vr2,    a1,     0
+    vreplgr2vr.w    vr3,    t1
+    vreplgr2vr.w    vr4,    t1
+    vreplgr2vr.w    vr5,    t1
+    vreplgr2vr.w    vr6,    t1
+    vmin.h          vr1,    vr1,    vr7
+    vmin.h          vr2,    vr2,    vr7
+    vmaddwev.w.h    vr3,    vr0,    vr1
+    vmaddwod.w.h    vr4,    vr0,    vr1
+    vmaddwev.w.h    vr5,    vr0,    vr2
+    vmaddwod.w.h    vr6,    vr0,    vr2
+    vsrai.w         vr3,    vr3,    12
+    vsrai.w         vr4,    vr4,    12
+    vsrai.w         vr5,    vr5,    12
+    vsrai.w         vr6,    vr6,    12
+    vpackev.h       vr1,    vr4,    vr3
+    vpackev.h       vr2,    vr6,    vr5
+    vst             vr1,    a0,     0
+    vst             vr2,    a1,     0
+    addi.d          a0,     a0,     16
+    addi.d          a1,     a1,     16
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    ld.h            t5,     a1,     0
+    vreplgr2vr.h    vr1,    t4
+    vreplgr2vr.h    vr2,    t5
+    vmin.h          vr1,    vr1,    vr7
+    vmin.h          vr2,    vr2,    vr7
+    vpickve2gr.h    t4,     vr1,    0
+    vpickve2gr.h    t5,     vr2,    0
+    mul.w           t4,     t4,     t0
+    mul.w           t5,     t5,     t0
+    add.w           t4,     t4,     t1
+    add.w           t5,     t5,     t1
+    srai.w          t4,     t4,     12
+    srai.w          t5,     t5,     12
+    st.h            t4,     a0,     0
+    st.h            t5,     a1,     0
+    addi.d          a0,     a0,     2
+    addi.d          a1,     a1,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
+
+/*
+ * void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
+ * In place, on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12.
+ * The vector loop covers 16 samples per pass, the scalar loop width & 15.
+ */
+function chrRangeToJpeg_lasx
+    li.w            t0,     4663
+    li.w            t1,     -9289992
+    li.w            t2,     30775
+    xvreplgr2vr.h   xr0,    t0
+    xvreplgr2vr.h   xr7,    t2
+    srli.w          t2,     a2,     4
+    andi            t3,     a2,     15
+    beqz            t2,     2f
+1:
+    xvld            xr1,    a0,     0
+    xvld            xr2,    a1,     0
+    xvreplgr2vr.w   xr3,    t1
+    xvreplgr2vr.w   xr4,    t1
+    xvreplgr2vr.w   xr5,    t1
+    xvreplgr2vr.w   xr6,    t1
+    xvmin.h         xr1,    xr1,    xr7
+    xvmin.h         xr2,    xr2,    xr7
+    xvmaddwev.w.h   xr3,    xr0,    xr1
+    xvmaddwod.w.h   xr4,    xr0,    xr1
+    xvmaddwev.w.h   xr5,    xr0,    xr2
+    xvmaddwod.w.h   xr6,    xr0,    xr2
+    xvsrai.w        xr3,    xr3,    12
+    xvsrai.w        xr4,    xr4,    12
+    xvsrai.w        xr5,    xr5,    12
+    xvsrai.w        xr6,    xr6,    12
+    xvpackev.h      xr1,    xr4,    xr3
+    xvpackev.h      xr2,    xr6,    xr5
+    xvst            xr1,    a0,     0
+    xvst            xr2,    a1,     0
+    addi.d          a0,     a0,     32
+    addi.d          a1,     a1,     32
+    addi.d          t2,     t2,     -1
+    bnez            t2,     1b
+2:
+    beqz            t3,     4f
+3:
+    ld.h            t4,     a0,     0
+    ld.h            t5,     a1,     0
+    vreplgr2vr.h    vr1,    t4
+    vreplgr2vr.h    vr2,    t5
+    vmin.h          vr1,    vr1,    vr7
+    vmin.h          vr2,    vr2,    vr7
+    vpickve2gr.h    t4,     vr1,    0
+    vpickve2gr.h    t5,     vr2,    0
+    mul.w           t4,     t4,     t0
+    mul.w           t5,     t5,     t0
+    add.w           t4,     t4,     t1
+    add.w           t5,     t5,     t1
+    srai.w          t4,     t4,     12
+    srai.w          t5,     t5,     12
+    st.h            t4,     a0,     0
+    st.h            t5,     a1,     0
+    addi.d          a0,     a0,     2
+    addi.d          a1,     a1,     2
+    addi.d          t3,     t3,     -1
+    bnez            t3,     3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,44 @@
 #include "libswscale/rgb2rgb.h"
 #include "libavutil/loongarch/cpu.h"
 
+/**
+ * Select the LoongArch SIMD range-conversion routines for a context.
+ *
+ * The function pointers are left untouched unless srcRange and dstRange
+ * differ, the destination format is not RGB and dstBpc is at most 14.
+ * The LASX assignment comes last, so it overrides LSX when both are usable.
+ */
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) ||
+        c->dstBpc > 14)
+        return;
+
+    if (have_lsx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lsx;
+            c->chrConvertRange = chrRangeFromJpeg_lsx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lsx;
+            c->chrConvertRange = chrRangeToJpeg_lsx;
+        }
+    }
+#if HAVE_LASX
+    /* Same compile-time guard as in ff_sws_init_swscale_loongarch(). */
+    if (have_lasx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lasx;
+            c->chrConvertRange = chrRangeFromJpeg_lasx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lasx;
+            c->chrConvertRange = chrRangeToJpeg_lasx;
+        }
+    }
+#endif // #if HAVE_LASX
+}
+
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -77,6 +115,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
 #endif // #if HAVE_LASX
+    ff_sws_init_range_convert_loongarch(c);
 }
 
 av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
                                 const uint8_t *_src, const int16_t *filter,
                                 const int32_t *filterPos, int filterSize, int sh);
 
+void lumRangeFromJpeg_lsx(int16_t *dst, int width);
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lsx(int16_t *dst, int width);
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width,
                           int32_t *rgb2yuv, void *opq);
 
@@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                              const uint8_t *_src, const int16_t *filter,
                              const int32_t *filterPos, int filterSize);
 
+void lumRangeFromJpeg_lasx(int16_t *dst, int width);
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lasx(int16_t *dst, int width);
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width,
                            int32_t *rgb2yuv, void *opq);
 
@@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const uint8_t *dither, int offset);
 
 av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
 #endif // #if HAVE_LASX
 
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index c2cc736dd2..d4b0c3cee2 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
 void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index df14eb016c..476a24fea5 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1078,8 +1078,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
     c->srcRange = srcRange;
     c->dstRange = dstRange;
 
-    if (need_reinit)
+    if (need_reinit) {
         ff_sws_init_range_convert(c);
+#if ARCH_LOONGARCH64
+        ff_sws_init_range_convert_loongarch(c);
+#endif
+    }
 
     c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
     c->srcFormatBpp = av_get_bits_per_pixel(desc_src);