swscale: [LA] Optimize swscale funcs in input.c

Optimized 7 funcs with LSX and LASX: 1. yuy2ToUV_c 2. yvy2ToUV_c 3. uyvyToUV_c 4. nv12ToUV_c 5. nv21ToUV_c 6. abgrToA_c 7. rgbaToA_c Reviewed-by: colleague of Shiyou Yin Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2024-03-16 11:03:33 +08:00 · 2024-03-16 11:03:33 +08:00 · 2a7d622ddd
parent 8b76df9142
commit 2a7d622ddd
6 changed files with 652 additions and 18 deletions
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@ -9,4 +9,5 @@ LSX-OBJS-$(CONFIG_SWSCALE)  += loongarch/swscale.o \
                               loongarch/input.o   \
                               loongarch/output.o  \
                               loongarch/output_lsx.o  \
+                               loongarch/input_lsx.o   \
                               loongarch/yuv2rgb_lsx.o
--- a/libswscale/loongarch/input.S
+++ b/libswscale/loongarch/input.S
@ -283,3 +283,498 @@ function planar_rgb_to_uv_lsx
    ld.d            s3,     sp,    16
    addi.d          sp,     sp,    24
 endfunc
+
+/*
+ * void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ *
+ * For every output index i: dstU[i] = src1[4 * i + 1], dstV[i] = src1[4 * i + 3].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The LSX loop emits 8 U and 8 V bytes per pass; a byte loop handles width % 8.
+ */
+function yuy2ToUV_lsx
+    /* t0 = width % 8 (scalar tail count), a5 = width / 8 (vector passes). */
+    andi         t0,    a5,    7
+    srli.d       a5,    a5,    3
+    beqz         a5,    2f
+1:
+    /*
+     * Loading from src1 + 1 puts the chroma bytes at even byte indices.
+     * NOTE(review): the second load ends 1 byte past the 32 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    vld          vr0,   a3,    1
+    vld          vr1,   a3,    17
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32
+    /* vr2 = U0 V0 U1 V1 ... U7 V7, then split into even (U) and odd (V) bytes. */
+    vpickev.b    vr2,   vr1,   vr0
+    vpickev.b    vr0,   vr2,   vr2
+    vpickod.b    vr1,   vr2,   vr2
+    /* f0/f1 are the low 64 bits of vr0/vr1: 8 result bytes each. */
+    fst.d        f0,    a0,    0
+    fst.d        f1,    a1,    0
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: one U/V pair per 4 source bytes. */
+    ld.b         t1,    a3,    1
+    ld.b         t2,    a3,    3
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: dstU[i] = src1[4 * i + 1], dstV[i] = src1[4 * i + 3].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The vector loop emits 16 U and 16 V bytes per pass; a byte loop handles
+ * width % 16.
+ */
+function yuy2ToUV_lasx
+    /* t0 = width % 16 (scalar tail count), a5 = width / 16 (vector passes). */
+    andi         t0,    a5,    15
+    srli.d       a5,    a5,    4
+    beqz         a5,    2f
+1:
+    /*
+     * Loading from src1 + 1 puts the chroma bytes at even byte indices.
+     * NOTE(review): the second load ends 1 byte past the 64 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    xvld         xr0,   a3,    1
+    xvld         xr1,   a3,    33
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64
+    /*
+     * xvpickev.b/xvpickod.b work per 128-bit lane; xvpermi.d 0xd8 reorders the
+     * 64-bit elements as 0, 2, 1, 3 to restore linear order afterwards.
+     */
+    xvpickev.b   xr2,   xr1,   xr0
+    xvpermi.d    xr2,   xr2,   0xd8
+    xvpickev.b   xr0,   xr2,   xr2
+    xvpermi.d    xr0,   xr0,   0xd8
+    xvpickod.b   xr1,   xr2,   xr2
+    xvpermi.d    xr1,   xr1,   0xd8
+    /* vr0/vr1 are the low 128 bits of xr0/xr1: 16 result bytes each. */
+    vst          vr0,   a0,    0
+    vst          vr1,   a1,    0
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: one U/V pair per 4 source bytes. */
+    ld.b         t1,    a3,    1
+    ld.b         t2,    a3,    3
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ *
+ * For every output index i: dstV[i] = src1[4 * i + 1], dstU[i] = src1[4 * i + 3].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The LSX loop emits 8 U and 8 V bytes per pass; a byte loop handles width % 8.
+ */
+function yvy2ToUV_lsx
+    /* t0 = width % 8 (scalar tail count), a5 = width / 8 (vector passes). */
+    andi         t0,    a5,    7
+    srli.d       a5,    a5,    3
+    beqz         a5,    2f
+1:
+    /*
+     * Loading from src1 + 1 puts the chroma bytes at even byte indices.
+     * NOTE(review): the second load ends 1 byte past the 32 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    vld          vr0,   a3,    1
+    vld          vr1,   a3,    17
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32
+    /* vr2 = V0 U0 V1 U1 ... V7 U7, then split into even (V) and odd (U) bytes. */
+    vpickev.b    vr2,   vr1,   vr0
+    vpickev.b    vr0,   vr2,   vr2
+    vpickod.b    vr1,   vr2,   vr2
+    /* Even bytes go to dstV (a1), odd bytes to dstU (a0). */
+    fst.d        f0,    a1,    0
+    fst.d        f1,    a0,    0
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 1 -> dstV, byte 3 -> dstU. */
+    ld.b         t1,    a3,    1
+    ld.b         t2,    a3,    3
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: dstV[i] = src1[4 * i + 1], dstU[i] = src1[4 * i + 3].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The vector loop emits 16 U and 16 V bytes per pass; a byte loop handles
+ * width % 16.
+ */
+function yvy2ToUV_lasx
+    /* t0 = width % 16 (scalar tail count), a5 = width / 16 (vector passes). */
+    andi         t0,    a5,    15
+    srli.d       a5,    a5,    4
+    beqz         a5,    2f
+1:
+    /*
+     * Loading from src1 + 1 puts the chroma bytes at even byte indices.
+     * NOTE(review): the second load ends 1 byte past the 64 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    xvld         xr0,   a3,    1
+    xvld         xr1,   a3,    33
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64
+    /*
+     * xvpickev.b/xvpickod.b work per 128-bit lane; xvpermi.d 0xd8 reorders the
+     * 64-bit elements as 0, 2, 1, 3 to restore linear order afterwards.
+     */
+    xvpickev.b   xr2,   xr1,   xr0
+    xvpermi.d    xr2,   xr2,   0xd8
+    xvpickev.b   xr0,   xr2,   xr2
+    xvpermi.d    xr0,   xr0,   0xd8
+    xvpickod.b   xr1,   xr2,   xr2
+    xvpermi.d    xr1,   xr1,   0xd8
+    /* Even bytes go to dstV (a1), odd bytes to dstU (a0); 16 bytes each. */
+    vst          vr0,   a1,    0
+    vst          vr1,   a0,    0
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 1 -> dstV, byte 3 -> dstU. */
+    ld.b         t1,    a3,    1
+    ld.b         t2,    a3,    3
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ *
+ * For every output index i: dstU[i] = src1[4 * i + 0], dstV[i] = src1[4 * i + 2].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The LSX loop emits 8 U and 8 V bytes per pass; a byte loop handles width % 8
+ * and must read the same byte positions as the vector loop.
+ */
+function uyvyToUV_lsx
+    /* t0 = width % 8 (scalar tail count), a5 = width / 8 (vector passes). */
+    andi         t0,    a5,    7
+    srli.d       a5,    a5,    3
+    beqz         a5,    2f
+1:
+    /* Chroma already sits at even byte indices, so load from src1 + 0. */
+    vld          vr0,   a3,    0
+    vld          vr1,   a3,    16
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32
+    /* vr2 = U0 V0 U1 V1 ... U7 V7, then split into even (U) and odd (V) bytes. */
+    vpickev.b    vr2,   vr1,   vr0
+    vpickev.b    vr0,   vr2,   vr2
+    vpickod.b    vr1,   vr2,   vr2
+    /* f0/f1 are the low 64 bits of vr0/vr1: 8 result bytes each. */
+    fst.d        f0,    a0,    0
+    fst.d        f1,    a1,    0
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: U at byte 0 and V at byte 2, matching the vector loop. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    2
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: dstU[i] = src1[4 * i + 0], dstV[i] = src1[4 * i + 2].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The vector loop emits 16 U and 16 V bytes per pass; a byte loop handles
+ * width % 16 and must read the same byte positions as the vector loop.
+ */
+function uyvyToUV_lasx
+    /* t0 = width % 16 (scalar tail count), a5 = width / 16 (vector passes). */
+    andi         t0,    a5,    15
+    srli.d       a5,    a5,    4
+    beqz         a5,    2f
+1:
+    /* Chroma already sits at even byte indices, so load from src1 + 0. */
+    xvld         xr0,   a3,    0
+    xvld         xr1,   a3,    32
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64
+    /*
+     * xvpickev.b/xvpickod.b work per 128-bit lane; xvpermi.d 0xd8 reorders the
+     * 64-bit elements as 0, 2, 1, 3 to restore linear order afterwards.
+     */
+    xvpickev.b   xr2,   xr1,   xr0
+    xvpermi.d    xr2,   xr2,   0xd8
+    xvpickev.b   xr0,   xr2,   xr2
+    xvpermi.d    xr0,   xr0,   0xd8
+    xvpickod.b   xr1,   xr2,   xr2
+    xvpermi.d    xr1,   xr1,   0xd8
+    /* vr0/vr1 are the low 128 bits of xr0/xr1: 16 result bytes each. */
+    vst          vr0,   a0,    0
+    vst          vr1,   a1,    0
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: U at byte 0 and V at byte 2, matching the vector loop. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    2
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ *
+ * De-interleaves a UV byte stream: dstU[i] = src1[2 * i], dstV[i] = src1[2 * i + 1].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The LSX loop emits 16 U and 16 V bytes per pass; a byte loop handles width % 16.
+ */
+function nv12ToUV_lsx
+    /* t0 = width % 16 (scalar tail count), a5 = width / 16 (vector passes). */
+    andi         t0,    a5,    15
+    srli.d       a5,    a5,    4
+    beqz         a5,    2f
+1:
+    vld          vr0,   a3,    0
+    vld          vr1,   a3,    16
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32
+    /* Even bytes of the 32 loaded bytes -> vr2, odd bytes -> vr3. */
+    vpickev.b    vr2,   vr1,   vr0
+    vpickod.b    vr3,   vr1,   vr0
+    vst          vr2,   a0,    0
+    vst          vr3,   a1,    0
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 0 -> dstU, byte 1 -> dstV. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    1
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: dstU[i] = src1[2 * i], dstV[i] = src1[2 * i + 1].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The vector loop emits 32 U and 32 V bytes per pass; a byte loop handles
+ * width % 32.
+ */
+function nv12ToUV_lasx
+    /* t0 = width % 32 (scalar tail count), a5 = width / 32 (vector passes). */
+    andi         t0,    a5,    31
+    srli.d       a5,    a5,    5
+    beqz         a5,    2f
+1:
+    xvld         xr0,   a3,    0
+    xvld         xr1,   a3,    32
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64
+    /*
+     * Even bytes -> xr2, odd bytes -> xr3 (per 128-bit lane); xvpermi.d 0xd8
+     * reorders the 64-bit elements as 0, 2, 1, 3 to restore linear order.
+     */
+    xvpickev.b   xr2,   xr1,   xr0
+    xvpickod.b   xr3,   xr1,   xr0
+    xvpermi.d    xr2,   xr2,   0xd8
+    xvpermi.d    xr3,   xr3,   0xd8
+    xvst         xr2,   a0,    0
+    xvst         xr3,   a1,    0
+    addi.d       a0,    a0,    32
+    addi.d       a1,    a1,    32
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 0 -> dstU, byte 1 -> dstV. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    1
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ *
+ * De-interleaves a VU byte stream: dstV[i] = src1[2 * i], dstU[i] = src1[2 * i + 1].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The LSX loop emits 16 U and 16 V bytes per pass; a byte loop handles width % 16.
+ */
+function nv21ToUV_lsx
+    /* t0 = width % 16 (scalar tail count), a5 = width / 16 (vector passes). */
+    andi         t0,    a5,    15
+    srli.d       a5,    a5,    4
+    beqz         a5,    2f
+1:
+    vld          vr0,   a3,    0
+    vld          vr1,   a3,    16
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32
+    /* Even bytes of the 32 loaded bytes -> vr2, odd bytes -> vr3. */
+    vpickev.b    vr2,   vr1,   vr0
+    vpickod.b    vr3,   vr1,   vr0
+    /* Even bytes go to dstV (a1), odd bytes to dstU (a0). */
+    vst          vr2,   a1,    0
+    vst          vr3,   a0,    0
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 0 -> dstV, byte 1 -> dstU. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    1
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: dstV[i] = src1[2 * i], dstU[i] = src1[2 * i + 1].
+ * Registers read: a0 = dstU, a1 = dstV, a3 = src1, a5 = width.
+ * The vector loop emits 32 U and 32 V bytes per pass; a byte loop handles
+ * width % 32.
+ */
+function nv21ToUV_lasx
+    /* t0 = width % 32 (scalar tail count), a5 = width / 32 (vector passes). */
+    andi         t0,    a5,    31
+    srli.d       a5,    a5,    5
+    beqz         a5,    2f
+1:
+    xvld         xr0,   a3,    0
+    xvld         xr1,   a3,    32
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64
+    /*
+     * Even bytes -> xr2, odd bytes -> xr3 (per 128-bit lane); xvpermi.d 0xd8
+     * reorders the 64-bit elements as 0, 2, 1, 3 to restore linear order.
+     */
+    xvpickev.b   xr2,   xr1,   xr0
+    xvpickod.b   xr3,   xr1,   xr0
+    xvpermi.d    xr2,   xr2,   0xd8
+    xvpermi.d    xr3,   xr3,   0xd8
+    /* Even bytes go to dstV (a1), odd bytes to dstU (a0). */
+    xvst         xr2,   a1,    0
+    xvst         xr3,   a0,    0
+    addi.d       a0,    a0,    32
+    addi.d       a1,    a1,    32
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: byte 0 -> dstV, byte 1 -> dstU. */
+    ld.b         t1,    a3,    0
+    ld.b         t2,    a3,    1
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ *                  const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ *
+ * For every pixel i, with a = src[4 * i] taken as an unsigned byte, stores the
+ * 16-bit value (a << 6) | (a >> 2) at byte offset 2 * i of _dst.
+ * Registers read: a0 = _dst, a1 = src, a4 = width.
+ * The LSX loop converts 8 pixels per pass; a scalar loop handles width % 8 and
+ * must read the same byte position as the vector loop.
+ */
+function abgrToA_lsx
+    /* t0 = width % 8 (scalar tail count), a4 = width / 8 (vector passes). */
+    andi         t0,    a4,    7
+    srli.d       a4,    a4,    3
+    /* vr0 = 0, used as the high byte when widening to 16 bits. */
+    vxor.v       vr0,   vr0,   vr0
+    beqz         a4,    2f
+1:
+    vld          vr1,   a1,    0
+    vld          vr2,   a1,    16
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    32
+    /* Keep source bytes 0, 2, 4, ...; alpha is every second one of those. */
+    vpickev.b    vr3,   vr2,   vr1
+    /* Pair each even byte of vr3 with a zero byte: 8 zero-extended alphas. */
+    vpackev.b    vr3,   vr0,   vr3
+    vslli.h      vr1,   vr3,   6
+    vsrli.h      vr2,   vr3,   2
+    vor.v        vr3,   vr2,   vr1
+    vst          vr3,   a0,    0
+    addi.d       a0,    a0,    16
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: alpha is byte 0 of the pixel, matching the vector loop. */
+    ld.b         t1,    a1,    0
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    /* ld.b sign-extends; mask back to the unsigned byte value. */
+    andi         t1,    t1,    0xff
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: for every pixel i, with a = src[4 * i] taken as an unsigned
+ * byte, stores the 16-bit value (a << 6) | (a >> 2) at byte offset 2 * i of _dst.
+ * Registers read: a0 = _dst, a1 = src, a4 = width.
+ * The vector loop converts 16 pixels per pass; a scalar loop handles
+ * width % 16 and must read the same byte position as the vector loop.
+ */
+function abgrToA_lasx
+    /* t0 = width % 16 (scalar tail count), a4 = width / 16 (vector passes). */
+    andi         t0,    a4,    15
+    srli.d       a4,    a4,    4
+    /* xr0 = 0, used as the high byte when widening to 16 bits. */
+    xvxor.v      xr0,   xr0,   xr0
+    beqz         a4,    2f
+1:
+    xvld         xr1,   a1,    0
+    xvld         xr2,   a1,    32
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    64
+    /*
+     * Keep source bytes 0, 2, 4, ... (per 128-bit lane); xvpermi.d 0xd8
+     * reorders the 64-bit elements as 0, 2, 1, 3 to restore linear order.
+     */
+    xvpickev.b   xr3,   xr2,   xr1
+    xvpermi.d    xr3,   xr3,   0xd8
+    /* Pair each even byte of xr3 with a zero byte: 16 zero-extended alphas. */
+    xvpackev.b   xr3,   xr0,   xr3
+    xvslli.h     xr1,   xr3,   6
+    xvsrli.h     xr2,   xr3,   2
+    xvor.v       xr3,   xr2,   xr1
+    xvst         xr3,   a0,    0
+    addi.d       a0,    a0,    32
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: alpha is byte 0 of the pixel, matching the vector loop. */
+    ld.b         t1,    a1,    0
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    /* ld.b sign-extends; mask back to the unsigned byte value. */
+    andi         t1,    t1,    0xff
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ *void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ *                 const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ *
+ * For every pixel i, with a = src[4 * i + 3] taken as an unsigned byte, stores
+ * the 16-bit value (a << 6) | (a >> 2) at byte offset 2 * i of _dst.
+ * Registers read: a0 = _dst, a1 = src, a4 = width.
+ * The LSX loop converts 8 pixels per pass; a scalar loop handles width % 8.
+ */
+function rgbaToA_lsx
+    /* t0 = width % 8 (scalar tail count), a4 = width / 8 (vector passes). */
+    andi         t0,    a4,    7
+    srli.d       a4,    a4,    3
+    /* vr0 = 0, used as the high byte when widening to 16 bits. */
+    vxor.v       vr0,   vr0,   vr0
+    beqz         a4,    2f
+1:
+    /*
+     * Loading from src + 3 puts the alpha bytes at indices 0, 4, 8, ...
+     * NOTE(review): the second load ends 3 bytes past the 32 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    vld          vr1,   a1,    3
+    vld          vr2,   a1,    19
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    32
+    /* Keep every second loaded byte; alpha is every second one of those. */
+    vpickev.b    vr3,   vr2,   vr1
+    /* Pair each even byte of vr3 with a zero byte: 8 zero-extended alphas. */
+    vpackev.b    vr3,   vr0,   vr3
+    vslli.h      vr1,   vr3,   6
+    vsrli.h      vr2,   vr3,   2
+    vor.v        vr3,   vr2,   vr1
+    vst          vr3,   a0,    0
+    addi.d       a0,    a0,    16
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: alpha is byte 3 of the pixel. */
+    ld.b         t1,    a1,    3
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    /* ld.b sign-extends; mask back to the unsigned byte value. */
+    andi         t1,    t1,    0xff
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:
+endfunc
+
+/*
+ * LASX variant: for every pixel i, with a = src[4 * i + 3] taken as an
+ * unsigned byte, stores the 16-bit value (a << 6) | (a >> 2) at byte offset
+ * 2 * i of _dst.
+ * Registers read: a0 = _dst, a1 = src, a4 = width.
+ * The vector loop converts 16 pixels per pass; a scalar loop handles width % 16.
+ */
+function rgbaToA_lasx
+    /* t0 = width % 16 (scalar tail count), a4 = width / 16 (vector passes). */
+    andi         t0,    a4,    15
+    srli.d       a4,    a4,    4
+    /* xr0 = 0, used as the high byte when widening to 16 bits. */
+    xvxor.v      xr0,   xr0,   xr0
+    beqz         a4,    2f
+1:
+    /*
+     * Loading from src + 3 puts the alpha bytes at indices 0, 4, 8, ...
+     * NOTE(review): the second load ends 3 bytes past the 64 bytes consumed
+     * per pass - confirm the source line is padded.
+     */
+    xvld         xr1,   a1,    3
+    xvld         xr2,   a1,    35
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    64
+    /*
+     * Keep every second loaded byte (per 128-bit lane); xvpermi.d 0xd8
+     * reorders the 64-bit elements as 0, 2, 1, 3 to restore linear order.
+     */
+    xvpickev.b   xr3,   xr2,   xr1
+    xvpermi.d    xr3,   xr3,   0xd8
+    /* Pair each even byte of xr3 with a zero byte: 16 zero-extended alphas. */
+    xvpackev.b   xr3,   xr0,   xr3
+    xvslli.h     xr1,   xr3,   6
+    xvsrli.h     xr2,   xr3,   2
+    xvor.v       xr3,   xr2,   xr1
+    xvst         xr3,   a0,    0
+    addi.d       a0,    a0,    32
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f
+3:
+    /* Scalar tail: alpha is byte 3 of the pixel. */
+    ld.b         t1,    a1,    3
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    /* ld.b sign-extends; mask back to the unsigned byte value. */
+    andi         t1,    t1,    0xff
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:
+endfunc
--- a/libswscale/loongarch/input_lasx.c
+++ b/libswscale/loongarch/input_lasx.c
@ -200,3 +200,46 @@ void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
        dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
    }
 }
+
+/**
+ * Install the LASX input readers that match c->srcFormat.
+ *
+ * Source formats not listed here leave the corresponding function pointers
+ * untouched. c->alpToYV12 is only replaced when c->needAlpha is set.
+ *
+ * @param c scaler context whose chrToYV12 / readChrPlanar / readLumPlanar /
+ *          alpToYV12 pointers may be overwritten
+ */
+av_cold void ff_sws_init_input_lasx(SwsContext *c)
+{
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lasx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        /*
+         * Both planar RGB readers are installed here; the caller in
+         * swscale_init_loongarch.c no longer sets them itself.
+         */
+        c->readChrPlanar = planar_rgb_to_uv_lasx;
+        c->readLumPlanar = planar_rgb_to_y_lasx;
+        break;
+    }
+
+    if (c->needAlpha) {
+        switch (srcFormat) {
+        /* Alpha is the last byte of each 4-byte pixel. */
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lasx;
+            break;
+        /* Alpha is the first byte of each 4-byte pixel. */
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lasx;
+            break;
+        }
+    }
+}
--- a/libswscale/loongarch/input_lsx.c
+++ b/libswscale/loongarch/input_lsx.c
@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin<yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+/**
+ * Install the LSX input readers that match c->srcFormat.
+ *
+ * Source formats not listed here leave the corresponding function pointers
+ * untouched. c->alpToYV12 is only replaced when c->needAlpha is set.
+ *
+ * @param c scaler context whose chrToYV12 / readChrPlanar / readLumPlanar /
+ *          alpToYV12 pointers may be overwritten
+ */
+av_cold void ff_sws_init_input_lsx(SwsContext *c)
+{
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        /*
+         * Both planar RGB readers are installed here; the caller in
+         * swscale_init_loongarch.c no longer sets them itself.
+         */
+        c->readChrPlanar = planar_rgb_to_uv_lsx;
+        c->readLumPlanar = planar_rgb_to_y_lsx;
+        break;
+    }
+
+    if (c->needAlpha) {
+        switch (srcFormat) {
+        /* Alpha is the last byte of each 4-byte pixel. */
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lsx;
+            break;
+        /* Alpha is the first byte of each 4-byte pixel. */
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lsx;
+            break;
+        }
+    }
+}
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@ -63,6 +63,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
        ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
                               &c->yuv2nv12cX, &c->yuv2packed1,
                               &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+        ff_sws_init_input_lsx(c);
        if (c->srcBpc == 8) {
            if (c->dstBpc <= 14) {
                c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@ -73,21 +74,13 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
            c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
                                                     : ff_hscale_16_to_15_lsx;
        }
-        switch (c->srcFormat) {
-        case AV_PIX_FMT_GBRAP:
-        case AV_PIX_FMT_GBRP:
-            {
-                c->readChrPlanar = planar_rgb_to_uv_lsx;
-                c->readLumPlanar = planar_rgb_to_y_lsx;
-            }
-            break;
-        }
    }
 #if HAVE_LASX
    if (have_lasx(cpu_flags)) {
        ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
                                &c->yuv2nv12cX, &c->yuv2packed1,
                                &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+        ff_sws_init_input_lasx(c);
        if (c->srcBpc == 8) {
            if (c->dstBpc <= 14) {
                c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@ -98,15 +91,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
            c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lasx
                                                     : ff_hscale_16_to_15_lasx;
        }
-        switch (c->srcFormat) {
-        case AV_PIX_FMT_GBRAP:
-        case AV_PIX_FMT_GBRP:
-            {
-                c->readChrPlanar = planar_rgb_to_uv_lasx;
-                c->readLumPlanar = planar_rgb_to_y_lasx;
-            }
-            break;
-        }
    }
 #endif // #if HAVE_LASX
    ff_sws_init_range_convert_loongarch(c);
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@ -68,6 +68,29 @@ void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
                      const uint8_t *dither, int offset);

+/* LSX packed/semi-planar chroma readers; bodies are in loongarch/input.S. */
+void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+/* LSX alpha readers for 4-byte packed RGB; bodies are in loongarch/input.S. */
+void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                 const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                 const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+/* Selects LSX input readers for c->srcFormat; defined in loongarch/input_lsx.c. */
+av_cold void ff_sws_init_input_lsx(SwsContext *c);
+
 av_cold void ff_sws_init_output_lsx(SwsContext *c,
                                    yuv2planar1_fn *yuv2plane1,
                                    yuv2planarX_fn *yuv2planeX,
@ -152,6 +175,29 @@ void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
 void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
                      const uint8_t *dither, int offset);

+/* LASX packed/semi-planar chroma readers; bodies are in loongarch/input.S. */
+void yuy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+/* LASX alpha readers for 4-byte packed RGB; bodies are in loongarch/input.S. */
+void abgrToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                  const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                  const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+/* Selects LASX input readers for c->srcFormat; defined in loongarch/input_lasx.c. */
+av_cold void ff_sws_init_input_lasx(SwsContext *c);
+
 av_cold void ff_sws_init_output_lasx(SwsContext *c,
                                     yuv2planar1_fn *yuv2plane1,
                                     yuv2planarX_fn *yuv2planeX,