avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni mc epel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for the HEVC uni mc epel functions. It also adds the new generic macros needed by this patch to libavutil/mips/generic_macros_msa.h. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2024-12-27 09:52:17 +00:00 · 2015-06-02 14:08:12 +05:30 · 2015-06-02 14:08:12 +05:30 · aef34ab950
commit aef34ab950
parent c96c73b0b0
4 changed files with 2286 additions and 0 deletions
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -129,6 +129,36 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;

+        c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+        c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+        c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+        c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+        c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
+        c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
+        c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
+        c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
+        c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
+        c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
+        c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
+        c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
+        c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
+        c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
+        c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
+        c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
+        c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
+        c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
+        c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
+        c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
+        c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
+        c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
+        c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
+
        c->put_hevc_qpel_uni_w[1][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
        c->put_hevc_qpel_uni_w[3][0][0] =
--- a/libavcodec/mips/hevcdsp_mips.h
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -145,6 +145,36 @@ UNI_MC(qpel, hv, 32);
 UNI_MC(qpel, hv, 48);
 UNI_MC(qpel, hv, 64);

+UNI_MC(epel, h, 4);
+UNI_MC(epel, h, 6);
+UNI_MC(epel, h, 8);
+UNI_MC(epel, h, 12);
+UNI_MC(epel, h, 16);
+UNI_MC(epel, h, 24);
+UNI_MC(epel, h, 32);
+UNI_MC(epel, h, 48);
+UNI_MC(epel, h, 64);
+
+UNI_MC(epel, v, 4);
+UNI_MC(epel, v, 6);
+UNI_MC(epel, v, 8);
+UNI_MC(epel, v, 12);
+UNI_MC(epel, v, 16);
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+UNI_MC(epel, v, 48);
+UNI_MC(epel, v, 64);
+
+UNI_MC(epel, hv, 4);
+UNI_MC(epel, hv, 6);
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+UNI_MC(epel, hv, 48);
+UNI_MC(epel, hv, 64);
+
 #undef UNI_MC

 #define UNI_W_MC(PEL, DIR, WIDTH)                                         \
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -291,6 +291,7 @@
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 }
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
@@ -573,6 +574,18 @@
    SH(out7_m, (pblk_6x4_m + 4));              \
 }

+/* Description : Store as 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs  - in, pdst
+   Details     : Index 0 double word element from input vector 'in' is copied
+                 and stored to destination memory at (pdst)
+*/
+#define ST8x1_UB(in, pdst)                   \
+{                                            \
+    uint64_t out0_m;                         \
+    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
+    SD(out0_m, pdst);                        \
+}
+
 /* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs  - in, pdst, stride
   Details     : Index 0 double word element from input vector 'in' is copied
@@ -716,6 +729,23 @@
 }
 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)

+/* Description : Immediate number of columns to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 the number of elements specified by 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
+{                                                                          \
+    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
+    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
+}
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+
 /* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
@@ -1090,6 +1120,16 @@
 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
+
+#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
+{                                                                       \
+    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
+    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
+}
+#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
+#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
+#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
@@ -1306,6 +1346,7 @@
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
 }
+#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
@@ -1427,7 +1468,9 @@
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
 }
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

 #define XORI_B3_128(RTYPE, in0, in1, in2)          \
 {                                                  \
@@ -1628,6 +1671,14 @@
 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
+{                                                     \
+    SRARI_H2(RTYPE, in0, in1, shift);                 \
+    SRARI_H2(RTYPE, in2, in3, shift);                 \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
 /* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)