avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions in new file hevc_lpf_sao_msa.c
Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h

In this patch, in comparision with previous patch, duplicated c functions are removed.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Shivraj Patil 2015-06-09 20:50:26 +05:30 committed by Michael Niedermayer
parent 05c57ba2f4
commit 271195f85b
5 changed files with 2256 additions and 2 deletions

View File

@ -26,7 +26,8 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
mips/hevc_mc_uniw_msa.o \
mips/hevc_mc_bi_msa.o \
mips/hevc_mc_biw_msa.o \
mips/hevc_idct_msa.o
mips/hevc_idct_msa.o \
mips/hevc_lpf_sao_msa.o
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o

File diff suppressed because it is too large Load Diff

View File

@ -403,6 +403,32 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
c->sao_band_filter[0] =
c->sao_band_filter[1] =
c->sao_band_filter[2] =
c->sao_band_filter[3] =
c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
c->sao_edge_filter[0] =
c->sao_edge_filter[1] =
c->sao_edge_filter[2] =
c->sao_edge_filter[3] =
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
c->hevc_h_loop_filter_chroma_c =
ff_hevc_loop_filter_chroma_h_8_msa;
c->hevc_v_loop_filter_chroma_c =
ff_hevc_loop_filter_chroma_v_8_msa;
c->idct[0] = ff_hevc_idct_4x4_msa;
c->idct[1] = ff_hevc_idct_8x8_msa;
c->idct[2] = ff_hevc_idct_16x16_msa;

View File

@ -431,6 +431,36 @@ BI_W_MC(epel, hv, 64);
#undef BI_W_MC
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
ptrdiff_t src_stride,
int32_t beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
ptrdiff_t src_stride,
int32_t beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
ptrdiff_t src_stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
ptrdiff_t src_stride,
int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
ptrdiff_t stride_dst,
int16_t *sao_offset_val,
int eo, int width, int height);
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);

View File

@ -761,6 +761,8 @@
out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
out0, out1, out2, out3, slide_val) \
@ -821,6 +823,23 @@
VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
/* Description : Shuffle halfword vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Selective halfword elements from in0 & in1 are copied to out0
as per control vector mask0
Selective halfword elements from in2 & in3 are copied to out1
as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \
out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
@ -1060,6 +1079,25 @@
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
Return Type - as per RTYPE
Details : Even byte elements of 'in0' and even byte
elements of 'in1' are interleaved and copied to 'out0'
Even byte elements of 'in2' and even byte
elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
@ -1107,6 +1145,8 @@
out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@ -1212,9 +1252,22 @@
ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3); \
ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
out4, out5, out6, out7); \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
@ -1870,6 +1923,25 @@
ADD2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Sign extend byte elements from input vector and return
halfword results in pair of vectors
Arguments : Inputs - in (1 input byte vector)
Outputs - out0, out1 (sign extended 2 halfword vectors)
Return Type - signed halfword
Details : Sign bit of byte elements from input vector 'in' is
extracted and interleaved right with same vector 'in0' to
generate 8 signed halfword elements in 'out0'
Then interleaved left with same vector 'in0' to
generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1) \
{ \
v16i8 tmp_m; \
\
tmp_m = __msa_clti_s_b((v16i8) in, 0); \
ILVRL_B2_SH(tmp_m, in, out0, out1); \
}
/* Description : Zero extend unsigned byte elements to halfword elements
Arguments : Inputs - in (1 input unsigned byte vector)
Outputs - out0, out1 (unsigned 2 halfword vectors)
@ -1903,6 +1975,18 @@
ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
/* Description : Swap two variables
Arguments : Inputs - in0, in1
Outputs - in0, in1 (in-place)
Details : Swapping of two input variables using xor
*/
#define SWAP(in0, in1) \
{ \
in0 = in0 ^ in1; \
in1 = in0 ^ in1; \
in0 = in0 ^ in1; \
}
/* Description : Butterfly of 4 input vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
@ -1959,9 +2043,34 @@
out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
/* Description : Transposes input 8x8 byte block
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
(input 8x8 byte block)
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
(output 8x8 byte block)
Return Type - unsigned byte
Details :
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
in8, in9, in10, in11, in12, in13, in14, in15