mirror of https://git.ffmpeg.org/ffmpeg.git
avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for AVC chroma mc functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for AVC chroma mc functions in new file h264chroma_msa.c. Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
fd004e10d3
commit
b87dc70c65
|
@ -31,5 +31,6 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
|
||||||
mips/hevc_lpf_sao_msa.o \
|
mips/hevc_lpf_sao_msa.o \
|
||||||
mips/hevcpred_msa.o
|
mips/hevcpred_msa.o
|
||||||
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
|
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
|
||||||
|
MSA-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_msa.o
|
||||||
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
|
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
|
||||||
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
|
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
|
* Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
|
||||||
|
* Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
|
||||||
*
|
*
|
||||||
* This file is part of FFmpeg.
|
* This file is part of FFmpeg.
|
||||||
*
|
*
|
||||||
|
@ -20,6 +21,23 @@
|
||||||
|
|
||||||
#include "h264chroma_mips.h"
|
#include "h264chroma_mips.h"
|
||||||
|
|
||||||
|
#if HAVE_MSA
|
||||||
|
static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth)
|
||||||
|
{
|
||||||
|
const int high_bit_depth = bit_depth > 8;
|
||||||
|
|
||||||
|
if (!high_bit_depth) {
|
||||||
|
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa;
|
||||||
|
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa;
|
||||||
|
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa;
|
||||||
|
|
||||||
|
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa;
|
||||||
|
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa;
|
||||||
|
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // #if HAVE_MSA
|
||||||
|
|
||||||
#if HAVE_LOONGSON3
|
#if HAVE_LOONGSON3
|
||||||
static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
|
static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
|
||||||
{
|
{
|
||||||
|
@ -36,6 +54,9 @@ static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
|
||||||
|
|
||||||
av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
|
av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
|
||||||
{
|
{
|
||||||
|
#if HAVE_MSA
|
||||||
|
h264chroma_init_msa(c, bit_depth);
|
||||||
|
#endif // #if HAVE_MSA
|
||||||
#if HAVE_LOONGSON3
|
#if HAVE_LOONGSON3
|
||||||
h264chroma_init_mmi(c, bit_depth);
|
h264chroma_init_mmi(c, bit_depth);
|
||||||
#endif /* HAVE_LOONGSON3 */
|
#endif /* HAVE_LOONGSON3 */
|
||||||
|
|
|
@ -22,6 +22,18 @@
|
||||||
#define H264_CHROMA_MIPS_H
|
#define H264_CHROMA_MIPS_H
|
||||||
|
|
||||||
#include "libavcodec/h264.h"
|
#include "libavcodec/h264.h"
|
||||||
|
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int height, int x, int y);
|
||||||
|
|
||||||
void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
|
void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
|
||||||
int h, int x, int y);
|
int h, int x, int y);
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -747,6 +747,33 @@
|
||||||
SW(out15_m, pblk_12x8_m + 8); \
|
SW(out15_m, pblk_12x8_m + 8); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : average with rounding (in0 + in1 + 1) / 2.
|
||||||
|
Arguments : Inputs - in0, in1, in2, in3,
|
||||||
|
Outputs - out0, out1
|
||||||
|
Return Type - as per RTYPE
|
||||||
|
Details : Each byte element from 'in0' vector is added with each byte
|
||||||
|
element from 'in1' vector. The addition of the elements plus 1
|
||||||
|
(for rounding) is done unsigned with full precision,
|
||||||
|
i.e. the result has one extra bit. Unsigned division by 2
|
||||||
|
(or logical shift right by one bit) is performed before writing
|
||||||
|
the result to vector 'out0'
|
||||||
|
Similar for the pair of 'in2' and 'in3'
|
||||||
|
*/
|
||||||
|
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
|
||||||
|
{ \
|
||||||
|
out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
|
||||||
|
out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
|
||||||
|
}
|
||||||
|
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||||
|
out0, out1, out2, out3) \
|
||||||
|
{ \
|
||||||
|
AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
|
||||||
|
AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
|
||||||
|
}
|
||||||
|
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Immediate number of columns to slide with zero
|
/* Description : Immediate number of columns to slide with zero
|
||||||
Arguments : Inputs - in0, in1, slide_val
|
Arguments : Inputs - in0, in1, slide_val
|
||||||
Outputs - out0, out1
|
Outputs - out0, out1
|
||||||
|
@ -859,6 +886,34 @@
|
||||||
}
|
}
|
||||||
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
|
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Dot product of byte vector elements
|
||||||
|
Arguments : Inputs - mult0, mult1
|
||||||
|
cnst0, cnst1
|
||||||
|
Outputs - out0, out1
|
||||||
|
Return Type - unsigned halfword
|
||||||
|
Details : Unsigned byte elements from mult0 are multiplied with
|
||||||
|
unsigned byte elements from cnst0 producing a result
|
||||||
|
twice the size of input i.e. unsigned halfword.
|
||||||
|
Then these multiplication results of adjacent odd-even elements
|
||||||
|
are added together and stored to the out vector
|
||||||
|
(2 unsigned halfword results)
|
||||||
|
*/
|
||||||
|
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
|
||||||
|
{ \
|
||||||
|
out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
|
||||||
|
out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
|
||||||
|
}
|
||||||
|
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
|
||||||
|
cnst0, cnst1, cnst2, cnst3, \
|
||||||
|
out0, out1, out2, out3) \
|
||||||
|
{ \
|
||||||
|
DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
|
||||||
|
DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
|
||||||
|
}
|
||||||
|
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Dot product of byte vector elements
|
/* Description : Dot product of byte vector elements
|
||||||
Arguments : Inputs - mult0, mult1
|
Arguments : Inputs - mult0, mult1
|
||||||
cnst0, cnst1
|
cnst0, cnst1
|
||||||
|
@ -1363,6 +1418,7 @@
|
||||||
out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
|
out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
|
||||||
out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
|
out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
|
||||||
}
|
}
|
||||||
|
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
|
||||||
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
|
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
|
||||||
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
|
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue