ac3enc: add int32_t array clipping function to DSPUtil, including x86 versions.

This commit is contained in:
Justin Ruggles 2011-06-02 14:00:50 -04:00
parent 8a8d0ce208
commit 6054cd25b4
4 changed files with 171 additions and 0 deletions

View File

@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
}
}
/**
 * Portable C version of DSPContext.vector_clip_int32.
 *
 * Copies src to dst while clamping every element to [min, max] with
 * av_clip(). Elements are handled in groups of eight and the group is
 * always executed at least once, so len has to be a non-zero multiple
 * of 8; any other value makes the unsigned counter wrap around.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        /* one group of eight elements, read and written in ascending order */
        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->apply_window_int16 = apply_window_int16_c;
c->vector_clip_int32 = vector_clip_int32_c;
c->scalarproduct_float = scalarproduct_float_c;
c->butterflies_float = butterflies_float_c;
c->vector_fmul_scalar = vector_fmul_scalar_c;

View File

@ -555,6 +555,22 @@ typedef struct DSPContext {
void (*apply_window_int16)(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
/**
* Clip each element in an array of int32_t to a given minimum and maximum value.
* @param dst destination array
* constraints: 16-byte aligned
* @param src source array
* constraints: 16-byte aligned
* @param min minimum value
* constraints: must be in the range [-(1<<24), 1<<24]
* @param max maximum value
* constraints: must be in the range [-(1<<24), 1<<24]
* @param len number of elements in the array
* constraints: multiple of 32 greater than zero
*/
void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
/* rv30 functions */
qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];

View File

@ -2429,6 +2429,15 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
/*
 * x86 implementations of DSPContext.vector_clip_int32, all sharing the
 * signature of the function pointer. dsputil_init_mmx() picks one of them
 * according to the detected CPU flags: the MMX version first, then either
 * the _sse2 or the _sse2_int version (the latter when AV_CPU_FLAG_ATOM is
 * set), and finally the SSE4 version.
 * NOTE(review): presumably these are the functions generated by the
 * VECTOR_CLIP_INT32 macro in the yasm source -- confirm the ff_ prefix is
 * added by cglobal.
 */
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
@ -2570,6 +2579,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
if (mm_flags & AV_CPU_FLAG_MMX2) {
@ -2855,6 +2866,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (mm_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else {
@ -2880,6 +2896,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
#endif
}
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
#if HAVE_YASM
c->vector_clip_int32 = ff_vector_clip_int32_sse41;
#endif
}
#if HAVE_AVX && HAVE_YASM
if (mm_flags & AV_CPU_FLAG_AVX) {
if (bit_depth == 10) {

View File

@ -1048,3 +1048,118 @@ emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; PMINSD_MMX dst, src, tmp
; Per-dword signed minimum built from compare/xor/and: dst = min(dst, src).
; src is only read; tmp is overwritten with the comparison mask.
%macro PMINSD_MMX 3 ; dst, src, tmp
mova %3, %2 ; tmp = src
pcmpgtd %3, %1 ; tmp = all-ones in every lane where src > dst (signed)
pxor %1, %2 ; dst = dst ^ src
pand %1, %3 ; keep the xor only in lanes where dst was the smaller value
pxor %1, %2 ; masked lanes give back the old dst, the others become src
%endmacro
; PMAXSD_MMX dst, src, tmp
; Per-dword signed maximum built from compare/and/andn/or: dst = max(dst, src).
; src is only read; tmp is overwritten.
%macro PMAXSD_MMX 3 ; dst, src, tmp
mova %3, %1 ; tmp = dst
pcmpgtd %3, %2 ; tmp = all-ones in every lane where dst > src (signed)
pand %1, %3 ; keep dst in the lanes where it is the larger value
pandn %3, %2 ; tmp = src in the remaining lanes
por %1, %3 ; merge both selections
%endmacro
; CLIPD_MMX src/dst, min, max, tmp
; Clamp each signed dword of %1 to [min, max] using the emulated min/max
; macros. The macro accepts 3-4 arguments, but the fourth (tmp) is handed to
; both helpers as their scratch register, so it has to be supplied.
%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
PMINSD_MMX %1, %3, %4 ; upper bound: %1 = min(%1, max)
PMAXSD_MMX %1, %2, %4 ; lower bound: %1 = max(%1, min)
%endmacro
; CLIPD_SSE2 src/dst, min, max [, unused]
; Clamp packed signed dwords by going through single-precision floats:
; convert to float, apply minps/maxps, convert back to integers.
; min and max must already be packed floats. The optional fourth argument
; only keeps the call signature compatible with CLIPD_MMX.
; NOTE(review): this round trip is presumably the reason the DSPContext
; documentation restricts min/max to [-(1<<24), 1<<24] -- confirm.
%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
cvtdq2ps %1, %1 ; int32 -> float
minps %1, %3 ; upper bound
maxps %1, %2 ; lower bound
cvtps2dq %1, %1 ; float -> int32
%endmacro
; CLIPD_SSE41 src/dst, min, max [, unused]
; Clamp packed signed dwords with the native pminsd/pmaxsd instructions.
; The optional fourth argument is ignored; it only keeps the call signature
; compatible with CLIPD_MMX.
%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
pminsd %1, %3 ; upper bound: %1 = min(%1, max)
pmaxsd %1, %2 ; lower bound: %1 = max(%1, min)
%endmacro
; SPLATD_MMX reg
; Replicate the low dword of a 64-bit MMX register into its high dword.
%macro SPLATD_MMX 1
punpckldq %1, %1 ; interleave the low dword with itself
%endmacro
; SPLATD_SSE2 reg
; Broadcast the low dword of an XMM register to all four dword lanes.
%macro SPLATD_SSE2 1
pshufd %1, %1, 0 ; shuffle selector 0 picks dword 0 for every lane
%endmacro
; VECTOR_CLIP_INT32 suffix, xmm_regs, unroll, wide
;   %1 = function name suffix; the suffix "sse2" additionally selects the
;        float path, i.e. min/max are converted with cvtsi2ss for CLIPD_SSE2
;   %2 = number of xmm registers declared to cglobal
;   %3 = number of load/clip/store groups unrolled inside one loop iteration
;   %4 = 0: a group covers 4 registers (m0-m3)
;        1: a group covers 8 registers (m0-m3 and m7-m10)
;
; SPLATD and CLIPD must be %define'd to the wanted variants before the macro
; is instantiated. m4/m5 hold the broadcast min/max and m6 is passed to CLIPD
; as its scratch register.
;
; One loop iteration consumes mmsize*4*%3*(1+%4) bytes. The loop is left once
; the remaining element count is <= 0 (signed compare), so len has to be a
; positive multiple of the per-iteration element count.
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
; %%i is the index (in units of mmsize) of the first register-sized chunk of
; the current group. It has to be ADDED to the per-register index: scaling
; the index by the group number makes the groups overlap and skips chunks as
; soon as more than one group is unrolled.
%assign %%i 0
%rep %3
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %4
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD     m0,  m4, m5, m6
    CLIPD     m1,  m4, m5, m6
    CLIPD     m2,  m4, m5, m6
    CLIPD     m3,  m4, m5, m6
%if %4
    CLIPD     m7,  m4, m5, m6
    CLIPD     m8,  m4, m5, m6
    CLIPD     m9,  m4, m5, m6
    CLIPD     m10, m4, m5, m6
%endif
    mova      [dstq+mmsize*(0+%%i)], m0
    mova      [dstq+mmsize*(1+%%i)], m1
    mova      [dstq+mmsize*(2+%%i)], m2
    mova      [dstq+mmsize*(3+%%i)], m3
%if %4
    mova      [dstq+mmsize*(4+%%i)], m7
    mova      [dstq+mmsize*(5+%%i)], m8
    mova      [dstq+mmsize*(6+%%i)], m9
    mova      [dstq+mmsize*(7+%%i)], m10
%endif
; advance by the number of registers one group handles (4 or 8)
%assign %%i %%i+4*(1+%4)
%endrep
    ; bytes per iteration = mmsize * registers per group * groups
    add       srcq, mmsize*4*%3*(1+%4)
    add       dstq, mmsize*4*%3*(1+%4)
    ; len counts int32 elements: bytes per iteration / 4
    sub       lend, mmsize*%3*(1+%4)
    jg .loop
    REP_RET
%endmacro
; Instantiate vector_clip_int32 for each instruction set.
; Arguments: suffix, xmm registers declared to cglobal, unroll count,
; wide flag (1 = eight registers per group, needs m8 and above).
INIT_MMX
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 mmx, 0, 1, 0

INIT_XMM
%define SPLATD SPLATD_SSE2
; The integer SSE2 variant still clips with CLIPD_MMX, which uses the m6
; scratch register handed to CLIPD. m0-m6 are therefore written, so 7 xmm
; registers must be declared; otherwise xmm6, which is callee-saved on
; Win64, would be clobbered without being saved.
VECTOR_CLIP_INT32 sse2_int, 7, 1, 0

; The float-based and SSE4.1 variants ignore the scratch argument.
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 sse2, 6, 2, 0

%define CLIPD CLIPD_SSE41
%ifdef m8
; m8 and above exist: use the wide variant (m0-m5 plus m7-m10)
VECTOR_CLIP_INT32 sse41, 11, 1, 1
%else
VECTOR_CLIP_INT32 sse41, 6, 1, 0
%endif