diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 747c645666..8ebc9a06d2 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -625,3 +625,55 @@ INIT_MMX mmx
 PUT_SIGNED_PIXELS_CLAMPED 0
 INIT_XMM sse2
 PUT_SIGNED_PIXELS_CLAMPED 3
+
+;-----------------------------------------------------
+;void ff_vector_clipf(float *dst, const float *src,
+;                     float min, float max, int len)
+;
+; Clamps each float of src to [min, max] and stores it to dst.
+; Each iteration handles 4 vectors (mmsize*4 bytes) with aligned
+; loads/stores and the loop body always runs at least once, so len
+; must be a positive multiple of 16 and both buffers must be
+; 16-byte aligned.
+;-----------------------------------------------------
+INIT_XMM sse
+%if UNIX64
+; min/max arrive in xmm0/xmm1, so len is the third integer argument.
+cglobal vector_clipf, 3,3,6, dst, src, len
+%else
+; x86-32 and Win64 assign argument slots by position: len is argument 5.
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%endif
+%if WIN64
+    SWAP 0, 2                   ; min is passed in xmm2
+    SWAP 1, 3                   ; max is passed in xmm3
+%elif ARCH_X86_32
+    movss   m0, minm            ; read min/max from their argument memory slots
+    movss   m1, maxm
+%endif
+    SPLATD  m0                  ; broadcast min to all lanes
+    SPLATD  m1                  ; broadcast max to all lanes
+    shl     lend, 2             ; floats -> bytes; 32-bit op zero-extends into lenq
+    add     srcq, lenq          ; point both buffers at their end ...
+    add     dstq, lenq
+    neg     lenq                ; ... and count a negative offset up to zero
+.loop:
+    mova    m2, [srcq+lenq+mmsize*0]
+    mova    m3, [srcq+lenq+mmsize*1]
+    mova    m4, [srcq+lenq+mmsize*2]
+    mova    m5, [srcq+lenq+mmsize*3]
+    maxps   m2, m0              ; raise values below min
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1              ; lower values above max
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+mmsize*0], m2
+    mova    [dstq+lenq+mmsize*1], m3
+    mova    [dstq+lenq+mmsize*2], m4
+    mova    [dstq+lenq+mmsize*3], m5
+    add     lenq, mmsize*4
+    jl .loop
+    REP_RET
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e274e671d7..5dd6c20e4f 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -585,12 +585,10 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
 {
-#if HAVE_SSE_INLINE
-    c->vector_clipf = ff_vector_clipf_sse;
-#endif /* HAVE_SSE_INLINE */
-
 #if HAVE_YASM
 #if HAVE_SSE_EXTERNAL
+    c->vector_clipf = ff_vector_clipf_sse;
+
     /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
     if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
         return;
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index fa77a5c938..28066d8546 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -506,37 +506,4 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride,
 #endif
 #endif
 
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss %3, %%xmm4 \n\t"
-        "movss %4, %%xmm5 \n\t"
-        "shufps $0, %%xmm4, %%xmm4 \n\t"
-        "shufps $0, %%xmm5, %%xmm5 \n\t"
-        "1: \n\t"
-        "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1 \n\t"
-        "movaps 32(%2, %0), %%xmm2 \n\t"
-        "movaps 48(%2, %0), %%xmm3 \n\t"
-        "maxps %%xmm4, %%xmm0 \n\t"
-        "maxps %%xmm4, %%xmm1 \n\t"
-        "maxps %%xmm4, %%xmm2 \n\t"
-        "maxps %%xmm4, %%xmm3 \n\t"
-        "minps %%xmm5, %%xmm0 \n\t"
-        "minps %%xmm5, %%xmm1 \n\t"
-        "minps %%xmm5, %%xmm2 \n\t"
-        "minps %%xmm5, %%xmm3 \n\t"
-        "movaps %%xmm0, (%1, %0) \n\t"
-        "movaps %%xmm1, 16(%1, %0) \n\t"
-        "movaps %%xmm2, 32(%1, %0) \n\t"
-        "movaps %%xmm3, 48(%1, %0) \n\t"
-        "sub $64, %0 \n\t"
-        "jge 1b \n\t"
-        : "+&r" (i)
-        : "r" (dst), "r" (src), "m" (min), "m" (max)
-        : "memory");
-}
-
 #endif /* HAVE_INLINE_ASM */