diff --git a/libswscale/swscale.c b/libswscale/swscale.c index a94be88e45..502cb438da 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -2563,6 +2563,7 @@ static int swScale(SwsContext *c, const uint8_t* src[], dst[2] + dstStride[2] * chrDstY, (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL, }; + int use_mmx_vfilter= c->use_mmx_vfilter; const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)]; @@ -2653,6 +2654,7 @@ static int swScale(SwsContext *c, const uint8_t* src[], // hmm looks like we can't use MMX here without overwriting this array's tail find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX, &yuv2packed1, &yuv2packed2, &yuv2packedX); + use_mmx_vfilter= 0; } { @@ -2669,6 +2671,19 @@ static int swScale(SwsContext *c, const uint8_t* src[], vLumFilter += dstY * vLumFilterSize; vChrFilter += chrDstY * vChrFilterSize; + av_assert0(use_mmx_vfilter != ( + yuv2planeX == yuv2planeX_10BE_c + || yuv2planeX == yuv2planeX_10LE_c + || yuv2planeX == yuv2planeX_9BE_c + || yuv2planeX == yuv2planeX_9LE_c + || yuv2planeX == yuv2planeX_16BE_c + || yuv2planeX == yuv2planeX_16LE_c + || yuv2planeX == yuv2planeX_8_c)); + if(use_mmx_vfilter){ + vLumFilter= c->lumMmxFilter; + vChrFilter= c->chrMmxFilter; + } + if (vLumFilterSize == 1) { yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0); } else { @@ -2686,11 +2701,14 @@ static int swScale(SwsContext *c, const uint8_t* src[], yuv2planeX(vChrFilter, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0); yuv2planeX(vChrFilter, vChrFilterSize, - chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3); + chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? 
(c->uv_offx2 >> 1) : 3); } } if (CONFIG_SWSCALE_ALPHA && alpPixBuf){ + if(use_mmx_vfilter){ + vLumFilter= c->alpMmxFilter; + } if (vLumFilterSize == 1) { yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0); } else { diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 2e7f0cbc4a..fb63ccdb29 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -412,6 +412,7 @@ typedef struct SwsContext { #if HAVE_VIS DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10]; #endif + int use_mmx_vfilter; /* function pointers for swScale() */ yuv2planar1_fn yuv2plane1; diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index ab09eaa3fe..7ce9deddb2 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -66,38 +66,47 @@ dither_8to16(const uint8_t *srcDither, int rot) } #endif -static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, - const int16_t *chrUSrc, const int16_t *chrVSrc, - const int16_t *alpSrc, - uint8_t *dst[4], int dstW, int chrDstW) +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) { - int p= 4; - const int16_t *src[4]= { - lumSrc + dstW, chrUSrc + chrDstW, - chrVSrc + chrDstW, alpSrc + dstW - }; - x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; - - while (p--) { - if (dst[p]) { - __asm__ volatile( - "mov %2, %%"REG_a" \n\t" - ".p2align 4 \n\t" /* FIXME Unroll? 
*/ - "1: \n\t" - "movq (%0, %%"REG_a", 2), %%mm0 \n\t" - "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "psraw $7, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - MOVNTQ(%%mm0, (%1, %%REGa)) - "add $8, %%"REG_a" \n\t" - "jnc 1b \n\t" - :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) - : "%"REG_a - ); - } - } + dither_8to16(dither, offset); + __asm__ volatile(\ + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movslq %3, %%"REG_c"\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); } static void RENAME(yuv2yuv1_ar)(const int16_t *src, uint8_t *dst, int dstW, const uint8_t *dither, int offset) @@ -1869,7 +1878,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) { enum PixelFormat srcFormat = c->srcFormat, dstFormat = c->dstFormat; - + c->use_mmx_vfilter= 0; if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { c->yuv2plane1 = RENAME(yuv2yuv1_ar ); @@ -1886,7 +1895,10 @@ static 
av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } else { - //c->yuv2yuv1 = RENAME(yuv2yuv1 ); + int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat); + //c->yuv2plane1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 ); + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;