diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index e73750f396..3bf9458296 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -405,7 +405,7 @@ void ff_sws_init_swScale_altivec(SwsContext *c) if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 && !c->alpPixBuf) { - c->yuv2yuvX = yuv2yuvX_altivec_real; +// c->yuv2yuvX = yuv2yuvX_altivec_real; } /* The following list of supported dstFormat values should diff --git a/libswscale/swscale.c b/libswscale/swscale.c index bac2932f65..1d9b965ac9 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -196,141 +196,67 @@ DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = { 64, 64, 64, 64, 64, 64, 64, 64 }; static av_always_inline void -yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, - int lumFilterSize, const int16_t *chrFilter, - const int32_t **chrUSrc, const int32_t **chrVSrc, - int chrFilterSize, const int32_t **alpSrc, - uint16_t *dest[4], int dstW, int chrDstW, +yuv2yuvX16_c_template(const int16_t *filter, int filterSize, + const int32_t **src, uint16_t *dest, int dstW, int big_endian, int output_bits) { - //FIXME Optimize (just quickly written not optimized..) - int i; - uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; - int shift = 15 + 16 - output_bits - 1; - #define output_pixel(pos, val) \ if (big_endian) { \ AV_WB16(pos, av_clip_uint16(val >> shift)); \ } else { \ AV_WL16(pos, av_clip_uint16(val >> shift)); \ } + + int i; + int shift = 15 + 16 - output_bits - 1; + for (i = 0; i < dstW; i++) { int val = 1 << (30-output_bits - 1); int j; - for (j = 0; j < lumFilterSize; j++) - val += (lumSrc[j][i] * lumFilter[j]) >> 1; + for (j = 0; j < filterSize; j++) + val += (src[j][i] * filter[j]) >> 1; - output_pixel(&yDest[i], val); - } - - if (uDest) { - for (i = 0; i < chrDstW; i++) { - int u = 1 << (30-output_bits - 1); - int v = 1 << (30-output_bits - 1); - int j; - - for (j = 0; j < chrFilterSize; j++) { - u += (chrUSrc[j][i] * chrFilter[j]) >> 1; - v += (chrVSrc[j][i] * chrFilter[j]) >> 1; - } - - output_pixel(&uDest[i], u); - output_pixel(&vDest[i], v); - } - } - - if (CONFIG_SWSCALE_ALPHA && aDest) { - for (i = 0; i < dstW; i++) { - int val = 1 << (30-output_bits - 1); - int j; - - for (j = 0; j < lumFilterSize; j++) - val += (alpSrc[j][i] * lumFilter[j]) >> 1; - - output_pixel(&aDest[i], val); - } + output_pixel(&dest[i], val); } #undef output_pixel } -static av_always_inline void -yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc, - int lumFilterSize, const int16_t *chrFilter, - const int16_t **chrUSrc, const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint16_t *dest[4], int dstW, int chrDstW, - int big_endian, int output_bits) -{ - //FIXME Optimize (just quickly written not optimized..) - int i; - uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; - int shift = 11 + 16 - output_bits; - #define output_pixel(pos, val) \ if (big_endian) { \ AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ } else { \ AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ } + +static av_always_inline void +yuv2yuvX10_c_template(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ + int i; + int shift = 11 + 16 - output_bits; + for (i = 0; i < dstW; i++) { int val = 1 << (26-output_bits); int j; - for (j = 0; j < lumFilterSize; j++) - val += lumSrc[j][i] * lumFilter[j]; + for (j = 0; j < filterSize; j++) + val += src[j][i] * filter[j]; - output_pixel(&yDest[i], val); + output_pixel(&dest[i], val); } - - if (uDest) { - for (i = 0; i < chrDstW; i++) { - int u = 1 << (26-output_bits); - int v = 1 << (26-output_bits); - int j; - - for (j = 0; j < chrFilterSize; j++) { - u += chrUSrc[j][i] * chrFilter[j]; - v += chrVSrc[j][i] * chrFilter[j]; - } - - output_pixel(&uDest[i], u); - output_pixel(&vDest[i], v); - } - } - - if (CONFIG_SWSCALE_ALPHA && aDest) { - for (i = 0; i < dstW; i++) { - int val = 1 << (26-output_bits); - int j; - - for (j = 0; j < lumFilterSize; j++) - val += alpSrc[j][i] * lumFilter[j]; - - output_pixel(&aDest[i], val); - } - } -#undef output_pixel } +#undef output_pixel + #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \ -static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \ - const int16_t **_lumSrc, int lumFilterSize, \ - const int16_t *chrFilter, const int16_t **_chrUSrc, \ - const int16_t **_chrVSrc, \ - int chrFilterSize, const int16_t **_alpSrc, \ - uint8_t *_dest[4], int dstW, int chrDstW) \ +static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \ + const int16_t **src, uint16_t *dest, int dstW, \ + const uint8_t *dither, int offset)\ { \ - const typeX_t **lumSrc = (const typeX_t **) _lumSrc, \ - **chrUSrc = (const typeX_t **) _chrUSrc, \ - **chrVSrc = (const typeX_t **) _chrVSrc, \ - **alpSrc = (const typeX_t **) _alpSrc; \ - yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \ - chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ - alpSrc, (uint16_t **) _dest, \ - dstW, chrDstW, is_be, bits); \ + yuv2yuvX_template_fn(filter, filterSize, (const typeX_t **) src, \ + dest, dstW, is_be, bits); \ } yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t); yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t); @@ -339,51 +265,19 @@ yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t); yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t); yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t); -static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest[4], int dstW, int chrDstW) +static void yuv2yuvX_c(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) { - uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; int i; - const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; - - //FIXME Optimize (just quickly written not optimized..) for (i=0; i>19); + dest[i]= av_clip_uint8(val>>19); } - - if (uDest) - for (i=0; i>19); - vDest[i]= av_clip_uint8(v>>19); - } - - if (CONFIG_SWSCALE_ALPHA && aDest) - for (i=0; i>19); - } } static void yuv2yuv1_c(const int16_t *src, uint8_t *dest, int dstW, @@ -396,30 +290,13 @@ static void yuv2yuv1_c(const int16_t *src, uint8_t *dest, int dstW, } } -static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, int chrFilterSize, - const int16_t **alpSrc, uint8_t *dest[4], - int dstW, int chrDstW) +static void yuv2nv12X_chroma_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize, + const int16_t **chrUSrc, const int16_t **chrVSrc, + uint8_t *dest, int chrDstW) { - uint8_t *yDest = dest[0], *uDest = dest[1]; enum PixelFormat dstFormat = c->dstFormat; - const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; - - //FIXME Optimize (just quickly written not optimized..) + const uint8_t *chrDither = c->chrDither8; int i; - for (i=0; i>19); - } - - if (!uDest) - return; if (dstFormat == PIX_FMT_NV12) for (i=0; i>19); - uDest[2*i+1]= av_clip_uint8(v>>19); + dest[2*i]= av_clip_uint8(u>>19); + dest[2*i+1]= av_clip_uint8(v>>19); } else for (i=0; i>19); - uDest[2*i+1]= av_clip_uint8(u>>19); + dest[2*i]= av_clip_uint8(v>>19); + dest[2*i+1]= av_clip_uint8(u>>19); } } @@ -2117,26 +1994,29 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2 static av_always_inline void find_c_packed_planar_out_funcs(SwsContext *c, - yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn *yuv2yuvX, + yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn *yuv2planeX_luma, + yuv2planarX_fn *yuv2planeX_chroma, yuv2interleavedX_fn *yuv2nv12X_chroma, yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2, yuv2packedX_fn *yuv2packedX) { enum PixelFormat dstFormat = c->dstFormat; if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { - *yuv2yuvX = yuv2nv12X_c; + *yuv2planeX_luma = yuv2yuvX_c; + *yuv2nv12X_chroma = yuv2nv12X_chroma_c; } else if (is16BPS(dstFormat)) { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX16BE_c : yuv2yuvX16LE_c; + *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX16BE_c : yuv2yuvX16LE_c; } else if (is9_OR_10BPS(dstFormat)) { if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c : yuv2yuvX9LE_c; + *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX9BE_c : yuv2yuvX9LE_c; } else { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c; + *yuv2planeX_luma = *yuv2planeX_chroma = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c; } } else { *yuv2yuv1 = yuv2yuv1_c; - *yuv2yuvX = yuv2yuvX_c; + *yuv2planeX_luma = *yuv2planeX_chroma = yuv2yuvX_c; } + if(c->flags & SWS_FULL_CHR_H_INT) { switch (dstFormat) { case PIX_FMT_RGBA: @@ -2396,7 +2276,9 @@ static int swScale(SwsContext *c, const uint8_t* src[], int lastDstY; uint32_t *pal=c->pal_yuv; yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1; - yuv2planarX_fn yuv2yuvX = c->yuv2yuvX; + yuv2planarX_fn yuv2planeX_luma = c->yuv2planeX_luma; + yuv2planarX_fn yuv2planeX_chroma = c->yuv2planeX_chroma; + yuv2interleavedX_fn yuv2nv12X_chroma = c->yuv2nv12X_chroma; yuv2packed1_fn yuv2packed1 = c->yuv2packed1; yuv2packed2_fn yuv2packed2 = c->yuv2packed2; yuv2packedX_fn yuv2packedX = c->yuv2packedX; @@ -2548,8 +2430,8 @@ static int swScale(SwsContext *c, const uint8_t* src[], } if (dstY >= dstH-2) { // hmm looks like we can't use MMX here without overwriting this array's tail - find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX, - &yuv2packed1, &yuv2packed2, + find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2planeX_luma, &yuv2planeX_chroma, + &yuv2nv12X_chroma, &yuv2packed1, &yuv2packed2, &yuv2packedX); } @@ -2564,7 +2446,13 @@ static int swScale(SwsContext *c, const uint8_t* src[], dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; - if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 + if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { + yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0); + + if (dest[1]){ + yuv2nv12X_chroma(c, vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW); + } + } else if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 yuv2yuv1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0); if (dest[1]){ @@ -2575,11 +2463,15 @@ static int swScale(SwsContext *c, const uint8_t* src[], if (alpBuf && dest[3]) yuv2yuv1(alpBuf, dest[3], dstW, c->lumDither8, 0); } else { //General YV12 - yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize, - lumSrcPtr, vLumFilterSize, - vChrFilter + chrDstY * vChrFilterSize, - chrUSrcPtr, chrVSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, chrDstW); + yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0); + + if (dest[1]){ + yuv2planeX_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0); + yuv2planeX_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3); + } + + if (alpBuf && dest[3]) + yuv2planeX_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, alpSrcPtr, dest[3], dstW, c->lumDither8, 0); } } else { assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); @@ -2633,8 +2525,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c) { enum PixelFormat srcFormat = c->srcFormat; - find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX, - &c->yuv2packed1, &c->yuv2packed2, + find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2planeX_luma, &c->yuv2planeX_chroma, + &c->yuv2nv12X_chroma, &c->yuv2packed1, &c->yuv2packed2, &c->yuv2packedX); c->chrToYV12 = NULL; diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 7253c833fa..713045556e 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -76,33 +76,41 @@ typedef void (*yuv2planar1_fn) (const int16_t *src, uint8_t *dest, int dstW, const uint8_t *dither, int offset); /** - * Write one line of horizontally scaled Y/U/V/A to planar output + * Write one line of horizontally scaled data to planar output + * with multi-point vertical scaling between input pixels. + * + * @param filter vertical luma/alpha scaling coefficients, 12bit [0,4096] + * @param src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param filterSize number of vertical input lines to scale + * @param dest pointer to output plane. For >8bit + * output, this is in uint16_t + * @param dstW width of destination pixels + * @param offset Dither offset + */ +typedef void (*yuv2planarX_fn) (const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + +/** + * Write one line of horizontally scaled chroma to interleaved output * with multi-point vertical scaling between input pixels. * * @param c SWS scaling context - * @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096] - * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output, - * 19-bit for 16bit output (in int32_t) - * @param lumFilterSize number of vertical luma/alpha input lines to scale * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096] * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output, * 19-bit for 16bit output (in int32_t) * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output, * 19-bit for 16bit output (in int32_t) * @param chrFilterSize number of vertical chroma input lines to scale - * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output, - * 19-bit for 16bit output (in int32_t) - * @param dest pointer to the 4 output planes (Y/U/V/A). For >8bit + * @param dest pointer to the output plane. For >8bit * output, this is in uint16_t - * @param dstW width of dest[0], dest[3], lumSrc and alpSrc in pixels - * @param chrDstW width of dest[1], dest[2], chrUSrc and chrVSrc + * @param dstW width of chroma planes */ -typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, int chrFilterSize, - const int16_t **alpSrc, uint8_t *dest[4], - int dstW, int chrDstW); +typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t *chrFilter, int chrFilterSize, + const int16_t **chrUSrc, const int16_t **chrVSrc, + uint8_t *dest, int dstW); + /** * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB * output without any additional vertical scaling (or point-scaling). Note @@ -405,7 +413,9 @@ typedef struct SwsContext { /* function pointers for swScale() */ yuv2planar1_fn yuv2yuv1; - yuv2planarX_fn yuv2yuvX; + yuv2planarX_fn yuv2planeX_luma; + yuv2planarX_fn yuv2planeX_chroma; + yuv2interleavedX_fn yuv2nv12X_chroma; yuv2packed1_fn yuv2packed1; yuv2packed2_fn yuv2packed2; yuv2packedX_fn yuv2packedX;