mmx yuy2 output

Originally committed as revision 7724 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2002-10-13 17:23:02 +00:00
parent 4d2858deac
commit 25593e29f2
2 changed files with 163 additions and 25 deletions

View File

@ -421,7 +421,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
}
#define YSCALE_YUV_2_X_C(type) \
#define YSCALE_YUV_2_PACKEDX_C(type) \
for(i=0; i<(dstW>>1); i++){\
int j;\
int Y1=0;\
@ -458,12 +458,12 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
}
#define YSCALE_YUV_2_RGBX_C(type) \
YSCALE_YUV_2_X_C(type)\
YSCALE_YUV_2_PACKEDX_C(type)\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_2_C \
#define YSCALE_YUV_2_PACKED2_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19;\
@ -472,13 +472,13 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
int V= (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;\
#define YSCALE_YUV_2_RGB2_C(type) \
YSCALE_YUV_2_2_C\
YSCALE_YUV_2_PACKED2_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_1_C \
#define YSCALE_YUV_2_PACKED1_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
@ -487,13 +487,13 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
int V= (uvbuf1[i+2048])>>7;\
#define YSCALE_YUV_2_RGB1_C(type) \
YSCALE_YUV_2_1_C\
YSCALE_YUV_2_PACKED1_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_1B_C \
#define YSCALE_YUV_2_PACKED1B_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
@ -502,7 +502,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
int V= (uvbuf0[i+2048] + uvbuf1[i+2048])>>8;\
#define YSCALE_YUV_2_RGB1B_C(type) \
YSCALE_YUV_2_1B_C\
YSCALE_YUV_2_PACKED1B_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
@ -668,7 +668,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
}\
static inline void yuv2rgbXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int y)
{
@ -791,7 +791,7 @@ static inline void yuv2rgbXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumS
}
break;
case IMGFMT_YUY2:
YSCALE_YUV_2_X_C(void)
YSCALE_YUV_2_PACKEDX_C(void)
((uint8_t*)dest)[2*i2+0]= Y1;
((uint8_t*)dest)[2*i2+1]= U;
((uint8_t*)dest)[2*i2+2]= Y2;

View File

@ -107,7 +107,7 @@
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
#define YSCALEYUV2PACKEDX \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
@ -144,7 +144,10 @@
"paddw %%mm5, %%mm7 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 2b \n\t"\
\
#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
@ -234,6 +237,46 @@
\
"packuswb %%mm1, %%mm1 \n\t"
#define YSCALEYUV2PACKED \
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"movq %%mm6, 3968(%2) \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"psraw $3, %%mm5 \n\t"\
"movq %%mm5, 3976(%2) \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq 3976(%2), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define YSCALEYUV2RGB \
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
@ -306,7 +349,20 @@
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1 \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2RGB1 \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
@ -355,6 +411,23 @@
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
"xorl %%eax, %%eax \n\t"\
@ -652,6 +725,23 @@
#define WRITEBGR24 WRITEBGR24MMX
#endif
#define WRITEYUY2 \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (%4, %%eax, 2))\
MOVNTQ(%%mm7, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
@ -752,7 +842,7 @@ static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
/**
* vertical scale YV12 to RGB
*/
static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
{
@ -831,9 +921,29 @@ static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t *
);
}
break;
case IMGFMT_YUY2:
{
asm volatile(
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
);
}
break;
#endif
default:
yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize,
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
break;
@ -843,7 +953,7 @@ static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t *
/**
* vertical bilinear scale YV12 to RGB
*/
static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
int yalpha1=yalpha^4095;
@ -1119,6 +1229,16 @@ FULL_YSCALEYUV2RGB
WRITEBGR16
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
YSCALEYUV2PACKED
WRITEYUY2
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
@ -1127,13 +1247,13 @@ FULL_YSCALEYUV2RGB
default: break;
}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_2_C)
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
/**
* YV12 to RGB without scaling or interpolating
*/
static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
int uvalpha1=uvalpha^4095;
@ -1145,7 +1265,7 @@ static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvb
if(flags&SWS_FULL_CHR_H_INT)
{
RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
return;
}
@ -1204,6 +1324,15 @@ static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvb
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
YSCALEYUV2PACKED1
WRITEYUY2
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
}
}
else
@ -1260,14 +1389,23 @@ static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvb
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
YSCALEYUV2PACKED1b
WRITEYUY2
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
}
}
#endif
if( uvalpha < 2048 )
{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_1_C)
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
}else{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_1B_C)
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
}
}
@ -2533,7 +2671,7 @@ i--;
{
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, chrAlpha, dstFormat, flags, dstY);
}
else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
@ -2541,12 +2679,12 @@ i--;
int lumAlpha= vLumFilter[2*dstY+1];
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, lumAlpha, chrAlpha, dstY);
}
else //General RGB
{
RENAME(yuv2rgbX)(c,
RENAME(yuv2packedX)(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW,
@ -2571,7 +2709,7 @@ i--;
{
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
yuv2rgbXinC(c,
yuv2packedXinC(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW, dstY);