mirror of
https://github.com/mpv-player/mpv
synced 2024-12-26 00:42:57 +00:00
Fixed an overread in the MMX2 horizontal scaler.
The horizontal MMX2 scaler is 2% faster.
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@5453 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
parent
57c7eef360
commit
91438d4597
@ -117,10 +117,6 @@ untested special converters
|
|||||||
extern int verbose; // defined in mplayer.c
|
extern int verbose; // defined in mplayer.c
|
||||||
/*
|
/*
|
||||||
NOTES
|
NOTES
|
||||||
|
|
||||||
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
|
|
||||||
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
|
|
||||||
|
|
||||||
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
|
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
|
||||||
|
|
||||||
TODO
|
TODO
|
||||||
@ -1020,12 +1016,17 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
#ifdef ARCH_X86
|
||||||
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
|
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
|
||||||
{
|
{
|
||||||
uint8_t *fragment;
|
uint8_t *fragmentA;
|
||||||
int imm8OfPShufW1;
|
int imm8OfPShufW1A;
|
||||||
int imm8OfPShufW2;
|
int imm8OfPShufW2A;
|
||||||
int fragmentLength;
|
int fragmentLengthA;
|
||||||
|
uint8_t *fragmentB;
|
||||||
|
int imm8OfPShufW1B;
|
||||||
|
int imm8OfPShufW2B;
|
||||||
|
int fragmentLengthB;
|
||||||
|
int fragmentPos;
|
||||||
|
|
||||||
int xpos, i;
|
int xpos, i;
|
||||||
|
|
||||||
@ -1037,22 +1038,18 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
|
|||||||
"jmp 9f \n\t"
|
"jmp 9f \n\t"
|
||||||
// Begin
|
// Begin
|
||||||
"0: \n\t"
|
"0: \n\t"
|
||||||
"movq (%%esi), %%mm0 \n\t" //FIXME Alignment
|
"movq (%%edx, %%eax), %%mm3 \n\t"
|
||||||
"movq %%mm0, %%mm1 \n\t"
|
"movd (%%ecx, %%esi), %%mm0 \n\t"
|
||||||
"psrlq $8, %%mm0 \n\t"
|
"movd 1(%%ecx, %%esi), %%mm1 \n\t"
|
||||||
"punpcklbw %%mm7, %%mm1 \n\t"
|
"punpcklbw %%mm7, %%mm1 \n\t"
|
||||||
"movq %%mm2, %%mm3 \n\t"
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
"punpcklbw %%mm7, %%mm0 \n\t"
|
|
||||||
"addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
|
|
||||||
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
|
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
|
||||||
"1: \n\t"
|
"1: \n\t"
|
||||||
"adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
|
|
||||||
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
|
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
|
||||||
"2: \n\t"
|
"2: \n\t"
|
||||||
"psrlw $9, %%mm3 \n\t"
|
|
||||||
"psubw %%mm1, %%mm0 \n\t"
|
"psubw %%mm1, %%mm0 \n\t"
|
||||||
|
"movl 8(%%ebx, %%eax), %%esi \n\t"
|
||||||
"pmullw %%mm3, %%mm0 \n\t"
|
"pmullw %%mm3, %%mm0 \n\t"
|
||||||
"paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
|
|
||||||
"psllw $7, %%mm1 \n\t"
|
"psllw $7, %%mm1 \n\t"
|
||||||
"paddw %%mm1, %%mm0 \n\t"
|
"paddw %%mm1, %%mm0 \n\t"
|
||||||
|
|
||||||
@ -1071,13 +1068,54 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
|
|||||||
"subl %0, %2 \n\t"
|
"subl %0, %2 \n\t"
|
||||||
"leal 9b, %3 \n\t"
|
"leal 9b, %3 \n\t"
|
||||||
"subl %0, %3 \n\t"
|
"subl %0, %3 \n\t"
|
||||||
:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
|
|
||||||
"=r" (fragmentLength)
|
|
||||||
|
:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
|
||||||
|
"=r" (fragmentLengthA)
|
||||||
|
);
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
"jmp 9f \n\t"
|
||||||
|
// Begin
|
||||||
|
"0: \n\t"
|
||||||
|
"movq (%%edx, %%eax), %%mm3 \n\t"
|
||||||
|
"movd (%%ecx, %%esi), %%mm0 \n\t"
|
||||||
|
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||||
|
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
|
||||||
|
"2: \n\t"
|
||||||
|
"psubw %%mm1, %%mm0 \n\t"
|
||||||
|
"movl 8(%%ebx, %%eax), %%esi \n\t"
|
||||||
|
"pmullw %%mm3, %%mm0 \n\t"
|
||||||
|
"psllw $7, %%mm1 \n\t"
|
||||||
|
"paddw %%mm1, %%mm0 \n\t"
|
||||||
|
|
||||||
|
"movq %%mm0, (%%edi, %%eax) \n\t"
|
||||||
|
|
||||||
|
"addl $8, %%eax \n\t"
|
||||||
|
// End
|
||||||
|
"9: \n\t"
|
||||||
|
// "int $3\n\t"
|
||||||
|
"leal 0b, %0 \n\t"
|
||||||
|
"leal 1b, %1 \n\t"
|
||||||
|
"leal 2b, %2 \n\t"
|
||||||
|
"decl %1 \n\t"
|
||||||
|
"decl %2 \n\t"
|
||||||
|
"subl %0, %1 \n\t"
|
||||||
|
"subl %0, %2 \n\t"
|
||||||
|
"leal 9b, %3 \n\t"
|
||||||
|
"subl %0, %3 \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
|
||||||
|
"=r" (fragmentLengthB)
|
||||||
);
|
);
|
||||||
|
|
||||||
xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
|
xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
|
||||||
|
fragmentPos=0;
|
||||||
for(i=0; i<dstW/8; i++)
|
|
||||||
|
for(i=0; i<dstW/numSplits; i++)
|
||||||
{
|
{
|
||||||
int xx=xpos>>16;
|
int xx=xpos>>16;
|
||||||
|
|
||||||
@ -1088,20 +1126,65 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
|
|||||||
int c=((xpos+xInc*2)>>16) - xx;
|
int c=((xpos+xInc*2)>>16) - xx;
|
||||||
int d=((xpos+xInc*3)>>16) - xx;
|
int d=((xpos+xInc*3)>>16) - xx;
|
||||||
|
|
||||||
memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
|
filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
|
||||||
|
filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
|
||||||
|
filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
|
||||||
|
filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
|
||||||
|
filterPos[i/2]= xx;
|
||||||
|
|
||||||
funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
|
if(d+1<4)
|
||||||
funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
|
{
|
||||||
a | (b<<2) | (c<<4) | (d<<6);
|
int maxShift= 3-(d+1);
|
||||||
|
int shift=0;
|
||||||
|
|
||||||
// if we dont need to read 8 bytes than dont :), reduces the chance of
|
memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
|
||||||
// crossing a cache line
|
|
||||||
if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
|
|
||||||
|
|
||||||
funnyCode[fragmentLength*(i+4)/4]= RET;
|
funnyCode[fragmentPos + imm8OfPShufW1B]=
|
||||||
|
(a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW2B]=
|
||||||
|
a | (b<<2) | (c<<4) | (d<<6);
|
||||||
|
|
||||||
|
if(i+3>=dstW) shift=maxShift; //avoid overread
|
||||||
|
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
|
||||||
|
|
||||||
|
if(shift && i>=shift)
|
||||||
|
{
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
|
||||||
|
filterPos[i/2]-=shift;
|
||||||
|
}
|
||||||
|
|
||||||
|
fragmentPos+= fragmentLengthB;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int maxShift= 3-d;
|
||||||
|
int shift=0;
|
||||||
|
|
||||||
|
memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
|
||||||
|
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW1A]=
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW2A]=
|
||||||
|
a | (b<<2) | (c<<4) | (d<<6);
|
||||||
|
|
||||||
|
if(i+4>=dstW) shift=maxShift; //avoid overread
|
||||||
|
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
|
||||||
|
|
||||||
|
if(shift && i>=shift)
|
||||||
|
{
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
|
||||||
|
funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
|
||||||
|
filterPos[i/2]-=shift;
|
||||||
|
}
|
||||||
|
|
||||||
|
fragmentPos+= fragmentLengthA;
|
||||||
|
}
|
||||||
|
|
||||||
|
funnyCode[fragmentPos]= RET;
|
||||||
}
|
}
|
||||||
xpos+=xInc;
|
xpos+=xInc;
|
||||||
}
|
}
|
||||||
|
filterPos[i/2]= xpos>>16; // needed to jump to the next part
|
||||||
}
|
}
|
||||||
#endif // ARCH_X86
|
#endif // ARCH_X86
|
||||||
|
|
||||||
@ -1565,8 +1648,13 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
|
|||||||
// cant downscale !!!
|
// cant downscale !!!
|
||||||
if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
|
if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
|
||||||
{
|
{
|
||||||
initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
|
c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
|
||||||
initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
|
c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
|
||||||
|
c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
|
||||||
|
c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
|
||||||
|
|
||||||
|
initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
|
||||||
|
initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // Init Horizontal stuff
|
} // Init Horizontal stuff
|
||||||
@ -2014,6 +2102,15 @@ void freeSwsContext(SwsContext *c){
|
|||||||
if(c->chrMmxFilter) free(c->chrMmxFilter);
|
if(c->chrMmxFilter) free(c->chrMmxFilter);
|
||||||
c->chrMmxFilter = NULL;
|
c->chrMmxFilter = NULL;
|
||||||
|
|
||||||
|
if(c->lumMmx2Filter) free(c->lumMmx2Filter);
|
||||||
|
c->lumMmx2Filter=NULL;
|
||||||
|
if(c->chrMmx2Filter) free(c->chrMmx2Filter);
|
||||||
|
c->chrMmx2Filter=NULL;
|
||||||
|
if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
|
||||||
|
c->lumMmx2FilterPos=NULL;
|
||||||
|
if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
|
||||||
|
c->chrMmx2FilterPos=NULL;
|
||||||
|
|
||||||
free(c);
|
free(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -69,6 +69,10 @@ typedef struct SwsContext{
|
|||||||
|
|
||||||
uint8_t __attribute__((aligned(32))) funnyYCode[10000];
|
uint8_t __attribute__((aligned(32))) funnyYCode[10000];
|
||||||
uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
|
uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
|
||||||
|
int32_t *lumMmx2FilterPos;
|
||||||
|
int32_t *chrMmx2FilterPos;
|
||||||
|
int16_t *lumMmx2Filter;
|
||||||
|
int16_t *chrMmx2Filter;
|
||||||
|
|
||||||
int canMMX2BeUsed;
|
int canMMX2BeUsed;
|
||||||
|
|
||||||
|
@ -2238,7 +2238,8 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
|
|||||||
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
|
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
|
||||||
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
|
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
|
||||||
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
|
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
|
||||||
int srcFormat, uint8_t *formatConvBuffer)
|
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
|
||||||
|
int32_t *mmx2FilterPos)
|
||||||
{
|
{
|
||||||
if(srcFormat==IMGFMT_YUY2)
|
if(srcFormat==IMGFMT_YUY2)
|
||||||
{
|
{
|
||||||
@ -2294,35 +2295,21 @@ static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, in
|
|||||||
{
|
{
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pxor %%mm7, %%mm7 \n\t"
|
"pxor %%mm7, %%mm7 \n\t"
|
||||||
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
|
"movl %0, %%ecx \n\t"
|
||||||
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
|
"movl %1, %%edi \n\t"
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
"movl %2, %%edx \n\t"
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
"movl %3, %%ebx \n\t"
|
||||||
"movq %%mm6, %%mm2 \n\t"
|
|
||||||
"psllq $16, %%mm2 \n\t"
|
|
||||||
"paddw %%mm6, %%mm2 \n\t"
|
|
||||||
"psllq $16, %%mm2 \n\t"
|
|
||||||
"paddw %%mm6, %%mm2 \n\t"
|
|
||||||
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
|
|
||||||
"movq %%mm2, %%mm4 \n\t"
|
|
||||||
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
|
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
|
||||||
"xorl %%eax, %%eax \n\t" // i
|
"xorl %%eax, %%eax \n\t" // i
|
||||||
"movl %0, %%esi \n\t" // src
|
PREFETCH" (%%ecx) \n\t"
|
||||||
"movl %1, %%edi \n\t" // buf1
|
PREFETCH" 32(%%ecx) \n\t"
|
||||||
"movl %3, %%edx \n\t" // (xInc*4)>>16
|
PREFETCH" 64(%%ecx) \n\t"
|
||||||
"xorl %%ecx, %%ecx \n\t"
|
|
||||||
"xorl %%ebx, %%ebx \n\t"
|
|
||||||
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
|
|
||||||
|
|
||||||
#define FUNNY_Y_CODE \
|
#define FUNNY_Y_CODE \
|
||||||
PREFETCH" 1024(%%esi) \n\t"\
|
"movl (%%ebx), %%esi \n\t"\
|
||||||
PREFETCH" 1056(%%esi) \n\t"\
|
"call *%4 \n\t"\
|
||||||
PREFETCH" 1088(%%esi) \n\t"\
|
"addl (%%ebx, %%eax), %%ecx \n\t"\
|
||||||
"call *%6 \n\t"\
|
"addl %%eax, %%edi \n\t"\
|
||||||
"movq %%mm4, %%mm2 \n\t"\
|
"xorl %%eax, %%eax \n\t"\
|
||||||
"xorl %%ecx, %%ecx \n\t"
|
|
||||||
|
|
||||||
FUNNY_Y_CODE
|
FUNNY_Y_CODE
|
||||||
FUNNY_Y_CODE
|
FUNNY_Y_CODE
|
||||||
@ -2333,8 +2320,8 @@ FUNNY_Y_CODE
|
|||||||
FUNNY_Y_CODE
|
FUNNY_Y_CODE
|
||||||
FUNNY_Y_CODE
|
FUNNY_Y_CODE
|
||||||
|
|
||||||
:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
|
:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
|
||||||
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
|
"m" (funnyYCode)
|
||||||
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
|
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
|
||||||
);
|
);
|
||||||
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
|
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
|
||||||
@ -2402,7 +2389,8 @@ FUNNY_Y_CODE
|
|||||||
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
|
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
|
||||||
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
|
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
|
||||||
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
|
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
|
||||||
int srcFormat, uint8_t *formatConvBuffer)
|
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
|
||||||
|
int32_t *mmx2FilterPos)
|
||||||
{
|
{
|
||||||
if(srcFormat==IMGFMT_YUY2)
|
if(srcFormat==IMGFMT_YUY2)
|
||||||
{
|
{
|
||||||
@ -2469,65 +2457,44 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, u
|
|||||||
if(canMMX2BeUsed)
|
if(canMMX2BeUsed)
|
||||||
{
|
{
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pxor %%mm7, %%mm7 \n\t"
|
"pxor %%mm7, %%mm7 \n\t"
|
||||||
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
|
"movl %0, %%ecx \n\t"
|
||||||
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
|
"movl %1, %%edi \n\t"
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
"movl %2, %%edx \n\t"
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
"movl %3, %%ebx \n\t"
|
||||||
"movq %%mm6, %%mm2 \n\t"
|
"xorl %%eax, %%eax \n\t" // i
|
||||||
"psllq $16, %%mm2 \n\t"
|
PREFETCH" (%%ecx) \n\t"
|
||||||
"paddw %%mm6, %%mm2 \n\t"
|
PREFETCH" 32(%%ecx) \n\t"
|
||||||
"psllq $16, %%mm2 \n\t"
|
PREFETCH" 64(%%ecx) \n\t"
|
||||||
"paddw %%mm6, %%mm2 \n\t"
|
|
||||||
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
|
|
||||||
"movq %%mm2, %%mm4 \n\t"
|
|
||||||
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
|
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
|
||||||
"punpcklwd %%mm6, %%mm6 \n\t"
|
|
||||||
"xorl %%eax, %%eax \n\t" // i
|
|
||||||
"movl %0, %%esi \n\t" // src
|
|
||||||
"movl %1, %%edi \n\t" // buf1
|
|
||||||
"movl %3, %%edx \n\t" // (xInc*4)>>16
|
|
||||||
"xorl %%ecx, %%ecx \n\t"
|
|
||||||
"xorl %%ebx, %%ebx \n\t"
|
|
||||||
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
|
|
||||||
|
|
||||||
#define FUNNYUVCODE \
|
#define FUNNY_UV_CODE \
|
||||||
PREFETCH" 1024(%%esi) \n\t"\
|
"movl (%%ebx), %%esi \n\t"\
|
||||||
PREFETCH" 1056(%%esi) \n\t"\
|
"call *%4 \n\t"\
|
||||||
PREFETCH" 1088(%%esi) \n\t"\
|
"addl (%%ebx, %%eax), %%ecx \n\t"\
|
||||||
"call *%7 \n\t"\
|
"addl %%eax, %%edi \n\t"\
|
||||||
"movq %%mm4, %%mm2 \n\t"\
|
"xorl %%eax, %%eax \n\t"\
|
||||||
"xorl %%ecx, %%ecx \n\t"
|
|
||||||
|
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
|
"xorl %%eax, %%eax \n\t" // i
|
||||||
|
"movl %5, %%ecx \n\t" // src
|
||||||
|
"movl %1, %%edi \n\t" // buf1
|
||||||
|
"addl $4096, %%edi \n\t"
|
||||||
|
PREFETCH" (%%ecx) \n\t"
|
||||||
|
PREFETCH" 32(%%ecx) \n\t"
|
||||||
|
PREFETCH" 64(%%ecx) \n\t"
|
||||||
|
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
FUNNYUVCODE
|
FUNNY_UV_CODE
|
||||||
"xorl %%eax, %%eax \n\t" // i
|
|
||||||
"movl %6, %%esi \n\t" // src
|
|
||||||
"movl %1, %%edi \n\t" // buf1
|
|
||||||
"addl $4096, %%edi \n\t"
|
|
||||||
|
|
||||||
FUNNYUVCODE
|
:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
|
||||||
FUNNYUVCODE
|
"m" (funnyUVCode), "m" (src2)
|
||||||
FUNNYUVCODE
|
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
|
||||||
FUNNYUVCODE
|
);
|
||||||
|
|
||||||
FUNNYUVCODE
|
|
||||||
FUNNYUVCODE
|
|
||||||
FUNNYUVCODE
|
|
||||||
FUNNYUVCODE
|
|
||||||
|
|
||||||
:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
|
|
||||||
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
|
|
||||||
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
|
|
||||||
);
|
|
||||||
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
|
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
|
||||||
{
|
{
|
||||||
// printf("%d %d %d\n", dstWidth, i, srcW);
|
// printf("%d %d %d\n", dstWidth, i, srcW);
|
||||||
@ -2749,7 +2716,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
|
|||||||
// printf("%d %d\n", lumBufIndex, vLumBufSize);
|
// printf("%d %d\n", lumBufIndex, vLumBufSize);
|
||||||
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
||||||
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
||||||
funnyYCode, c->srcFormat, formatConvBuffer);
|
funnyYCode, c->srcFormat, formatConvBuffer,
|
||||||
|
c->lumMmx2Filter, c->lumMmx2FilterPos);
|
||||||
lastInLumBuf++;
|
lastInLumBuf++;
|
||||||
}
|
}
|
||||||
while(lastInChrBuf < lastChrSrcY)
|
while(lastInChrBuf < lastChrSrcY)
|
||||||
@ -2763,7 +2731,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
|
|||||||
//FIXME replace parameters through context struct (some at least)
|
//FIXME replace parameters through context struct (some at least)
|
||||||
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
|
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
|
||||||
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
||||||
funnyUVCode, c->srcFormat, formatConvBuffer);
|
funnyUVCode, c->srcFormat, formatConvBuffer,
|
||||||
|
c->chrMmx2Filter, c->chrMmx2FilterPos);
|
||||||
lastInChrBuf++;
|
lastInChrBuf++;
|
||||||
}
|
}
|
||||||
//wrap buf index around to stay inside the ring buffer
|
//wrap buf index around to stay inside the ring buffer
|
||||||
@ -2787,7 +2756,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
|
|||||||
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
|
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
|
||||||
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
||||||
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
||||||
funnyYCode, c->srcFormat, formatConvBuffer);
|
funnyYCode, c->srcFormat, formatConvBuffer,
|
||||||
|
c->lumMmx2Filter, c->lumMmx2FilterPos);
|
||||||
lastInLumBuf++;
|
lastInLumBuf++;
|
||||||
}
|
}
|
||||||
while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
|
while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
|
||||||
@ -2800,7 +2770,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
|
|||||||
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
|
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
|
||||||
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
|
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
|
||||||
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
||||||
funnyUVCode, c->srcFormat, formatConvBuffer);
|
funnyUVCode, c->srcFormat, formatConvBuffer,
|
||||||
|
c->chrMmx2Filter, c->chrMmx2FilterPos);
|
||||||
lastInChrBuf++;
|
lastInChrBuf++;
|
||||||
}
|
}
|
||||||
//wrap buf index around to stay inside the ring buffer
|
//wrap buf index around to stay inside the ring buffer
|
||||||
|
Loading…
Reference in New Issue
Block a user