faster mmx2 / 3dnow deblocking filter

brightness_debug (draws luminance histogram & autodetected white/black level)

Originally committed as revision 3014 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2001-11-19 22:20:30 +00:00
parent 644d98a4f2
commit 7f16f6e64e
2 changed files with 666 additions and 16 deletions

View File

@ -21,11 +21,11 @@
isVertDC Ec Ec
isVertMinMaxOk Ec Ec
doVertLowPass E e e
doVertDefFilter Ec Ec Ec
doVertDefFilter Ec Ec e e
isHorizDC Ec Ec
isHorizMinMaxOk a E
doHorizLowPass E e e
doHorizDefFilter Ec Ec Ec
doHorizDefFilter Ec Ec e e
deRing E e e*
Vertical RKAlgo1 E a a
Horizontal RKAlgo1 a a
@ -63,8 +63,6 @@ optimize c versions
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
smart blur
...
Notes:
*/
//Changelog: use the CVS log
@ -80,6 +78,7 @@ Notes:
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#define DEBUG_BRIGHTNESS
#include "postprocess.h"
#define MIN(a,b) ((a) > (b) ? (b) : (a))
@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/*
uint8_t tmp[16];
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= (int)tmp - (int)src - stride*3;
const int l5= (int)tmp - (int)src - stride*3 + 8;
const int l6= stride*3 + l3;
const int l7= stride + l6;
const int l8= stride + l7;
memcpy(tmp, src+stride*7, 8);
memcpy(tmp+8, src+stride*8, 8);
*/
src+= stride*4;
//FIXME try pmul for *5 stuff
// src[0]=0;
asm volatile(
#if 0 //slightly more accurate and slightly slower
"pxor %%mm7, %%mm7 \n\t" // 0
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
"movq (%0, %1, 2), %%mm0 \n\t" // l2
"movq (%0), %%mm1 \n\t" // l0
"movq %%mm0, %%mm2 \n\t" // l2
PAVGB(%%mm7, %%mm0) // ~l2/2
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
"movq (%%eax), %%mm1 \n\t" // l1
"movq (%%eax, %1, 2), %%mm3 \n\t" // l3
"movq %%mm1, %%mm4 \n\t" // l1
PAVGB(%%mm7, %%mm1) // ~l1/2
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
"psubusb %%mm1, %%mm0 \n\t"
"psubusb %%mm4, %%mm1 \n\t"
"por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"movq %%mm0, %%mm4 \n\t" // l4
PAVGB(%%mm7, %%mm0) // ~l4/2
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
"movq (%%ebx), %%mm2 \n\t" // l5
"movq %%mm3, %%mm5 \n\t" // l3
PAVGB(%%mm7, %%mm3) // ~l3/2
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
"psubusb %%mm3, %%mm0 \n\t"
"psubusb %%mm6, %%mm3 \n\t"
"por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
"movq (%%ebx, %1), %%mm6 \n\t" // l6
"movq %%mm6, %%mm5 \n\t" // l6
PAVGB(%%mm7, %%mm6) // ~l6/2
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
"movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
"movq %%mm2, %%mm4 \n\t" // l5
PAVGB(%%mm7, %%mm2) // ~l5/2
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
"psubusb %%mm2, %%mm6 \n\t"
"psubusb %%mm4, %%mm2 \n\t"
"por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
"movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
"paddusb b01, %%mm4 \n\t"
"pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
"psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
"pand %%mm4, %%mm3 \n\t"
"movq %%mm3, %%mm1 \n\t"
// "psubusb b01, %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm7, %%mm3)
"paddusb %%mm1, %%mm3 \n\t"
// "paddusb b01, %%mm3 \n\t"
"movq (%%eax, %1, 2), %%mm6 \n\t" //l3
"movq (%0, %1, 4), %%mm5 \n\t" //l4
"movq (%0, %1, 4), %%mm4 \n\t" //l4
"psubusb %%mm6, %%mm5 \n\t"
"psubusb %%mm4, %%mm6 \n\t"
"por %%mm6, %%mm5 \n\t" // |l3-l4|
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
"pxor %%mm6, %%mm0 \n\t"
"pand %%mm0, %%mm3 \n\t"
PMINUB(%%mm5, %%mm3, %%mm0)
"psubusb b01, %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
"movq (%%eax, %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubb %%mm3, %%mm0 \n\t"
"paddb %%mm3, %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
#endif
"leal (%0, %1), %%eax \n\t"
"pcmpeqb %%mm6, %%mm6 \n\t" // -1
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
"movq (%%eax, %1, 2), %%mm1 \n\t" // l3
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"pxor %%mm6, %%mm1 \n\t" // -l3-1
PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q
"movq (%%eax, %1, 4), %%mm2 \n\t" // l5
"movq (%%eax, %1), %%mm3 \n\t" // l2
"pxor %%mm6, %%mm2 \n\t" // -l5-1
"movq %%mm2, %%mm5 \n\t" // -l5-1
"movq b80, %%mm4 \n\t" // 128
"leal (%%eax, %1, 4), %%ebx \n\t"
PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
"movq (%%eax), %%mm2 \n\t" // l1
"pxor %%mm6, %%mm2 \n\t" // -l1-1
PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
PAVGB((%0), %%mm1) // (l0-l3+256)/2
"movq b80, %%mm3 \n\t" // 128
PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
"movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
"pxor %%mm6, %%mm1 \n\t" // -l7-1
PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
"movq b80, %%mm2 \n\t" // 128
PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
"movq b00, %%mm1 \n\t" // 0
"movq b00, %%mm5 \n\t" // 0
"psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
"psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
"movq b00, %%mm7 \n\t" // 0
"movq pQPb, %%mm2 \n\t" // QP
PAVGB(%%mm6, %%mm2) // 128 + QP/2
"psubb %%mm6, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
"pxor %%mm1, %%mm4 \n\t"
"psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
"pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
"psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
"movq %%mm4, %%mm3 \n\t" // d
"psubusb b01, %%mm4 \n\t"
PAVGB(%%mm7, %%mm4) // d/32
PAVGB(%%mm7, %%mm4) // (d + 32)/64
"paddb %%mm3, %%mm4 \n\t" // 5d/64
"pand %%mm2, %%mm4 \n\t"
"movq b80, %%mm5 \n\t" // 128
"psubb %%mm0, %%mm5 \n\t" // q
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
"pxor %%mm7, %%mm5 \n\t"
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
"pand %%mm7, %%mm4 \n\t"
"movq (%%eax, %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"paddb %%mm4, %%mm0 \n\t"
"psubb %%mm4, %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
:
: "r" (src), "r" (stride)
: "%eax", "%ebx"
);
/*
{
int x;
src-= stride;
for(x=0; x<BLOCK_SIZE; x++)
{
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
if(ABS(middleEnergy)< 8*QP)
{
const int q=(src[l4] - src[l5])/2;
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
d= MAX(d, 0);
d= (5*d + 32) >> 6;
d*= SIGN(-middleEnergy);
if(q>0)
{
d= d<0 ? 0 : d;
d= d>q ? q : d;
}
else
{
d= d>0 ? 0 : d;
d= d<q ? q : d;
}
src[l4]-= d;
src[l5]+= d;
}
src++;
}
src-=8;
for(x=0; x<8; x++)
{
int y;
for(y=4; y<6; y++)
{
int d= src[x+y*stride] - tmp[x+(y-4)*8];
int ad= ABS(d);
static int max=0;
static int sum=0;
static int num=0;
static int bias=0;
if(max<ad) max=ad;
sum+= ad>3 ? 1 : 0;
if(ad>3)
{
src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
}
if(y==4) bias+=d;
num++;
if(num%1000000 == 0)
{
printf(" %d %d %d %d\n", num, sum, max, bias);
}
}
}
}
*/
#elif defined (HAVE_MMX)
src+= stride*4;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"leal (%0, %1), %%eax \n\t"
@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
uint8_t *dstBlock= &(dst[y*dstStride]);
memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
}
}
/*
for(x=0; x<width; x+=32)
{
int i;
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]
+ dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride]
+ dstBlock[x +15*dstStride];
}
*/ }
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
#ifdef DEBUG_BRIGHTNESS
if(!isColor)
{
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++)
{
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2)
{
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
}
}
#endif
}

View File

@ -21,11 +21,11 @@
isVertDC Ec Ec
isVertMinMaxOk Ec Ec
doVertLowPass E e e
doVertDefFilter Ec Ec Ec
doVertDefFilter Ec Ec e e
isHorizDC Ec Ec
isHorizMinMaxOk a E
doHorizLowPass E e e
doHorizDefFilter Ec Ec Ec
doHorizDefFilter Ec Ec e e
deRing E e e*
Vertical RKAlgo1 E a a
Horizontal RKAlgo1 a a
@ -63,8 +63,6 @@ optimize c versions
try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
smart blur
...
Notes:
*/
//Changelog: use the CVS log
@ -80,6 +78,7 @@ Notes:
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#define DEBUG_BRIGHTNESS
#include "postprocess.h"
#define MIN(a,b) ((a) > (b) ? (b) : (a))
@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/*
uint8_t tmp[16];
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= (int)tmp - (int)src - stride*3;
const int l5= (int)tmp - (int)src - stride*3 + 8;
const int l6= stride*3 + l3;
const int l7= stride + l6;
const int l8= stride + l7;
memcpy(tmp, src+stride*7, 8);
memcpy(tmp+8, src+stride*8, 8);
*/
src+= stride*4;
//FIXME try pmul for *5 stuff
// src[0]=0;
asm volatile(
#if 0 //slightly more accurate and slightly slower
"pxor %%mm7, %%mm7 \n\t" // 0
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
"movq (%0, %1, 2), %%mm0 \n\t" // l2
"movq (%0), %%mm1 \n\t" // l0
"movq %%mm0, %%mm2 \n\t" // l2
PAVGB(%%mm7, %%mm0) // ~l2/2
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
"movq (%%eax), %%mm1 \n\t" // l1
"movq (%%eax, %1, 2), %%mm3 \n\t" // l3
"movq %%mm1, %%mm4 \n\t" // l1
PAVGB(%%mm7, %%mm1) // ~l1/2
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
"psubusb %%mm1, %%mm0 \n\t"
"psubusb %%mm4, %%mm1 \n\t"
"por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"movq %%mm0, %%mm4 \n\t" // l4
PAVGB(%%mm7, %%mm0) // ~l4/2
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
"movq (%%ebx), %%mm2 \n\t" // l5
"movq %%mm3, %%mm5 \n\t" // l3
PAVGB(%%mm7, %%mm3) // ~l3/2
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
"psubusb %%mm3, %%mm0 \n\t"
"psubusb %%mm6, %%mm3 \n\t"
"por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
"movq (%%ebx, %1), %%mm6 \n\t" // l6
"movq %%mm6, %%mm5 \n\t" // l6
PAVGB(%%mm7, %%mm6) // ~l6/2
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
"movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
"movq %%mm2, %%mm4 \n\t" // l5
PAVGB(%%mm7, %%mm2) // ~l5/2
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
"psubusb %%mm2, %%mm6 \n\t"
"psubusb %%mm4, %%mm2 \n\t"
"por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
"movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
"paddusb b01, %%mm4 \n\t"
"pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
"psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
"pand %%mm4, %%mm3 \n\t"
"movq %%mm3, %%mm1 \n\t"
// "psubusb b01, %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm7, %%mm3)
"paddusb %%mm1, %%mm3 \n\t"
// "paddusb b01, %%mm3 \n\t"
"movq (%%eax, %1, 2), %%mm6 \n\t" //l3
"movq (%0, %1, 4), %%mm5 \n\t" //l4
"movq (%0, %1, 4), %%mm4 \n\t" //l4
"psubusb %%mm6, %%mm5 \n\t"
"psubusb %%mm4, %%mm6 \n\t"
"por %%mm6, %%mm5 \n\t" // |l3-l4|
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
"pxor %%mm6, %%mm0 \n\t"
"pand %%mm0, %%mm3 \n\t"
PMINUB(%%mm5, %%mm3, %%mm0)
"psubusb b01, %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
"movq (%%eax, %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubb %%mm3, %%mm0 \n\t"
"paddb %%mm3, %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
#endif
"leal (%0, %1), %%eax \n\t"
"pcmpeqb %%mm6, %%mm6 \n\t" // -1
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
"movq (%%eax, %1, 2), %%mm1 \n\t" // l3
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"pxor %%mm6, %%mm1 \n\t" // -l3-1
PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q
"movq (%%eax, %1, 4), %%mm2 \n\t" // l5
"movq (%%eax, %1), %%mm3 \n\t" // l2
"pxor %%mm6, %%mm2 \n\t" // -l5-1
"movq %%mm2, %%mm5 \n\t" // -l5-1
"movq b80, %%mm4 \n\t" // 128
"leal (%%eax, %1, 4), %%ebx \n\t"
PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
"movq (%%eax), %%mm2 \n\t" // l1
"pxor %%mm6, %%mm2 \n\t" // -l1-1
PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
PAVGB((%0), %%mm1) // (l0-l3+256)/2
"movq b80, %%mm3 \n\t" // 128
PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
"movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
"pxor %%mm6, %%mm1 \n\t" // -l7-1
PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
"movq b80, %%mm2 \n\t" // 128
PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
"movq b00, %%mm1 \n\t" // 0
"movq b00, %%mm5 \n\t" // 0
"psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
"psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
"movq b00, %%mm7 \n\t" // 0
"movq pQPb, %%mm2 \n\t" // QP
PAVGB(%%mm6, %%mm2) // 128 + QP/2
"psubb %%mm6, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
"pxor %%mm1, %%mm4 \n\t"
"psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
"pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
"psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
"movq %%mm4, %%mm3 \n\t" // d
"psubusb b01, %%mm4 \n\t"
PAVGB(%%mm7, %%mm4) // d/32
PAVGB(%%mm7, %%mm4) // (d + 32)/64
"paddb %%mm3, %%mm4 \n\t" // 5d/64
"pand %%mm2, %%mm4 \n\t"
"movq b80, %%mm5 \n\t" // 128
"psubb %%mm0, %%mm5 \n\t" // q
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
"pxor %%mm7, %%mm5 \n\t"
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
"pand %%mm7, %%mm4 \n\t"
"movq (%%eax, %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"paddb %%mm4, %%mm0 \n\t"
"psubb %%mm4, %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"movq %%mm0, (%%eax, %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
:
: "r" (src), "r" (stride)
: "%eax", "%ebx"
);
/*
{
int x;
src-= stride;
for(x=0; x<BLOCK_SIZE; x++)
{
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
if(ABS(middleEnergy)< 8*QP)
{
const int q=(src[l4] - src[l5])/2;
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
d= MAX(d, 0);
d= (5*d + 32) >> 6;
d*= SIGN(-middleEnergy);
if(q>0)
{
d= d<0 ? 0 : d;
d= d>q ? q : d;
}
else
{
d= d>0 ? 0 : d;
d= d<q ? q : d;
}
src[l4]-= d;
src[l5]+= d;
}
src++;
}
src-=8;
for(x=0; x<8; x++)
{
int y;
for(y=4; y<6; y++)
{
int d= src[x+y*stride] - tmp[x+(y-4)*8];
int ad= ABS(d);
static int max=0;
static int sum=0;
static int num=0;
static int bias=0;
if(max<ad) max=ad;
sum+= ad>3 ? 1 : 0;
if(ad>3)
{
src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
}
if(y==4) bias+=d;
num++;
if(num%1000000 == 0)
{
printf(" %d %d %d %d\n", num, sum, max, bias);
}
}
}
}
*/
#elif defined (HAVE_MMX)
src+= stride*4;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"leal (%0, %1), %%eax \n\t"
@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
uint8_t *dstBlock= &(dst[y*dstStride]);
memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
}
}
/*
for(x=0; x<width; x+=32)
{
int i;
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]
+ dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride]
+ dstBlock[x +15*dstStride];
}
*/ }
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
#ifdef DEBUG_BRIGHTNESS
if(!isColor)
{
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++)
{
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2)
{
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
}
}
#endif
}