mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-20 14:20:51 +00:00
10000l fix and use more mmx2/3dnow code for mpeg4 qpel, which was written and committed a long time ago but apparently never used; qpel motion compensation is 5% faster now.
Originally committed as revision 3435 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
ff158dc95d
commit
d6af6b0350
@ -1334,7 +1334,7 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
||||
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
|
||||
int i=0;
|
||||
uint8_t l, lt;
|
||||
|
||||
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movq -1(%1, %0), %%mm0 \n\t" // LT
|
||||
@ -2046,7 +2046,7 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
|
||||
uint64_t temp[8];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
@ -2057,14 +2057,14 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
|
||||
uint64_t temp[8];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[8];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
@ -2075,43 +2075,43 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
|
||||
uint64_t temp[8];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
@ -2119,7 +2119,7 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
@ -2127,20 +2127,20 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
||||
OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
||||
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[8 + 9];\
|
||||
uint8_t * const halfH= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
|
||||
put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
||||
OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
|
||||
}\
|
||||
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
@ -2157,7 +2157,7 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride
|
||||
uint64_t temp[32];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
@ -2168,14 +2168,14 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride
|
||||
uint64_t temp[32];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[32];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
@ -2186,43 +2186,43 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride
|
||||
uint64_t temp[32];\
|
||||
uint8_t * const half= (uint8_t*)temp;\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
@ -2230,7 +2230,7 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[16*2 + 17*2];\
|
||||
@ -2238,20 +2238,20 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride
|
||||
uint8_t * const halfHV= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
||||
OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
||||
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t half[17*2];\
|
||||
uint8_t * const halfH= ((uint8_t*)half);\
|
||||
put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
|
||||
put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
||||
OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
|
||||
}\
|
||||
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
|
@ -53,9 +53,19 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
|
||||
:"%eax", "memory");
|
||||
}
|
||||
|
||||
static __attribute__((unused)) void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $8, %2 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
@ -80,9 +90,142 @@ static __attribute__((unused)) void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *s
|
||||
"addl $32, %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
/*
 * put_no_rnd_pixels8_l2: for each of h rows, combines 8 bytes read from src1
 * with 8 bytes read from src2 and stores the 8-byte result to dst.
 *
 * dst        - output; advanced by dstStride bytes after every row
 * src1       - first input; advanced by src1Stride bytes after every row
 * src2       - second input; consumed at a fixed 8 bytes per row
 * h          - row count. If h is odd, one row is processed on its own first.
 *              The loop at label 1 then handles 4 rows per pass and only
 *              exits when the counter reaches exactly 0, so the count left
 *              for the loop must be a non-zero multiple of 4.
 *
 * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride.
 *
 * mm6 is set to all ones; every input and every PAVGB result is XORed with
 * it, so the average is computed on the bitwise complements and complemented
 * back. PAVGB is a macro defined outside this view; presumably a round-up
 * byte average, which would make this the round-down (no_rnd) variant.
 * TODO confirm against the PAVGB definition.
 *
 * NOTE(review): mm0-mm3 and mm6 are written, but the clobber list names only
 * memory. The pointer operands are stepped with 32-bit addl.
 */
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
/* mm6 = all ones, used as the XOR (complement) mask below */
"pcmpeqb %%mm6, %%mm6 \n\t"
|
||||
/* if h is odd, process a single row here and decrement h */
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $8, %2 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
/* main loop: 4 rows per pass, handled as two groups of 2 rows */
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%2), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm2, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq %%mm1, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq 16(%2), %%mm2 \n\t"
|
||||
"movq 24(%2), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm2, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq %%mm1, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
/* the 4 rows above consumed 4 x 8 = 32 bytes of src2 */
"addl $32, %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
/* h is a memory operand when PIC is defined and is pinned to ebx otherwise */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
/*
 * avg_pixels8_l2: for each of h rows, applies PAVGB to 8 bytes of src1 and
 * 8 bytes of src2, applies PAVGB again between that result and the 8 bytes
 * already present in dst, and writes the outcome back to dst.
 *
 * dst        - read-modify-write output; advanced by dstStride per row
 * src1       - first input; advanced by src1Stride bytes per row
 * src2       - second input; consumed at a fixed 8 bytes per row
 * h          - row count. If h is odd, one row is processed on its own first.
 *              The loop at label 1 then handles 4 rows per pass and only
 *              exits when the counter reaches exactly 0, so the count left
 *              for the loop must be a non-zero multiple of 4.
 *
 * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride.
 *
 * PAVGB is a macro defined outside this view; presumably a byte-wise
 * rounding average instruction. TODO confirm against its definition.
 *
 * NOTE(review): mm0 and mm1 are written, but the clobber list names only
 * memory. The pointer operands are stepped with 32-bit addl.
 */
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
/* if h is odd, process a single row here and decrement h */
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $8, %2 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
/* main loop: 4 rows per pass, handled as two groups of 2 rows */
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" 8(%2), %%mm1 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
PAVGB" (%3), %%mm1 \n\t"
|
||||
"movq %%mm1, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
PAVGB" 16(%2), %%mm0 \n\t"
|
||||
PAVGB" 24(%2), %%mm1 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
PAVGB" (%3), %%mm1 \n\t"
|
||||
"movq %%mm1, (%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
/* the 4 rows above consumed 4 x 8 = 32 bytes of src2 */
"addl $32, %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
/* h is a memory operand when PIC is defined and is pinned to ebx otherwise */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ -125,9 +268,21 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
:"%eax", "memory");
|
||||
}
|
||||
|
||||
static __attribute__((unused)) void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" 8(%2), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $16, %2 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
@ -148,9 +303,144 @@ static __attribute__((unused)) void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *
|
||||
"addl $32, %2 \n\t"
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
/*
 * avg_pixels16_l2: 16-byte-wide counterpart of the 8-byte l2 averaging
 * routine. For each of h rows it applies PAVGB to 16 bytes of src1 and
 * 16 bytes of src2 (as two 8-byte halves), applies PAVGB again between that
 * result and the 16 bytes already in dst, and writes the outcome to dst.
 *
 * dst        - read-modify-write output; advanced by dstStride per row
 * src1       - first input; advanced by src1Stride bytes per row
 * src2       - second input; consumed at a fixed 16 bytes per row
 * h          - row count. If h is odd, one row is processed on its own first.
 *              The loop at label 1 then handles 2 rows per pass and only
 *              exits when the counter reaches exactly 0, so the count left
 *              for the loop must be a non-zero even number.
 *
 * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride.
 *
 * PAVGB is a macro defined outside this view; presumably a byte-wise
 * rounding average instruction. TODO confirm against its definition.
 *
 * NOTE(review): mm0 and mm1 are written, but the clobber list names only
 * memory. The pointer operands are stepped with 32-bit addl.
 */
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
/* if h is odd, process a single row here and decrement h */
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" 8(%2), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $16, %2 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
PAVGB" 8(%3), %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
/* main loop: 2 rows per pass */
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" 8(%2), %%mm1 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
PAVGB" 8(%3), %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
PAVGB" 16(%2), %%mm0 \n\t"
|
||||
PAVGB" 24(%2), %%mm1 \n\t"
|
||||
PAVGB" (%3), %%mm0 \n\t"
|
||||
PAVGB" 8(%3), %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
/* the 2 rows above consumed 2 x 16 = 32 bytes of src2 */
"addl $32, %2 \n\t"
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
/* h is a memory operand when PIC is defined and is pinned to ebx otherwise */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
/*
 * put_no_rnd_pixels16_l2: for each of h rows, combines 16 bytes read from
 * src1 with 16 bytes read from src2 (as two 8-byte halves) and stores the
 * 16-byte result to dst.
 *
 * dst        - output; advanced by dstStride bytes after every row
 * src1       - first input; advanced by src1Stride bytes after every row
 * src2       - second input; consumed at a fixed 16 bytes per row
 * h          - row count. If h is odd, one row is processed on its own first.
 *              The loop at label 1 then handles 2 rows per pass and only
 *              exits when the counter reaches exactly 0, so the count left
 *              for the loop must be a non-zero even number.
 *
 * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride.
 *
 * mm6 is set to all ones; every input and every PAVGB result is XORed with
 * it, so the average is computed on the bitwise complements and complemented
 * back. PAVGB is a macro defined outside this view; presumably a round-up
 * byte average, which would make this the round-down (no_rnd) variant.
 * TODO confirm against the PAVGB definition.
 *
 * NOTE(review): mm0-mm3 and mm6 are written, but the clobber list names only
 * memory. The pointer operands are stepped with 32-bit addl.
 */
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
__asm __volatile(
|
||||
/* mm6 = all ones, used as the XOR (complement) mask below */
"pcmpeqb %%mm6, %%mm6\n\t"
|
||||
/* if h is odd, process a single row here and decrement h */
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
"movq (%2), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm2, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"addl $16, %2 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
/* main loop: 2 rows per pass */
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq (%2), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm2, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 8(%1), %%mm1 \n\t"
|
||||
"addl %4, %1 \n\t"
|
||||
"movq 16(%2), %%mm2 \n\t"
|
||||
"movq 24(%2), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm2, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm0, (%3) \n\t"
|
||||
"movq %%mm1, 8(%3) \n\t"
|
||||
"addl %5, %3 \n\t"
|
||||
/* the 2 rows above consumed 2 x 16 = 32 bytes of src2 */
"addl $32, %2 \n\t"
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
/* h is a memory operand when PIC is defined and is pinned to ebx otherwise */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"(src1Stride), "D"(dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
:"r"(src1Stride), "r"(dstStride)
|
||||
:"memory");*/
|
||||
}
|
||||
|
||||
/* GL: this function rounds incorrectly if an overflow occurs */
|
||||
|
Loading…
Reference in New Issue
Block a user