mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-21 06:50:56 +00:00
15% faster put_h264_qpel16_hv_lowpass_mmx2
Originally committed as revision 4949 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
1a44a8b654
commit
0331f09237
@ -630,9 +630,9 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
|
||||
dst += 4-8*dstStride;\
|
||||
}\
|
||||
}\
|
||||
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
|
||||
int h=8;\
|
||||
int w=4;\
|
||||
static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
|
||||
int h = size;\
|
||||
int w = (size+8)>>2;\
|
||||
src -= 2*srcStride+2;\
|
||||
while(w--){\
|
||||
asm volatile(\
|
||||
@ -652,23 +652,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm3 \n\t"\
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"\
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
|
||||
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
|
||||
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
|
||||
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
|
||||
\
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
|
||||
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
|
||||
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
|
||||
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
|
||||
: "+a"(src)\
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(size==16){\
|
||||
asm volatile(\
|
||||
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
|
||||
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
|
||||
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
|
||||
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
|
||||
: "+a"(src)\
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
|
||||
: "memory"\
|
||||
);\
|
||||
tmp += 4;\
|
||||
src += 4 - 13*srcStride;\
|
||||
}\
|
||||
tmp -= 4*4;\
|
||||
tmp += 4;\
|
||||
src += 4 - (size+5)*srcStride;\
|
||||
}\
|
||||
tmp -= size+8;\
|
||||
w = size>>4;\
|
||||
do{\
|
||||
h = size;\
|
||||
asm volatile(\
|
||||
"movq %4, %%mm6 \n\t"\
|
||||
"1: \n\t"\
|
||||
@ -702,7 +719,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
|
||||
"psraw $6, %%mm3 \n\t"\
|
||||
"packuswb %%mm3, %%mm0 \n\t"\
|
||||
OP(%%mm0, (%1),%%mm7, q)\
|
||||
"add $32, %0 \n\t"\
|
||||
"add $48, %0 \n\t"\
|
||||
"add %3, %1 \n\t"\
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
@ -710,6 +727,9 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
|
||||
: "S"((long)dstStride), "m"(ff_pw_32)\
|
||||
: "memory"\
|
||||
);\
|
||||
tmp += 8 - size*24;\
|
||||
dst += 8 - size*dstStride;\
|
||||
}while(w--);\
|
||||
}\
|
||||
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
|
||||
OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
|
||||
@ -729,13 +749,12 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
|
||||
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 8-wide entry point: thin wrapper over the shared 8or16 routine */\
|
||||
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8); /* all six arguments forwarded unchanged, size fixed at 8 */\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* NOTE(review): this listing shows both sides of the change without +/- markers; the hunk header counts 13 old and 12 new lines, so the four 8-wide calls and the two pointer bumps below appear to be the old body and the final size-16 call its replacement - confirm against the repository */\
|
||||
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride); /* 8-wide call at dst/src as passed in */\
|
||||
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride); /* 8-wide call offset by 8 bytes in dst and src, same tmp buffer */\
|
||||
src += 8*srcStride; /* step 8 rows down in the source */\
|
||||
dst += 8*dstStride; /* step 8 rows down in the destination */\
|
||||
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride); /* same two 8-wide calls repeated after the 8-row step */\
|
||||
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
|
||||
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16); /* single call into the shared routine with size 16 */\
|
||||
}\
|
||||
|
||||
#define H264_MC(OPNAME, SIZE, MMX) \
|
||||
@ -816,13 +835,13 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* mc22 case: one call into the SIZE-specific hv lowpass using a local scratch buffer */\
|
||||
uint64_t temp[SIZE*(SIZE+8)/4]; /* NOTE(review): two declarations of temp are listed because this is a diff without +/- markers; this one (SIZE*(SIZE+8) int16 values) appears to be the old sizing - confirm */\
|
||||
uint64_t temp[SIZE*(SIZE<8?12:24)/4]; /* each uint64_t holds four int16 values, so this is SIZE*12 int16 when SIZE<8 and SIZE*24 otherwise */\
|
||||
int16_t * const tmp= (int16_t*)temp; /* int16 view of the scratch buffer */\
|
||||
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride); /* stride is passed as both dstStride and srcStride, SIZE as tmpStride */\
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
||||
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
||||
uint8_t * const halfH= (uint8_t*)temp;\
|
||||
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
||||
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
||||
@ -832,7 +851,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
||||
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
||||
uint8_t * const halfH= (uint8_t*)temp;\
|
||||
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
||||
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
||||
@ -842,7 +861,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
||||
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
||||
uint8_t * const halfV= (uint8_t*)temp;\
|
||||
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
||||
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
||||
@ -852,7 +871,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
|
||||
}\
|
||||
\
|
||||
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
||||
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
||||
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
||||
uint8_t * const halfV= (uint8_t*)temp;\
|
||||
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
||||
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
||||
|
Loading…
Reference in New Issue
Block a user