mirror of https://git.ffmpeg.org/ffmpeg.git
New implementation of rgb32tobgr32
The previous implementation segfaulted with MMX enabled when fed an image smaller than the unit size that the MMX code processed.

The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL

Originally committed as revision 23009 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
parent
4f99f93268
commit
b38d487466
|
@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
|
||||||
|
|
||||||
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
|
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
|
||||||
{
|
{
|
||||||
|
uint8_t *d = dst, *s = (uint8_t *) src;
|
||||||
|
const uint8_t *end = s + src_size;
|
||||||
#ifdef HAVE_MMX
|
#ifdef HAVE_MMX
|
||||||
/* TODO: unroll this loop */
|
__asm __volatile(
|
||||||
asm volatile (
|
" "PREFETCH" (%1) \n"
|
||||||
"xor %%"REG_a", %%"REG_a" \n\t"
|
" movq %3, %%mm7 \n"
|
||||||
ASMALIGN(4)
|
" pxor %4, %%mm7 \n"
|
||||||
"1: \n\t"
|
" movq %%mm7, %%mm6 \n"
|
||||||
PREFETCH" 32(%0, %%"REG_a") \n\t"
|
" pxor %5, %%mm7 \n"
|
||||||
"movq (%0, %%"REG_a"), %%mm0 \n\t"
|
" jmp 2f \n"
|
||||||
"movq %%mm0, %%mm1 \n\t"
|
ASMALIGN(4)
|
||||||
"movq %%mm0, %%mm2 \n\t"
|
"1: \n"
|
||||||
"pslld $16, %%mm0 \n\t"
|
" "PREFETCH" 32(%1) \n"
|
||||||
"psrld $16, %%mm1 \n\t"
|
" movq (%1), %%mm0 \n"
|
||||||
"pand "MANGLE(mask32r)", %%mm0 \n\t"
|
" movq 8(%1), %%mm1 \n"
|
||||||
"pand "MANGLE(mask32g)", %%mm2 \n\t"
|
# ifdef HAVE_MMX2
|
||||||
"pand "MANGLE(mask32b)", %%mm1 \n\t"
|
" pshufw $177, %%mm0, %%mm3 \n"
|
||||||
"por %%mm0, %%mm2 \n\t"
|
" pshufw $177, %%mm1, %%mm5 \n"
|
||||||
"por %%mm1, %%mm2 \n\t"
|
" pand %%mm7, %%mm0 \n"
|
||||||
MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
|
" pand %%mm6, %%mm3 \n"
|
||||||
"add $8, %%"REG_a" \n\t"
|
" pand %%mm7, %%mm1 \n"
|
||||||
"cmp %2, %%"REG_a" \n\t"
|
" pand %%mm6, %%mm5 \n"
|
||||||
" jb 1b \n\t"
|
" por %%mm3, %%mm0 \n"
|
||||||
:: "r" (src), "r"(dst), "r" (src_size-7)
|
" por %%mm5, %%mm1 \n"
|
||||||
: "%"REG_a
|
# else
|
||||||
);
|
" movq %%mm0, %%mm2 \n"
|
||||||
|
" movq %%mm1, %%mm4 \n"
|
||||||
__asm __volatile(SFENCE:::"memory");
|
" pand %%mm7, %%mm0 \n"
|
||||||
__asm __volatile(EMMS:::"memory");
|
" pand %%mm6, %%mm2 \n"
|
||||||
#else
|
" pand %%mm7, %%mm1 \n"
|
||||||
unsigned i;
|
" pand %%mm6, %%mm4 \n"
|
||||||
unsigned num_pixels = src_size >> 2;
|
" movq %%mm2, %%mm3 \n"
|
||||||
for(i=0; i<num_pixels; i++)
|
" movq %%mm4, %%mm5 \n"
|
||||||
{
|
" pslld $16, %%mm2 \n"
|
||||||
#ifdef WORDS_BIGENDIAN
|
" psrld $16, %%mm3 \n"
|
||||||
dst[4*i + 1] = src[4*i + 3];
|
" pslld $16, %%mm4 \n"
|
||||||
dst[4*i + 2] = src[4*i + 2];
|
" psrld $16, %%mm5 \n"
|
||||||
dst[4*i + 3] = src[4*i + 1];
|
" por %%mm2, %%mm0 \n"
|
||||||
#else
|
" por %%mm4, %%mm1 \n"
|
||||||
dst[4*i + 0] = src[4*i + 2];
|
" por %%mm3, %%mm0 \n"
|
||||||
dst[4*i + 1] = src[4*i + 1];
|
" por %%mm5, %%mm1 \n"
|
||||||
dst[4*i + 2] = src[4*i + 0];
|
# endif
|
||||||
|
" "MOVNTQ" %%mm0, (%0) \n"
|
||||||
|
" "MOVNTQ" %%mm1, 8(%0) \n"
|
||||||
|
" add $16, %0 \n"
|
||||||
|
" add $16, %1 \n"
|
||||||
|
"2: \n"
|
||||||
|
" cmp %1, %2 \n"
|
||||||
|
" ja 1b \n"
|
||||||
|
" "SFENCE" \n"
|
||||||
|
" "EMMS" \n"
|
||||||
|
: "+r"(d), "+r"(s)
|
||||||
|
: "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
|
||||||
|
: "memory");
|
||||||
#endif
|
#endif
|
||||||
|
for (; s<end; s+=4, d+=4) {
|
||||||
|
int v = *(uint32_t *)s, g = v & 0xff00;
|
||||||
|
v &= 0xff00ff;
|
||||||
|
*(uint32_t *)d = (v>>16) + g + (v<<16);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
|
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
|
||||||
|
|
Loading…
Reference in New Issue