/* * software RGB to RGB converter * pluralize by software PAL8 to RGB converter * software YUV to YUV converter * software YUV to RGB converter * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * lot of big-endian byte order fixes by Alex Beregszaszi * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include #undef PREFETCH #undef MOVNTQ #undef EMMS #undef SFENCE #undef MMREG_SIZE #undef PAVGB #if COMPILE_TEMPLATE_SSE2 #define MMREG_SIZE 16 #else #define MMREG_SIZE 8 #endif #if COMPILE_TEMPLATE_AMD3DNOW #define PREFETCH "prefetch" #define PAVGB "pavgusb" #elif COMPILE_TEMPLATE_MMX2 #define PREFETCH "prefetchnta" #define PAVGB "pavgb" #else #define PREFETCH " # nop" #endif #if COMPILE_TEMPLATE_AMD3DNOW /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ #define EMMS "femms" #else #define EMMS "emms" #endif #if COMPILE_TEMPLATE_MMX2 #define MOVNTQ "movntq" #define SFENCE "sfence" #else #define MOVNTQ "movq" #define SFENCE " # nop" #endif static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) { uint8_t *dest = dst; const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); mm_end = end - 23; __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "punpckldq 3%1, %%mm0 \n\t" "movd 6%1, %%mm1 \n\t" "punpckldq 9%1, %%mm1 \n\t" "movd 12%1, %%mm2 \n\t" "punpckldq 15%1, %%mm2 \n\t" "movd 18%1, %%mm3 \n\t" "punpckldq 21%1, %%mm3 \n\t" "por %%mm7, %%mm0 \n\t" "por %%mm7, %%mm1 \n\t" "por %%mm7, %%mm2 \n\t" "por %%mm7, %%mm3 \n\t" MOVNTQ" %%mm0, %0 \n\t" MOVNTQ" %%mm1, 8%0 \n\t" MOVNTQ" %%mm2, 16%0 \n\t" MOVNTQ" %%mm3, 24%0" :"=m"(*dest) :"m"(*s) :"memory"); dest += 32; s += 24; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { *dest++ = *s++; *dest++ = *s++; *dest++ = *s++; *dest++ = 255; } } #define STORE_BGR24_MMX \ "psrlq $8, %%mm2 \n\t" \ "psrlq $8, %%mm3 \n\t" \ "psrlq $8, %%mm6 \n\t" \ "psrlq $8, %%mm7 \n\t" \ "pand "MANGLE(mask24l)", %%mm0\n\t" \ "pand "MANGLE(mask24l)", %%mm1\n\t" \ "pand "MANGLE(mask24l)", %%mm4\n\t" \ "pand "MANGLE(mask24l)", %%mm5\n\t" \ "pand "MANGLE(mask24h)", %%mm2\n\t" \ "pand "MANGLE(mask24h)", %%mm3\n\t" \ "pand "MANGLE(mask24h)", %%mm6\n\t" \ "pand "MANGLE(mask24h)", %%mm7\n\t" \ "por %%mm2, %%mm0 \n\t" \ "por %%mm3, %%mm1 \n\t" \ "por %%mm6, %%mm4 \n\t" \ "por %%mm7, %%mm5 \n\t" \ \ "movq %%mm1, %%mm2 \n\t" \ "movq %%mm4, %%mm3 \n\t" \ "psllq $48, %%mm2 \n\t" \ "psllq $32, %%mm3 \n\t" \ "pand "MANGLE(mask24hh)", %%mm2\n\t" \ "pand "MANGLE(mask24hhh)", %%mm3\n\t" \ "por %%mm2, %%mm0 \n\t" \ "psrlq $16, %%mm1 \n\t" \ "psrlq $32, %%mm4 \n\t" \ "psllq $16, %%mm5 \n\t" \ "por %%mm3, %%mm1 \n\t" \ "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \ "por %%mm5, %%mm4 \n\t" \ \ MOVNTQ" %%mm0, %0 \n\t" \ MOVNTQ" %%mm1, 8%0 \n\t" \ MOVNTQ" %%mm4, 16%0" static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) { uint8_t *dest = dst; const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); mm_end = end - 31; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movq %1, %%mm0 \n\t" "movq 8%1, %%mm1 \n\t" "movq 16%1, %%mm4 \n\t" "movq 24%1, %%mm5 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm1, %%mm3 \n\t" "movq %%mm4, %%mm6 \n\t" "movq %%mm5, %%mm7 \n\t" STORE_BGR24_MMX :"=m"(*dest) :"m"(*s) :"memory"); dest += 24; s += 32; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { *dest++ = *s++; *dest++ = *s++; *dest++ = *s++; s++; } } /* original by Strepto/Astral ported to gcc & bugfixed: A'rpi MMX2, 3DNOW optimization by Nick Kurshev 32-bit C version, and and&add trick by Michael Niedermayer */ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) { register const uint8_t* s=src; register uint8_t* d=dst; register const uint8_t *end; const uint8_t *mm_end; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*s)); __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); mm_end = end - 15; while (s>1)&0x7FE07FE0) | (x&0x001F001F); s+=4; d+=4; } if (s < end) { register uint16_t x= *((const uint16_t*)s); *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); } } static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; mm_end = end - 15; #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) __asm__ volatile( "movq %3, %%mm5 \n\t" "movq %4, %%mm6 \n\t" "movq %5, %%mm7 \n\t" "jmp 2f \n\t" ".p2align 4 \n\t" "1: \n\t" PREFETCH" 32(%1) \n\t" "movd (%1), %%mm0 \n\t" "movd 4(%1), %%mm3 \n\t" "punpckldq 8(%1), %%mm0 \n\t" "punpckldq 12(%1), %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm3, %%mm4 \n\t" "pand %%mm6, %%mm0 \n\t" "pand %%mm6, %%mm3 \n\t" "pmaddwd %%mm7, %%mm0 \n\t" "pmaddwd %%mm7, %%mm3 \n\t" "pand %%mm5, %%mm1 \n\t" "pand %%mm5, %%mm4 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "psrld $5, %%mm0 \n\t" "pslld $11, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, (%0) \n\t" "add $16, %1 \n\t" "add $8, %0 \n\t" "2: \n\t" "cmp %2, %1 \n\t" " jb 1b \n\t" : "+r" (d), "+r"(s) : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) ); #else __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_16mask),"m"(green_16mask)); while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 4%1, %%mm3 \n\t" "punpckldq 8%1, %%mm0 \n\t" "punpckldq 12%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psrlq $3, %%mm0 \n\t" "psrlq $3, %%mm3 \n\t" "pand %2, %%mm0 \n\t" "pand %2, %%mm3 \n\t" "psrlq $5, %%mm1 \n\t" "psrlq $5, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $8, %%mm2 \n\t" "psrlq $8, %%mm5 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); d += 4; s += 16; } #endif __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register int rgb = *(const uint32_t*)s; s += 4; *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); } } static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_16mask),"m"(green_16mask)); mm_end = end - 15; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 4%1, %%mm3 \n\t" "punpckldq 8%1, %%mm0 \n\t" "punpckldq 12%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psllq $8, %%mm0 \n\t" "psllq $8, %%mm3 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm3 \n\t" "psrlq $5, %%mm1 \n\t" "psrlq $5, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $19, %%mm2 \n\t" "psrlq $19, %%mm5 \n\t" "pand %2, %%mm2 \n\t" "pand %2, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); d += 4; s += 16; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register int rgb = *(const uint32_t*)s; s += 4; *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); } } static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; mm_end = end - 15; #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) __asm__ volatile( "movq %3, %%mm5 \n\t" "movq %4, %%mm6 \n\t" "movq %5, %%mm7 \n\t" "jmp 2f \n\t" ".p2align 4 \n\t" "1: \n\t" PREFETCH" 32(%1) \n\t" "movd (%1), %%mm0 \n\t" "movd 4(%1), %%mm3 \n\t" "punpckldq 8(%1), %%mm0 \n\t" "punpckldq 12(%1), %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm3, %%mm4 \n\t" "pand %%mm6, %%mm0 \n\t" "pand %%mm6, %%mm3 \n\t" "pmaddwd %%mm7, %%mm0 \n\t" "pmaddwd %%mm7, %%mm3 \n\t" "pand %%mm5, %%mm1 \n\t" "pand %%mm5, %%mm4 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "psrld $6, %%mm0 \n\t" "pslld $10, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, (%0) \n\t" "add $16, %1 \n\t" "add $8, %0 \n\t" "2: \n\t" "cmp %2, %1 \n\t" " jb 1b \n\t" : "+r" (d), "+r"(s) : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) ); #else __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_15mask),"m"(green_15mask)); while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 4%1, %%mm3 \n\t" "punpckldq 8%1, %%mm0 \n\t" "punpckldq 12%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psrlq $3, %%mm0 \n\t" "psrlq $3, %%mm3 \n\t" "pand %2, %%mm0 \n\t" "pand %2, %%mm3 \n\t" "psrlq $6, %%mm1 \n\t" "psrlq $6, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $9, %%mm2 \n\t" "psrlq $9, %%mm5 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); d += 4; s += 16; } #endif __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register int rgb = *(const uint32_t*)s; s += 4; *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); } } static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_15mask),"m"(green_15mask)); mm_end = end - 15; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 4%1, %%mm3 \n\t" "punpckldq 8%1, %%mm0 \n\t" "punpckldq 12%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psllq $7, %%mm0 \n\t" "psllq $7, %%mm3 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm3 \n\t" "psrlq $6, %%mm1 \n\t" "psrlq $6, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $19, %%mm2 \n\t" "psrlq $19, %%mm5 \n\t" "pand %2, %%mm2 \n\t" "pand %2, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); d += 4; s += 16; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register int rgb = *(const uint32_t*)s; s += 4; *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); } } static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_16mask),"m"(green_16mask)); mm_end = end - 11; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 3%1, %%mm3 \n\t" "punpckldq 6%1, %%mm0 \n\t" "punpckldq 9%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psrlq $3, %%mm0 \n\t" "psrlq $3, %%mm3 \n\t" "pand %2, %%mm0 \n\t" "pand %2, %%mm3 \n\t" "psrlq $5, %%mm1 \n\t" "psrlq $5, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $8, %%mm2 \n\t" "psrlq $8, %%mm5 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); d += 4; s += 12; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { const int b = *s++; const int g = *s++; const int r = *s++; *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); } } static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_16mask),"m"(green_16mask)); mm_end = end - 15; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 3%1, %%mm3 \n\t" "punpckldq 6%1, %%mm0 \n\t" "punpckldq 9%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psllq $8, %%mm0 \n\t" "psllq $8, %%mm3 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm3 \n\t" "psrlq $5, %%mm1 \n\t" "psrlq $5, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $19, %%mm2 \n\t" "psrlq $19, %%mm5 \n\t" "pand %2, %%mm2 \n\t" "pand %2, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); d += 4; s += 12; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { const int r = *s++; const int g = *s++; const int b = *s++; *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); } } static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_15mask),"m"(green_15mask)); mm_end = end - 11; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 3%1, %%mm3 \n\t" "punpckldq 6%1, %%mm0 \n\t" "punpckldq 9%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psrlq $3, %%mm0 \n\t" "psrlq $3, %%mm3 \n\t" "pand %2, %%mm0 \n\t" "pand %2, %%mm3 \n\t" "psrlq $6, %%mm1 \n\t" "psrlq $6, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $9, %%mm2 \n\t" "psrlq $9, %%mm5 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); d += 4; s += 12; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { const int b = *s++; const int g = *s++; const int r = *s++; *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); } } static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) { const uint8_t *s = src; const uint8_t *end; const uint8_t *mm_end; uint16_t *d = (uint16_t *)dst; end = s + src_size; __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); __asm__ volatile( "movq %0, %%mm7 \n\t" "movq %1, %%mm6 \n\t" ::"m"(red_15mask),"m"(green_15mask)); mm_end = end - 15; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movd %1, %%mm0 \n\t" "movd 3%1, %%mm3 \n\t" "punpckldq 6%1, %%mm0 \n\t" "punpckldq 9%1, %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm3, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psllq $7, %%mm0 \n\t" "psllq $7, %%mm3 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm3 \n\t" "psrlq $6, %%mm1 \n\t" "psrlq $6, %%mm4 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "psrlq $19, %%mm2 \n\t" "psrlq $19, %%mm5 \n\t" "pand %2, %%mm2 \n\t" "pand %2, %%mm5 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm5, %%mm3 \n\t" "psllq $16, %%mm3 \n\t" "por %%mm3, %%mm0 \n\t" MOVNTQ" %%mm0, %0 \n\t" :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); d += 4; s += 12; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { const int r = *s++; const int g = *s++; const int b = *s++; *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); } } /* I use less accurate approximation here by simply left-shifting the input value and filling the low order bits with zeroes. This method improves PNG compression but this scheme cannot reproduce white exactly, since it does not generate an all-ones maximum value; the net effect is to darken the image slightly. The better method should be "left bit replication": 4 3 2 1 0 --------- 1 1 0 1 1 7 6 5 4 3 2 1 0 ---------------- 1 1 0 1 1 1 1 0 |=======| |===| | leftmost bits repeated to fill open bits | original bits */ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) { const uint16_t *end; const uint16_t *mm_end; uint8_t *d = dst; const uint16_t *s = (const uint16_t*)src; end = s + src_size/2; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); mm_end = end - 7; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movq %1, %%mm0 \n\t" "movq %1, %%mm1 \n\t" "movq %1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $2, %%mm1 \n\t" "psrlq $7, %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "punpcklwd %5, %%mm0 \n\t" "punpcklwd %5, %%mm1 \n\t" "punpcklwd %5, %%mm2 \n\t" "punpckhwd %5, %%mm3 \n\t" "punpckhwd %5, %%mm4 \n\t" "punpckhwd %5, %%mm5 \n\t" "psllq $8, %%mm1 \n\t" "psllq $16, %%mm2 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm2, %%mm0 \n\t" "psllq $8, %%mm4 \n\t" "psllq $16, %%mm5 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm5, %%mm3 \n\t" "movq %%mm0, %%mm6 \n\t" "movq %%mm3, %%mm7 \n\t" "movq 8%1, %%mm0 \n\t" "movq 8%1, %%mm1 \n\t" "movq 8%1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $2, %%mm1 \n\t" "psrlq $7, %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "punpcklwd %5, %%mm0 \n\t" "punpcklwd %5, %%mm1 \n\t" "punpcklwd %5, %%mm2 \n\t" "punpckhwd %5, %%mm3 \n\t" "punpckhwd %5, %%mm4 \n\t" "punpckhwd %5, %%mm5 \n\t" "psllq $8, %%mm1 \n\t" "psllq $16, %%mm2 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm2, %%mm0 \n\t" "psllq $8, %%mm4 \n\t" "psllq $16, %%mm5 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm5, %%mm3 \n\t" :"=m"(*d) :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( "movq %%mm0, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "movq %%mm6, %%mm0 \n\t" "movq %%mm7, %%mm1 \n\t" "movq %%mm4, %%mm6 \n\t" "movq %%mm5, %%mm7 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm1, %%mm3 \n\t" STORE_BGR24_MMX :"=m"(*d) :"m"(*s) :"memory"); d += 24; s += 8; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register uint16_t bgr; bgr = *s++; *d++ = (bgr&0x1F)<<3; *d++ = (bgr&0x3E0)>>2; *d++ = (bgr&0x7C00)>>7; } } static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) { const uint16_t *end; const uint16_t *mm_end; uint8_t *d = (uint8_t *)dst; const uint16_t *s = (const uint16_t *)src; end = s + src_size/2; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); mm_end = end - 7; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movq %1, %%mm0 \n\t" "movq %1, %%mm1 \n\t" "movq %1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $3, %%mm1 \n\t" "psrlq $8, %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "punpcklwd %5, %%mm0 \n\t" "punpcklwd %5, %%mm1 \n\t" "punpcklwd %5, %%mm2 \n\t" "punpckhwd %5, %%mm3 \n\t" "punpckhwd %5, %%mm4 \n\t" "punpckhwd %5, %%mm5 \n\t" "psllq $8, %%mm1 \n\t" "psllq $16, %%mm2 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm2, %%mm0 \n\t" "psllq $8, %%mm4 \n\t" "psllq $16, %%mm5 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm5, %%mm3 \n\t" "movq %%mm0, %%mm6 \n\t" "movq %%mm3, %%mm7 \n\t" "movq 8%1, %%mm0 \n\t" "movq 8%1, %%mm1 \n\t" "movq 8%1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $3, %%mm1 \n\t" "psrlq $8, %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" "punpcklwd %5, %%mm0 \n\t" "punpcklwd %5, %%mm1 \n\t" "punpcklwd %5, %%mm2 \n\t" "punpckhwd %5, %%mm3 \n\t" "punpckhwd %5, %%mm4 \n\t" "punpckhwd %5, %%mm5 \n\t" "psllq $8, %%mm1 \n\t" "psllq $16, %%mm2 \n\t" "por %%mm1, %%mm0 \n\t" "por %%mm2, %%mm0 \n\t" "psllq $8, %%mm4 \n\t" "psllq $16, %%mm5 \n\t" "por %%mm4, %%mm3 \n\t" "por %%mm5, %%mm3 \n\t" :"=m"(*d) :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( "movq %%mm0, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "movq %%mm6, %%mm0 \n\t" "movq %%mm7, %%mm1 \n\t" "movq %%mm4, %%mm6 \n\t" "movq %%mm5, %%mm7 \n\t" "movq %%mm0, %%mm2 \n\t" "movq %%mm1, %%mm3 \n\t" STORE_BGR24_MMX :"=m"(*d) :"m"(*s) :"memory"); d += 24; s += 8; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register uint16_t bgr; bgr = *s++; *d++ = (bgr&0x1F)<<3; *d++ = (bgr&0x7E0)>>3; *d++ = (bgr&0xF800)>>8; } } /* * mm0 = 00 B3 00 B2 00 B1 00 B0 * mm1 = 00 G3 00 G2 00 G1 00 G0 * mm2 = 00 R3 00 R2 00 R1 00 R0 * mm6 = FF FF FF FF FF FF FF FF * mm7 = 00 00 00 00 00 00 00 00 */ #define PACK_RGB32 \ "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ "movq %%mm0, %%mm3 \n\t" \ "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ MOVNTQ" %%mm0, %0 \n\t" \ MOVNTQ" %%mm3, 8%0 \n\t" \ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size) { const uint16_t *end; const uint16_t *mm_end; uint8_t *d = dst; const uint16_t *s = (const uint16_t *)src; end = s + src_size/2; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); mm_end = end - 3; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movq %1, %%mm0 \n\t" "movq %1, %%mm1 \n\t" "movq %1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $2, %%mm1 \n\t" "psrlq $7, %%mm2 \n\t" PACK_RGB32 :"=m"(*d) :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) :"memory"); d += 16; s += 4; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register uint16_t bgr; bgr = *s++; *d++ = (bgr&0x1F)<<3; *d++ = (bgr&0x3E0)>>2; *d++ = (bgr&0x7C00)>>7; *d++ = 255; } } static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size) { const uint16_t *end; const uint16_t *mm_end; uint8_t *d = dst; const uint16_t *s = (const uint16_t*)src; end = s + src_size/2; __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); mm_end = end - 3; while (s < mm_end) { __asm__ volatile( PREFETCH" 32%1 \n\t" "movq %1, %%mm0 \n\t" "movq %1, %%mm1 \n\t" "movq %1, %%mm2 \n\t" "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" "psllq $3, %%mm0 \n\t" "psrlq $3, %%mm1 \n\t" "psrlq $8, %%mm2 \n\t" PACK_RGB32 :"=m"(*d) :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) :"memory"); d += 16; s += 4; } __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); while (s < end) { register uint16_t bgr; bgr = *s++; *d++ = (bgr&0x1F)<<3; *d++ = (bgr&0x7E0)>>3; *d++ = (bgr&0xF800)>>8; *d++ = 255; } } static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size) { x86_reg idx = 15 - src_size; const uint8_t *s = src-idx; uint8_t *d = dst-idx; __asm__ volatile( "test %0, %0 \n\t" "jns 2f \n\t" PREFETCH" (%1, %0) \n\t" "movq %3, %%mm7 \n\t" "pxor %4, %%mm7 \n\t" "movq %%mm7, %%mm6 \n\t" "pxor %5, %%mm7 \n\t" ".p2align 4 \n\t" "1: \n\t" PREFETCH" 32(%1, %0) \n\t" "movq (%1, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" # if COMPILE_TEMPLATE_MMX2 "pshufw $177, %%mm0, %%mm3 \n\t" "pshufw $177, %%mm1, %%mm5 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm6, %%mm3 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm6, %%mm5 \n\t" "por %%mm3, %%mm0 \n\t" "por %%mm5, %%mm1 \n\t" # else "movq %%mm0, %%mm2 \n\t" "movq %%mm1, %%mm4 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm6, %%mm2 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm6, %%mm4 \n\t" "movq %%mm2, %%mm3 \n\t" "movq %%mm4, %%mm5 \n\t" "pslld $16, %%mm2 \n\t" "psrld $16, %%mm3 \n\t" "pslld $16, %%mm4 \n\t" "psrld $16, %%mm5 \n\t" "por %%mm2, %%mm0 \n\t" "por %%mm4, %%mm1 \n\t" "por %%mm3, %%mm0 \n\t" "por %%mm5, %%mm1 \n\t" # endif MOVNTQ" %%mm0, (%2, %0) \n\t" MOVNTQ" %%mm1, 8(%2, %0) \n\t" "add $16, %0 \n\t" "js 1b \n\t" SFENCE" \n\t" EMMS" \n\t" "2: \n\t" : "+&r"(idx) : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) : "memory"); for (; idx<15; idx+=4) { register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; v &= 0xff00ff; *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); } } static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) { unsigned i; x86_reg mmx_size= 23 - src_size; __asm__ volatile ( "test %%"REG_a", %%"REG_a" \n\t" "jns 2f \n\t" "movq "MANGLE(mask24r)", %%mm5 \n\t" "movq "MANGLE(mask24g)", %%mm6 \n\t" "movq "MANGLE(mask24b)", %%mm7 \n\t" ".p2align 4 \n\t" "1: \n\t" PREFETCH" 32(%1, %%"REG_a") \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B "psllq $16, %%mm0 \n\t" // 00 BGR BGR "pand %%mm5, %%mm0 \n\t" "pand %%mm6, %%mm1 \n\t" "pand %%mm7, %%mm2 \n\t" "por %%mm0, %%mm1 \n\t" "por %%mm2, %%mm1 \n\t" "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR "pand %%mm7, %%mm0 \n\t" "pand %%mm5, %%mm1 \n\t" "pand %%mm6, %%mm2 \n\t" "por %%mm0, %%mm1 \n\t" "por %%mm2, %%mm1 \n\t" "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG "pand %%mm6, %%mm0 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm5, %%mm2 \n\t" "por %%mm0, %%mm1 \n\t" "por %%mm2, %%mm1 \n\t" MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" "add $24, %%"REG_a" \n\t" " js 1b \n\t" "2: \n\t" : "+a" (mmx_size) : "r" (src-mmx_size), "r"(dst-mmx_size) ); __asm__ volatile(SFENCE:::"memory"); __asm__ volatile(EMMS:::"memory"); if (mmx_size==23) return; //finished, was multiple of 8 src+= src_size; dst+= src_size; src_size= 23-mmx_size; src-= src_size; dst-= src_size; for (i=0; i>1; for (y=0; y>1; for (y=0; y>1; for (y=0; y>2; dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; } dst[2*srcWidth-1]= src[srcWidth-1]; dst+= dstStride; for (y=1; y>2; dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; #endif for (x=mmxSize-1; x>2; dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; } dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; dst+=dstStride*2; src+=srcStride; } // last line #if 1 dst[0]= src[0]; for (x=0; x>2; dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; } dst[2*srcWidth-1]= src[srcWidth-1]; #else for (x=0; x>1; for (y=0; y>1; for (y=0; y>RGB2YUV_SHIFT) + 16; unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; udst[i] = U; vdst[i] = V; ydst[2*i] = Y; b = src[6*i+3]; g = src[6*i+4]; r = src[6*i+5]; Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; ydst[2*i+1] = Y; } ydst += lumStride; src += srcStride; for (i=0; i>RGB2YUV_SHIFT) + 16; ydst[2*i] = Y; b = src[6*i+3]; g = src[6*i+4]; r = src[6*i+5]; Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; ydst[2*i+1] = Y; } udst += chromStride; vdst += chromStride; ydst += lumStride; src += srcStride; } } static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, long width, long height, long src1Stride, long src2Stride, long dstStride) { long h; for (h=0; h < height; h++) { long w; #if COMPILE_TEMPLATE_SSE2 __asm__( "xor %%"REG_a", %%"REG_a" \n\t" "1: \n\t" PREFETCH" 64(%1, %%"REG_a") \n\t" PREFETCH" 64(%2, %%"REG_a") \n\t" "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" "punpcklbw %%xmm2, %%xmm0 \n\t" "punpckhbw %%xmm2, %%xmm1 \n\t" "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" "add $16, %%"REG_a" \n\t" "cmp %3, %%"REG_a" \n\t" " jb 1b \n\t" ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) : "memory", "%"REG_a"" ); #else __asm__( "xor %%"REG_a", %%"REG_a" \n\t" "1: \n\t" PREFETCH" 64(%1, %%"REG_a") \n\t" PREFETCH" 64(%2, %%"REG_a") \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq 8(%1, %%"REG_a"), %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm4 \n\t" "movq 8(%2, %%"REG_a"), %%mm5 \n\t" "punpcklbw %%mm4, %%mm0 \n\t" "punpckhbw %%mm4, %%mm1 \n\t" "punpcklbw %%mm5, %%mm2 \n\t" "punpckhbw %%mm5, %%mm3 \n\t" MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" "add $16, %%"REG_a" \n\t" "cmp %3, %%"REG_a" \n\t" " jb 1b \n\t" ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) : "memory", "%"REG_a ); #endif for (w= (width&(~15)); w < width; w++) { dest[2*w+0] = src1[w]; dest[2*w+1] = src2[w]; } dest += dstStride; src1 += src1Stride; src2 += src2Stride; } __asm__( EMMS" \n\t" SFENCE" \n\t" ::: "memory" ); } static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, long width, long height, long srcStride1, long srcStride2, long dstStride1, long dstStride2) { x86_reg y; long x,w,h; w=width/2; h=height/2; __asm__ volatile( PREFETCH" %0 \n\t" PREFETCH" %1 \n\t" ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); for (y=0;y>1); uint8_t* d=dst1+dstStride1*y; x=0; for (;x>1); uint8_t* d=dst2+dstStride2*y; x=0; for (;x>2); const uint8_t* vp=src3+srcStride3*(y>>2); uint8_t* d=dst+dstStride*y; x=0; for (;x>1; dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; count++; } } static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) { dst0+= count; dst1+= count; src += 4*count; count= - count; if(count <= -8) { count += 7; __asm__ volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $8, %%mm7 \n\t" "1: \n\t" "movq -28(%1, %0, 4), %%mm0 \n\t" "movq -20(%1, %0, 4), %%mm1 \n\t" "movq -12(%1, %0, 4), %%mm2 \n\t" "movq -4(%1, %0, 4), %%mm3 \n\t" "psrlw $8, %%mm0 \n\t" "psrlw $8, %%mm1 \n\t" "psrlw $8, %%mm2 \n\t" "psrlw $8, %%mm3 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "psrlw $8, %%mm0 \n\t" "psrlw $8, %%mm2 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm7, %%mm3 \n\t" "packuswb %%mm2, %%mm0 \n\t" "packuswb %%mm3, %%mm1 \n\t" MOVNTQ" %%mm0,- 7(%3, %0) \n\t" MOVNTQ" %%mm1,- 7(%2, %0) \n\t" "add $8, %0 \n\t" " js 1b \n\t" : "+r"(count) : "r"(src), "r"(dst0), "r"(dst1) ); count -= 7; } src++; while(count<0) { dst0[count]= src[4*count+0]; dst1[count]= src[4*count+2]; count++; } } static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) { dst0 += count; dst1 += count; src0 += 4*count; src1 += 4*count; count= - count; #ifdef PAVGB if(count <= -8) { count += 7; __asm__ volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $8, %%mm7 \n\t" "1: \n\t" "movq -28(%1, %0, 4), %%mm0 \n\t" "movq -20(%1, %0, 4), %%mm1 \n\t" "movq -12(%1, %0, 4), %%mm2 \n\t" "movq -4(%1, %0, 4), %%mm3 \n\t" PAVGB" -28(%2, %0, 4), %%mm0 \n\t" PAVGB" -20(%2, %0, 4), %%mm1 \n\t" PAVGB" -12(%2, %0, 4), %%mm2 \n\t" PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" "psrlw $8, %%mm0 \n\t" "psrlw $8, %%mm1 \n\t" "psrlw $8, %%mm2 \n\t" "psrlw $8, %%mm3 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "psrlw $8, %%mm0 \n\t" "psrlw $8, %%mm2 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm7, %%mm3 \n\t" "packuswb %%mm2, %%mm0 \n\t" "packuswb %%mm3, %%mm1 \n\t" MOVNTQ" %%mm0,- 7(%4, %0) \n\t" MOVNTQ" %%mm1,- 7(%3, %0) \n\t" "add $8, %0 \n\t" " js 1b \n\t" : "+r"(count) : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) ); count -= 7; } #endif src0++; src1++; while(count<0) { dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; count++; } } static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, long width, long height, long lumStride, long chromStride, long srcStride) { long y; const long chromWidth= -((-width)>>1); for (y=0; y>1); for (y=0; y>1); for (y=0; y>1); for (y=0; y