Added the newest MMX-optimized decore, which speeds up decoding by at least 13% on any CPU.

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1246 b3059339-0415-0410-9bf9-f77b7e298cf2
This commit is contained in:
nick 2001-06-29 17:55:35 +00:00
parent bf8a76c063
commit 2ec6762923
19 changed files with 3162 additions and 2266 deletions

View File

@ -1,8 +1,10 @@
include config.mak
SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
# OBJS = $(SRCS:.c,.s=.o)
CFLAGS = $(OPTFLAGS) $(EXTRA_INC)

View File

@ -9,9 +9,12 @@
unsigned int _CpuID;
unsigned int _i586;
unsigned int _3dnow;
unsigned int _isse;
unsigned int _has_mmx;
extern unsigned long CpuDetect( void );
extern unsigned long ipentium( void );
extern unsigned long isse( void );
extern unsigned long a3dnow( void );
#endif

View File

@ -9,6 +9,7 @@
.globl CpuDetect
.globl ipentium
.globl a3dnow
.globl isse
/ ---------------------------------------------------------------------------
/ in C: unsigned long CpuDetect( void );
@ -45,7 +46,9 @@ exit_cpudetect:
/ ---------------------------------------------------------------------------
/ in C: unsigned long ipentium( void );
/ return: 0 if the processor is not P5 or above else above 1.
/ return: 0 if this processor i386 or i486
/ 1 otherwise
/ 2 if this cpu supports mmx
/ ---------------------------------------------------------------------------
ipentium:
pushl %ebx
@ -63,10 +66,15 @@ ipentium:
jz no_cpuid
movl $1,%eax
cpuid
shrl $8,%eax
cmpl $5,%eax
jb no_cpuid
movl $1,%eax
movl %eax, %ecx
xorl %eax, %eax
shrl $8,%ecx
cmpl $5,%ecx
jb exit
incl %eax
test $0x00800000, %edx
jz exit
incl %eax
jmp exit
no_cpuid:
xorl %eax,%eax
@ -113,3 +121,33 @@ exit2:
popl %edx
popl %ebx
ret
/ ---------------------------------------------------------------------------
/ in C: unsigned long isse( void );
/ return: 0 if this processor does not support sse
/ 1 otherwise
/ 2 if this cpu supports sse2 extension
/ ---------------------------------------------------------------------------
/* isse: report the SSE level of the CPU in EAX (0 = none, 1 = SSE, 2 = SSE2). */
/* EBX, EDX and ECX are saved and restored around the body; CPUID overwrites them. */
isse:
pushl %ebx
pushl %edx
pushl %ecx
/* ipentium returns 0 when the CPU is too old; in that case EAX stays 0 */
/* and CPUID is never executed here. */
call ipentium
testl %eax,%eax
jz exit3
/* CPUID leaf 1: feature flags are returned in EDX. */
movl $1,%eax
cpuid
/* Build the result incrementally: start at 0 and add one per feature found. */
xorl %eax, %eax
/* EDX bit 25 is tested first (SSE per the header comment: result 1). */
testl $0x02000000,%edx
jz exit3
incl %eax
/* EDX bit 26 is only checked when bit 25 was set (SSE2: result 2). */
testl $0x04000000,%edx
jz exit3
incl %eax
exit3:
popl %ecx
popl %edx
popl %ebx
ret

View File

@ -193,7 +193,7 @@ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
MACRO0(v); }
register const real *c = nCOS9;
register const real *c = COS9;
register real *out2 = o2;
register real *w = wintab;
register real *out1 = o1;

File diff suppressed because it is too large Load Diff

1028
mp3lib/dct64_MMX.s Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -105,6 +105,15 @@ static int synth_1to1_r(real *bandPtr,int channel,unsigned char *out,int *pnt)
}
#endif
synth_func_t synth_func;
/*
 * C-side entry point for the MMX synthesis routine.
 *
 * Owns the state that has to survive between calls (a block of 16-bit
 * scratch storage and a rolling offset) and forwards it, together with
 * the caller's arguments, to the assembler implementation
 * synth_1to1_MMX_s.  Always returns 0.
 */
int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
{
/* Persistent across calls; the assembler routine reads and updates both. */
static short history[2][2][0x110];
static int rolling_offset = 1;
/* The assembler side addresses the storage as one flat array of shorts. */
short *flat_history = &history[0][0][0];

synth_1to1_MMX_s(bandPtr, channel, samples, flat_history, &rolling_offset);
return 0;
}
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
{
@ -117,40 +126,13 @@ static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
int clip = 0;
int bo1;
#ifdef HAVE_SSE_MP3
//if ( _3dnow )
if ( synth_func )
{
int ret;
ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
ret=(*synth_func)( bandPtr,channel,samples);
*pnt+=128;
return ret;
}
#endif
#ifdef HAVE_3DNOWEX
if ( _3dnow > 1 )
{
int ret;
ret=synth_1to1_3dnowex( bandPtr,channel,out+*pnt );
*pnt+=128;
return ret;
}
#endif
#ifdef HAVE_3DNOW
if ( _3dnow )
{
int ret;
ret=synth_1to1_3dnow( bandPtr,channel,out+*pnt );
*pnt+=128;
return ret;
}
#endif
if ( _i586 )
{
int ret;
ret=synth_1to1_pent( bandPtr,channel,out+*pnt );
*pnt+=128;
return ret;
}
if(!channel) { /* channel=0 */
bo--;

View File

@ -1,265 +0,0 @@
/ synth_1to1_3dnow works the same way as the c version of
/ synth_1to1. this assembler code based 'decode-i586.s'
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
/ have been made:
/ - use {MMX,3DNow!} instruction for reduce cpu
/ - remove unused(?) local symbols
/
/ useful sources of information on optimizing 3DNow! code include:
/ AMD 3DNow! Technology Manual (Publication #21928)
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
/
/ This code was tested only AMD-K6-2 processor Linux systems,
/ please tell me:
/ - whether this code works on other 3DNow! capable processors
/ (ex.IDT-C6-2) or not
/ - whether this code works on other OSes or not
/
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
/ <kim@comtec.co.jp> - after 1.Apr.1998
/ Enhancments for q-word operation by Michael Hipp
.bss
.comm buffs,4352,4
.data
.align 4
bo:
.long 1
.text
.globl synth_1to1_3dnow
/* synth_1to1_3dnow(bandPtr, channel, out): 3DNow! synthesis window. */
/* Writes 32 16-bit samples, one every 4 bytes starting at out (out+2 when */
/* channel != 0), and returns 0 in EAX. */
synth_1to1_3dnow:
/* 12 bytes of locals plus four saved registers: the arguments are then at */
/* 32(%esp) = bandPtr, 36(%esp) = channel, 40(%esp) = out. */
subl $12,%esp
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
movl 32(%esp),%eax
movl 40(%esp),%esi
/* EDI stays 0 for the whole routine and becomes the return value. */
movl $0,%edi
movl bo,%ebp
cmpl %edi,36(%esp)
jne .L48
/* channel == 0: step the rolling offset bo = (bo - 1) & 15 and use the */
/* first half of buffs. */
decl %ebp
andl $15,%ebp
movl %ebp,bo
movl $buffs,%ecx
jmp .L49
.L48:
/* channel != 0: write to the odd 16-bit slots and use the second half of */
/* buffs (2176 bytes in). */
addl $2,%esi
movl $buffs+2176,%ecx
.L49:
/* The parity of bo decides which 1088-byte half of the channel buffer is */
/* windowed (EBX) and where dct64_3dnow writes its two outputs. */
testl $1,%ebp
je .L50
/* bo odd: window over the lower half; remember bo in the local slot. */
movl %ecx,%ebx
movl %ebp,16(%esp)
pushl %eax
/* The local slot moves 4 bytes further from ESP after every push. */
movl 20(%esp),%edx
leal (%ebx,%edx,4),%eax
pushl %eax
movl 24(%esp),%eax
incl %eax
andl $15,%eax
leal 1088(,%eax,4),%eax
addl %ebx,%eax
jmp .L74
.L50:
/* bo even: window over the upper half; remember bo + 1 in the local slot. */
leal 1088(%ecx),%ebx
leal 1(%ebp),%edx
movl %edx,16(%esp)
pushl %eax
leal 1092(%ecx,%ebp,4),%eax
pushl %eax
leal (%ecx,%ebp,4),%eax
.L74:
/* Three arguments are on the stack (two buffer pointers and bandPtr); */
/* they are removed right after the call. */
pushl %eax
call dct64_3dnow
addl $12,%esp
/* ECX = decwin + 64 bytes - 4 * (saved offset): start of the window. */
movl 16(%esp),%edx
leal 0(,%edx,4),%edx
movl $decwin+64,%eax
movl %eax,%ecx
subl %edx,%ecx
movl $16,%ebp
.L55:
/* Loop 1 (16 samples): multiply 16 window floats by 16 buffer floats, */
/* two at a time, accumulating the pairs in MM4. */
movq (%ecx),%mm4
movq (%ebx),%mm3
movq 8(%ecx),%mm0
movq 8(%ebx),%mm1
pfmul %mm3,%mm4
movq 16(%ecx),%mm2
pfmul %mm1,%mm0
movq 16(%ebx),%mm3
pfadd %mm0,%mm4
movq 24(%ecx),%mm0
pfmul %mm2,%mm3
movq 24(%ebx),%mm1
pfadd %mm3,%mm4
movq 32(%ecx),%mm2
pfmul %mm1,%mm0
movq 32(%ebx),%mm3
pfadd %mm0,%mm4
movq 40(%ecx),%mm0
pfmul %mm2,%mm3
movq 40(%ebx),%mm1
pfadd %mm3,%mm4
movq 48(%ecx),%mm2
pfmul %mm1,%mm0
movq 48(%ebx),%mm3
pfadd %mm0,%mm4
movq 56(%ecx),%mm0
pfmul %mm2,%mm3
movq 56(%ebx),%mm1
pfadd %mm3,%mm4
pfmul %mm1,%mm0
pfadd %mm0,%mm4
/* Result = low-lane sum minus high-lane sum, converted to integer; the */
/* upper 16 bits of that integer are the sample that gets stored. */
movq %mm4,%mm0
psrlq $32,%mm0
pfsub %mm0,%mm4
pf2id %mm4,%mm4
movd %mm4,%eax
sar $16,%eax
movw %ax,(%esi)
/* Advance: buffer by 64 bytes, window by 128 bytes, output by 4 bytes. */
addl $64,%ebx
subl $-128,%ecx
addl $4,%esi
decl %ebp
jnz .L55
/ --- end of loop 1 ---
/* Middle sample: movd loads only one float per 8 bytes, so only every */
/* second window/buffer element contributes. */
movd (%ecx),%mm2
movd (%ebx),%mm1
pfmul %mm1,%mm2
movd 8(%ecx),%mm0
movd 8(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 16(%ecx),%mm0
movd 16(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 24(%ecx),%mm0
movd 24(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 32(%ecx),%mm0
movd 32(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 40(%ecx),%mm0
movd 40(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 48(%ecx),%mm0
movd 48(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
movd 56(%ecx),%mm0
movd 56(%ebx),%mm1
pfmul %mm0,%mm1
pfadd %mm1,%mm2
pf2id %mm2,%mm2
movd %mm2,%eax
sar $16,%eax
movw %ax,(%esi)
/* From here on the buffer pointer walks backwards (64 bytes per sample). */
addl $-64,%ebx
addl $4,%esi
addl $256,%ecx
movl $15,%ebp
.L68:
/* Loop 2 (15 samples): MM0 starts at zero and every product is */
/* subtracted, so the stored value is the negated dot product. */
psubd %mm0,%mm0
movq (%ebx),%mm1
movq (%ecx),%mm2
pfmul %mm1,%mm2
pfsub %mm2,%mm0
movq 8(%ebx),%mm3
movq 8(%ecx),%mm4
pfmul %mm3,%mm4
pfsub %mm4,%mm0
movq 16(%ebx),%mm1
movq 16(%ecx),%mm2
pfmul %mm1,%mm2
pfsub %mm2,%mm0
movq 24(%ebx),%mm3
movq 24(%ecx),%mm4
pfmul %mm3,%mm4
pfsub %mm4,%mm0
movq 32(%ebx),%mm1
movq 32(%ecx),%mm2
pfmul %mm1,%mm2
pfsub %mm2,%mm0
movq 40(%ebx),%mm3
movq 40(%ecx),%mm4
pfmul %mm3,%mm4
pfsub %mm4,%mm0
movq 48(%ebx),%mm1
movq 48(%ecx),%mm2
pfmul %mm1,%mm2
pfsub %mm2,%mm0
movq 56(%ebx),%mm3
movq 56(%ecx),%mm4
pfmul %mm3,%mm4
pfsub %mm4,%mm0
/* pfacc adds the two lanes together before the integer conversion. */
pfacc %mm0,%mm0
pf2id %mm0,%mm0
movd %mm0,%eax
sar $16,%eax
movw %ax,(%esi)
addl $-64,%ebx
subl $-128,%ecx
addl $4,%esi
decl %ebp
jnz .L68
/ --- end of loop 2
/* Leave MMX/3DNow! state, return EDI (still 0) and undo the prologue. */
femms
movl %edi,%eax
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $12,%esp
ret

117
mp3lib/decode_MMX.s Normal file
View File

@ -0,0 +1,117 @@
# this code comes under GPL
# This code was taken from http://www.mpg123.org
# See ChangeLog of mpg123-0.59s-pre.1 for detail
# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
#
# TODO: Partial loops unrolling and removing MOVW insn.
#
.text
.globl synth_1to1_MMX_s
/* synth_1to1_MMX_s(bandPtr, channel, samples, buffs, bo): integer MMX */
/* synthesis window. Writes 32 16-bit samples, one every 4 bytes, starting */
/* at samples + 2*channel bytes. EAX is not set to a defined return value */
/* here; the C wrapper synth_1to1_MMX returns 0 on its own. */
synth_1to1_MMX_s:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
/* After the four pushes: 20(%esp) = bandPtr, 24 = channel, 28 = samples, */
/* 32 = buffs, 36 = bo (pointer to the rolling offset). */
movl 24(%esp),%ecx
movl 28(%esp),%edi
movl $15,%ebx
movl 36(%esp),%edx
/* EDI = samples + 2*channel bytes: each channel owns alternating shorts. */
leal (%edi,%ecx,2),%edi
decl %ecx
movl 32(%esp),%esi
movl (%edx),%eax
/* ECX is channel - 1 here: channel 1 skips the update below. Otherwise */
/* *bo becomes (*bo - 1) & 15 and the buffer base moves 1088 bytes on. */
jecxz .L1
decl %eax
andl %ebx,%eax
leal 1088(%esi),%esi
movl %eax,(%edx)
.L1:
/* EDX and ECX become the two output pointers for the DCT, one in each */
/* 544-byte half of this channel's buffer; EBP keeps the current offset. */
leal (%esi,%eax,2),%edx
movl %eax,%ebp
incl %eax
/* First DCT argument pushed: bandPtr (still at 20(%esp) before this push). */
pushl 20(%esp)
andl %ebx,%eax
leal 544(%esi,%eax,2),%ecx
/* EBX becomes 16; it is used below for the loop count and window offset. */
incl %ebx
/* Parity of (offset + 1) & 15 selects the half layout: when it is even, */
/* the two pointers are swapped and the window runs over the upper half. */
testl $1, %eax
jnz .L2
xchgl %edx,%ecx
incl %ebp
leal 544(%esi),%esi
.L2:
emms
pushl %edx
pushl %ecx
/* Indirect call through the dct64_MMX_func function pointer. */
call *dct64_MMX_func
addl $12,%esp
/* ECX = 17 iterations for the first loop; EDX = decwins + 2*(16 - EBP). */
leal 1(%ebx), %ecx
subl %ebp,%ebx
leal decwins(%ebx,%ebx,1), %edx
.L3:
/* Loop 1 (17 samples): 16 window shorts times 16 buffer shorts via four */
/* pmaddwd, summed into MM0. */
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
/* Fold the high dword into the low one, scale down by 2^13 and saturate */
/* to 16 bits before storing. */
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm1,%mm0
psrad $13,%mm0
packssdw %mm0,%mm0
movd %mm0,%eax
movw %ax, (%edi)
/* Advance: buffer by 32 bytes, window by 64 bytes, output by 4 bytes. */
leal 32(%esi),%esi
leal 64(%edx),%edx
leal 4(%edi),%edi
decl %ecx
jnz .L3
/* Second half: step the buffer pointer back and walk it downwards. */
subl $64,%esi
movl $15,%ecx
.L4:
/* Loop 2 (15 samples): same dot product, but the result is negated. */
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm0,%mm1
psrad $13,%mm1
packssdw %mm1,%mm1
/* MM0 = 0 - MM1 using a saturating 16-bit subtract. */
psubd %mm0,%mm0
psubsw %mm1,%mm0
movd %mm0,%eax
movw %ax,(%edi)
subl $32,%esi
addl $64,%edx
leal 4(%edi),%edi
decl %ecx
jnz .L4
/* Clear MMX state so the FPU is usable again, then restore registers. */
emms
popl %ebx
popl %esi
popl %edi
popl %ebp
ret

View File

@ -1,364 +0,0 @@
///
/// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support
///
/// This code based 'decode_3dnow.s' by Syuuhei Kashiyama
/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
///
/// - Added new opcode PFNACC
/// - decreased number of opcodes (as it was suggested by k7 manual)
/// (using memory reference as operand of instructions)
/// - added PREFETCHW opcode. It has different semantic on k7 than on k6-2
/// and saves 15-25 cpu clocks for athlon.
/// - partial unrolling loops for removing slower MOVW insns.
/// (Note: probably same operation should be done for decode_3dnow.s)
/// - change function name for support 3DNowEx! automatic detect
/// - added loops alignment
///
/// note: because K7 processors are an aggresive out-of-order three-way
/// superscalar ones instruction order is not significand for them.
///
/// Benchmark: measured by mplayer on Duron-700:
/// 3dNow! optimized code - 1.4% of cpu usage
/// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage
/// k7 optimized code - 1.1% of cpu usage
/// Note: K6-2 users have an chance with partial loops unrolling
///
/// Modified by Nick Kurshev <nickols_k@mail.ru>
///
/ synth_1to1_3dnow works the same way as the c version of
/ synth_1to1. this assembler code based 'decode-i586.s'
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
/ have been made:
/ - use {MMX,3DNow!} instruction for reduce cpu
/ - remove unused(?) local symbols
/
/ useful sources of information on optimizing 3DNow! code include:
/ AMD 3DNow! Technology Manual (Publication #21928)
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
/
/ This code was tested only AMD-K6-2 processor Linux systems,
/ please tell me:
/ - whether this code works on other 3DNow! capable processors
/ (ex.IDT-C6-2) or not
/ - whether this code works on other OSes or not
/
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
/ <kim@comtec.co.jp> - after 1.Apr.1998
/ Enhancments for q-word operation by Michael Hipp
.bss
.comm buffs,4352,4
.data
.align 8
null_one: .long 0x0000ffff, 0x0000ffff
one_null: .long 0xffff0000, 0xffff0000
bo: .long 1
.text
/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
.globl synth_1to1_3dnowex
/* synth_1to1_3dnowex(bandPtr, channel, out): extended-3DNow! synthesis */
/* window. Same buffer set-up as the plain 3DNow! version, but the two main */
/* loops are unrolled to produce two samples per iteration and store them */
/* with one 64-bit read-modify-write. Returns 0 in EAX. */
synth_1to1_3dnowex:
/* 12 bytes of locals plus four saved registers: the arguments are then at */
/* 32(%esp) = bandPtr, 36(%esp) = channel, 40(%esp) = out. */
subl $12,%esp
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
movl 32(%esp),%eax
movl 40(%esp),%esi
/* EDI stays 0 for the whole routine and becomes the return value. */
movl $0,%edi
movl bo,%ebp
cmpl %edi,36(%esp)
jne .L48
/* channel == 0: step bo = (bo - 1) & 15, use the first half of buffs. */
decl %ebp
andl $15,%ebp
movl %ebp,bo
movl $buffs,%ecx
jmp .L49
.L48:
/* channel != 0: odd 16-bit output slots, second half of buffs. */
addl $2,%esi
movl $buffs+2176,%ecx
.L49:
/* Parity of bo picks the windowed half (EBX) and the DCT output pointers. */
testl $1,%ebp
je .L50
movl %ecx,%ebx
movl %ebp,16(%esp)
pushl %eax
movl 20(%esp),%edx
leal (%ebx,%edx,4),%eax
pushl %eax
movl 24(%esp),%eax
incl %eax
andl $15,%eax
leal 1088(,%eax,4),%eax
addl %ebx,%eax
jmp .L74
.L50:
leal 1088(%ecx),%ebx
leal 1(%ebp),%edx
movl %edx,16(%esp)
pushl %eax
leal 1092(%ecx,%ebp,4),%eax
pushl %eax
leal (%ecx,%ebp,4),%eax
.L74:
pushl %eax
/* NOTE(review): no "addl $12,%esp" follows this call, unlike the plain */
/* 3DNow! and SSE variants. Presumably dct64_3dnowex removes its own three */
/* arguments; otherwise 16(%esp) below and the epilogue pops would be off. */
/* Confirm against dct64_3dnowex. */
call dct64_3dnowex
/* ECX = decwin + 64 bytes - 4 * (saved offset): start of the window. */
movl 16(%esp),%edx
leal 0(,%edx,4),%edx
movl $decwin+64,%eax
movl %eax,%ecx
subl %edx,%ecx
movl $8,%ebp
prefetchw (%esi)
.align 16
.L55:
/* Loop 1 (8 iterations, 2 samples each): MM0 accumulates the row at */
/* ECX/EBX, MM4 the following row at ECX+128 / EBX+64. */
movq (%ecx),%mm0
pfmul (%ebx),%mm0
movq 128(%ecx),%mm4
pfmul 64(%ebx),%mm4
movq 8(%ecx),%mm1
pfmul 8(%ebx),%mm1
pfadd %mm1,%mm0
movq 136(%ecx),%mm5
pfmul 72(%ebx),%mm5
pfadd %mm5,%mm4
movq 16(%ebx),%mm2
pfmul 16(%ecx),%mm2
pfadd %mm2,%mm0
movq 80(%ebx),%mm6
pfmul 144(%ecx),%mm6
pfadd %mm6,%mm4
movq 24(%ecx),%mm3
pfmul 24(%ebx),%mm3
pfadd %mm3,%mm0
movq 152(%ecx),%mm7
pfmul 88(%ebx),%mm7
pfadd %mm7,%mm4
movq 32(%ebx),%mm1
pfmul 32(%ecx),%mm1
pfadd %mm1,%mm0
movq 96(%ebx),%mm5
pfmul 160(%ecx),%mm5
pfadd %mm5,%mm4
movq 40(%ecx),%mm2
pfmul 40(%ebx),%mm2
pfadd %mm2,%mm0
movq 168(%ecx),%mm6
pfmul 104(%ebx),%mm6
pfadd %mm6,%mm4
movq 48(%ebx),%mm3
pfmul 48(%ecx),%mm3
pfadd %mm3,%mm0
movq 112(%ebx),%mm7
pfmul 176(%ecx),%mm7
pfadd %mm7,%mm4
movq 56(%ecx),%mm1
pfmul 56(%ebx),%mm1
pfadd %mm1,%mm0
movq 184(%ecx),%mm5
pfmul 120(%ebx),%mm5
pfadd %mm5,%mm4
/* pfnacc packs (low - high) of MM0 and of MM4 into the two lanes of MM0. */
pfnacc %mm4, %mm0
/* Merge into the output: keep the upper 16 bits of each existing dword */
/* at (%esi) and replace the lower 16 bits with the new sample (the */
/* upper half of each converted integer). */
movq (%esi), %mm1
pf2id %mm0, %mm0
pand one_null, %mm1
psrld $16,%mm0
pand null_one, %mm0
por %mm0, %mm1
movq %mm1,(%esi)
/* Two rows consumed: buffer +128 bytes, window +256, output +8. */
addl $128,%ebx
addl $256,%ecx
addl $8,%esi
decl %ebp
jnz .L55
/ --- end of loop 1 ---
prefetchw (%esi) /* prefetching for writing this block and next loop */
/* Middle sample: movd loads only one float per 8 bytes, so only every */
/* second window/buffer element contributes. */
movd (%ecx),%mm0
pfmul (%ebx),%mm0
movd 8(%ebx),%mm1
pfmul 8(%ecx),%mm1
pfadd %mm1,%mm0
movd 16(%ebx),%mm2
pfmul 16(%ecx),%mm2
pfadd %mm2,%mm0
movd 24(%ebx),%mm3
pfmul 24(%ecx),%mm3
pfadd %mm3,%mm0
movd 32(%ebx),%mm4
pfmul 32(%ecx),%mm4
pfadd %mm4,%mm0
movd 40(%ebx),%mm5
pfmul 40(%ecx),%mm5
pfadd %mm5,%mm0
movd 48(%ebx),%mm6
pfmul 48(%ecx),%mm6
pfadd %mm6,%mm0
movd 56(%ebx),%mm7
pfmul 56(%ecx),%mm7
pfadd %mm7,%mm0
pf2id %mm0,%mm0
movd %mm0,%eax
sar $16,%eax
movw %ax,(%esi)
/* From here on the buffer pointer walks backwards. */
subl $64,%ebx
addl $4,%esi
addl $256,%ecx
movl $7,%ebp
.align 16
.L68:
/* Loop 2 (7 iterations, 2 samples each): both accumulators start at zero */
/* and every product is subtracted; the second row is read from the 64 */
/* bytes below EBX (negative offsets). */
pxor %mm0, %mm0
pxor %mm4, %mm4
movq (%ecx),%mm1
pfmul (%ebx),%mm1
pfsub %mm1,%mm0
movq 128(%ecx),%mm5
pfmul -64(%ebx),%mm5
pfsub %mm5,%mm4
movq 8(%ecx),%mm2
pfmul 8(%ebx),%mm2
pfsub %mm2,%mm0
movq 136(%ecx),%mm6
pfmul -56(%ebx),%mm6
pfsub %mm6,%mm4
movq 16(%ecx),%mm3
pfmul 16(%ebx),%mm3
pfsub %mm3,%mm0
movq 144(%ecx),%mm7
pfmul -48(%ebx),%mm7
pfsub %mm7,%mm4
movq 24(%ecx),%mm1
pfmul 24(%ebx),%mm1
pfsub %mm1,%mm0
movq 152(%ecx),%mm5
pfmul -40(%ebx),%mm5
pfsub %mm5,%mm4
movq 32(%ecx),%mm2
pfmul 32(%ebx),%mm2
pfsub %mm2,%mm0
movq 160(%ecx),%mm6
pfmul -32(%ebx),%mm6
pfsub %mm6,%mm4
movq 40(%ecx),%mm3
pfmul 40(%ebx),%mm3
pfsub %mm3,%mm0
movq 168(%ecx),%mm7
pfmul -24(%ebx),%mm7
pfsub %mm7,%mm4
movq 48(%ecx),%mm1
pfmul 48(%ebx),%mm1
pfsub %mm1,%mm0
movq 176(%ecx),%mm5
pfmul -16(%ebx),%mm5
pfsub %mm5,%mm4
movq 56(%ecx),%mm2
pfmul 56(%ebx),%mm2
pfsub %mm2,%mm0
movq 184(%ecx),%mm6
pfmul -8(%ebx),%mm6
pfsub %mm6,%mm4
/* pfacc packs (low + high) of MM0 and of MM4 into the two lanes of MM0. */
pfacc %mm4,%mm0
/* Same masked 64-bit merge into the output as in loop 1. */
movq (%esi), %mm1
pf2id %mm0, %mm0
pand one_null, %mm1
psrld $16,%mm0
pand null_one, %mm0
por %mm0, %mm1
movq %mm1,(%esi)
subl $128,%ebx
addl $256,%ecx
addl $8,%esi
decl %ebp
jnz .L68
/ --- end of loop 2
/* Tail: the unrolled loop covers 14 samples; this computes one more */
/* negated dot product and stores it as a single 16-bit word. */
pxor %mm0, %mm0
movq (%ecx),%mm1
pfmul (%ebx),%mm1
pfsub %mm1,%mm0
movq 8(%ecx),%mm2
pfmul 8(%ebx),%mm2
pfsub %mm2,%mm0
movq 16(%ecx),%mm3
pfmul 16(%ebx),%mm3
pfsub %mm3,%mm0
movq 24(%ecx),%mm4
pfmul 24(%ebx),%mm4
pfsub %mm4,%mm0
movq 32(%ecx),%mm5
pfmul 32(%ebx),%mm5
pfsub %mm5,%mm0
movq 40(%ecx),%mm6
pfmul 40(%ebx),%mm6
pfsub %mm6,%mm0
movq 48(%ecx),%mm7
pfmul 48(%ebx),%mm7
pfsub %mm7,%mm0
movq 56(%ecx),%mm1
pfmul 56(%ebx),%mm1
pfsub %mm1,%mm0
pfacc %mm0,%mm0
pf2id %mm0,%mm0
movd %mm0,%eax
sar $16,%eax
movw %ax,(%esi)
/* Leave MMX/3DNow! state, return EDI (still 0) and undo the prologue. */
femms
movl %edi,%eax
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $12,%esp
ret

View File

@ -1,201 +0,0 @@
///
/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
///
/// This code based 'decode_k7.s' by Nick Kurshev
/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
///
/// - SSE optimization
/// - change function name for support SSE automatic detect
///
/// Modified by Nick Kurshev <nickols_k@mail.ru>
///
/ synth_1to1_3dnow works the same way as the c version of
/ synth_1to1. this assembler code based 'decode-i586.s'
/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
/ have been made:
/ - use {MMX,3DNow!} instruction for reduce cpu
/ - remove unused(?) local symbols
/
/ useful sources of information on optimizing 3DNow! code include:
/ AMD 3DNow! Technology Manual (Publication #21928)
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
/
/ This code was tested only AMD-K6-2 processor Linux systems,
/ please tell me:
/ - whether this code works on other 3DNow! capable processors
/ (ex.IDT-C6-2) or not
/ - whether this code works on other OSes or not
/
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
/ <kim@comtec.co.jp> - after 1.Apr.1998
/ Enhancments for q-word operation by Michael Hipp
.bss
.comm buffs,4352,4
.data
.align 4
bo:
.long 1
.text
/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
.globl synth_1to1_sse
/* synth_1to1_sse(bandPtr, channel, out): SSE synthesis window. Same buffer */
/* set-up as the 3DNow! versions, but it calls the generic dct64 and does */
/* the dot products four floats at a time in XMM registers. Returns 0. */
synth_1to1_sse:
/* 12 bytes of locals plus four saved registers: the arguments are then at */
/* 32(%esp) = bandPtr, 36(%esp) = channel, 40(%esp) = out. */
subl $12,%esp
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
movl 32(%esp),%eax
movl 40(%esp),%esi
/* EDI stays 0 for the whole routine and becomes the return value. */
movl $0,%edi
movl bo,%ebp
cmpl %edi,36(%esp)
jne .L48
/* channel == 0: step bo = (bo - 1) & 15, use the first half of buffs. */
decl %ebp
andl $15,%ebp
movl %ebp,bo
movl $buffs,%ecx
jmp .L49
.L48:
/* channel != 0: odd 16-bit output slots, second half of buffs. */
addl $2,%esi
movl $buffs+2176,%ecx
.L49:
/* Parity of bo picks the windowed half (EBX) and the DCT output pointers. */
testl $1,%ebp
je .L50
movl %ecx,%ebx
movl %ebp,16(%esp)
pushl %eax
movl 20(%esp),%edx
leal (%ebx,%edx,4),%eax
pushl %eax
movl 24(%esp),%eax
incl %eax
andl $15,%eax
leal 1088(,%eax,4),%eax
addl %ebx,%eax
jmp .L74
.L50:
leal 1088(%ecx),%ebx
leal 1(%ebp),%edx
movl %edx,16(%esp)
pushl %eax
leal 1092(%ecx,%ebp,4),%eax
pushl %eax
leal (%ecx,%ebp,4),%eax
.L74:
/* Three arguments are on the stack; they are removed after the call. */
pushl %eax
call dct64
addl $12, %esp
/* ECX = decwin + 64 bytes - 4 * (saved offset): start of the window. */
movl 16(%esp),%edx
leal 0(,%edx,4),%edx
movl $decwin+64,%eax
movl %eax,%ecx
subl %edx,%ecx
movl $16,%ebp
.L55:
/* Loop 1 (16 samples): 16 window floats times 16 buffer floats, four */
/* lanes at a time, accumulated in XMM0. */
movups (%ecx), %xmm0
mulps (%ebx), %xmm0
movups 16(%ecx), %xmm1
mulps 16(%ebx), %xmm1
addps %xmm1, %xmm0
movups 32(%ecx), %xmm1
mulps 32(%ebx), %xmm1
addps %xmm1, %xmm0
movups 48(%ecx), %xmm1
mulps 48(%ebx), %xmm1
addps %xmm1, %xmm0
/* pfnacc -> PFNACC mmreg1, mmreg2 performs the following operations: */
/* temp = mmreg2 */
/* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */
/* mmreg1[63:32]= temp [31:0] - temp[63:32] */
/* save difference of mmreg1's low-word and high-word into mmreg1's low-word */
/* save difference of mmreg2's low-word and high-word into mmreg1's high-word */
/* Fold the upper two lanes onto the lower two, then lane0 - lane1. */
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
movaps %xmm0, %xmm1
shufps $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */
subss %xmm1, %xmm0
/* The low 16 bits of the converted integer are stored; the shift used by */
/* the 3DNow! versions is commented out here. */
cvtss2si %xmm0, %eax
/ sar $16,%eax
movw %ax,(%esi)
/* Advance: buffer by 64 bytes, window by 128 bytes, output by 4 bytes. */
addl $64,%ebx
subl $-128,%ecx
addl $4,%esi
decl %ebp
jnz .L55
/ --- end of loop 1 ---
/* Middle sample: full 16-element products, then only lane 0 + lane 2 are */
/* added (addss after movhlps). */
movups (%ecx), %xmm0
mulps (%ebx), %xmm0
movups 16(%ecx), %xmm1
mulps 16(%ebx), %xmm1
addps %xmm1, %xmm0
movups 32(%ecx), %xmm1
mulps 32(%ebx), %xmm1
addps %xmm1, %xmm0
movups 48(%ecx), %xmm1
mulps 48(%ebx), %xmm1
addps %xmm1, %xmm0
movhlps %xmm0, %xmm1
addss %xmm1, %xmm0
cvtss2si %xmm0, %eax
/ sar $16,%eax
movw %ax,(%esi)
/* From here on the buffer pointer walks backwards (64 bytes per sample). */
addl $-64,%ebx
addl $4,%esi
addl $256,%ecx
movl $15,%ebp
.L68:
/* Loop 2 (15 samples): XMM0 starts at zero and every product is */
/* subtracted from it. */
xorps %xmm0, %xmm0
movups (%ecx), %xmm1
mulps (%ebx), %xmm1
subps %xmm1, %xmm0
movups 16(%ecx), %xmm1
mulps 16(%ebx), %xmm1
subps %xmm1, %xmm0
movups 32(%ecx), %xmm1
mulps 32(%ebx), %xmm1
subps %xmm1, %xmm0
movups 48(%ecx), %xmm1
mulps 48(%ebx), %xmm1
subps %xmm1, %xmm0
movhlps %xmm0, %xmm1
subps %xmm1, %xmm0
movaps %xmm0, %xmm1
shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */
addss %xmm1, %xmm0
cvtss2si %xmm0, %eax
/ sar $16,%eax
movw %ax,(%esi)
addl $-64,%ebx
subl $-128,%ecx
addl $4,%esi
decl %ebp
jnz .L68
/ --- end of loop 2
/* Return EDI (still 0) and undo the prologue. */
movl %edi,%eax
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $12,%esp
ret

View File

@ -50,8 +50,16 @@ static void init_layer2(void)
{
double m=mulmul[k];
table = muls[k];
if(_has_mmx)
{
for(j=3,i=0;i<63;i++,j--)
*table++ = 16384 * m * pow(2.0,(double) j / 3.0);
}
else
for(j=3,i=0;i<63;i++,j--)
{
*table++ = m * pow(2.0,(double) j / 3.0);
}
*table++ = 0.0;
}
}

View File

@ -22,9 +22,9 @@ static real win1[4][36];
#define GP2MAX (256+118+4)
static real gainpow2[GP2MAX];
static real nCOS9[9];
real COS9[9];
static real COS6_1,COS6_2;
static real tfcos36[9];
real tfcos36[9];
static real tfcos12[3];
#ifdef NEW_DCT9
static real cos9[3],cos18[3];
@ -111,8 +111,12 @@ void init_layer3(int down_sample_sblimit)
int i,j,k,l;
for(i=-256;i<118+4;i++)
gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
{
if(_has_mmx)
gainpow2[i+256] = 16384.0 * pow((double)2.0,-0.25 * (double) (i+210) );
else
gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
}
for(i=0;i<8207;i++)
ispow[i] = pow((double)i,(double)4.0/3.0);
@ -139,7 +143,7 @@ void init_layer3(int down_sample_sblimit)
}
for(i=0;i<9;i++)
nCOS9[i] = cos( M_PI / 18.0 * (double) i);
COS9[i] = cos( M_PI / 18.0 * (double) i);
for(i=0;i<9;i++)
tfcos36[i] = 0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 );
@ -1533,6 +1537,9 @@ static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info)
/*
* III_hybrid
*/
dct36_func_t dct36_func;
static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
int ch,struct gr_info_s *gr_info)
{
@ -1553,8 +1560,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
if(gr_info->mixed_block_flag) {
sb = 2;
dct36(fsIn[0],rawout1,rawout2,win[0],tspnt);
dct36(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
(*dct36_func)(fsIn[0],rawout1,rawout2,win[0],tspnt);
(*dct36_func)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
rawout1 += 36; rawout2 += 36; tspnt += 2;
}
@ -1567,8 +1574,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
}
else {
for (; sb<gr_info->maxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) {
dct36(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
dct36(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
(*dct36_func)(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
(*dct36_func)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
}
}

View File

@ -104,33 +104,22 @@ struct III_sideinfo
};
static long freqs[9];
#ifdef HAVE_3DNOW
real decwin[2*(512+32)];
#else
real decwin[512+32];
#endif
real *pnts[];
extern real decwin[(512+32)];
extern real *pnts[];
static int do_layer2(struct frame *fr,int single);
static int do_layer3(struct frame *fr,int single);
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
extern int synth_1to1_pent( real *,int,unsigned char * );
extern int synth_1to1_pent( real *,int,short * );
extern void make_decode_tables_MMX(long scaleval);
extern int synth_1to1_MMX( real *,int,short * );
extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
extern void dct64(real *a,real *b,real *c);
#ifdef HAVE_3DNOW
extern void dct64_3dnow( real *,real *, real * );
extern void dct36_3dnow(real *,real *,real *,real *,real *);
extern int synth_1to1_3dnow( real *,int,unsigned char * );
#endif
#ifdef HAVE_3DNOWEX
extern void dct64_3dnowex( real *,real *, real * );
extern void dct36_3dnowex(real *,real *,real *,real *,real *);
extern int synth_1to1_3dnowex( real *,int,unsigned char * );
#endif
#ifdef HAVE_SSE_MP3
// extern void dct64_3dnow( real *,real *, real * );
// extern void dct36_3dnow(real *,real *,real *,real *,real *);
extern int synth_1to1_sse( real *,int,unsigned char * );
#endif
extern void dct36_3dnow(real *,real *,real *,real *,real *);
extern void dct36_3dnowex(real *,real *,real *,real *,real *);
extern void dct36_sse(real *,real *,real *,real *,real *);
typedef int (*synth_func_t)( real *,int,short * );
typedef void (*dct36_func_t)(real *,real *,real *,real *,real *);

View File

@ -343,6 +343,12 @@ retry1:
static int tables_done_flag=0;
/* It's hidden from gcc in assembler */
extern void dct64_MMX( void );
extern void dct64_MMX_3dnow( void );
extern void dct64_MMX_3dnowex( void );
void (*dct64_MMX_func)( void );
// Init decoder tables. Call first, once!
#ifdef USE_FAKE_MONO
void MP3_Init(int fakemono){
@ -351,20 +357,41 @@ void MP3_Init(){
#endif
_CpuID=CpuDetect();
_i586=ipentium();
#ifdef HAVE_3DNOW
#ifndef HAVE_MMX
_i586 &= 1;
#endif
_3dnow=a3dnow();
#ifndef HAVE_3DNOW
_3dnow = 0;
#endif
#ifndef HAVE_3DNOWEX
_3dnow &= 1;
#endif
_isse=isse();
#ifndef HAVE_SSE
_isse = 0;
#endif
#ifndef HAVE_SSE2
_isse &= 1;
#endif
_has_mmx=_i586>1||_3dnow||_isse;
printf( "mp3lib: Processor ID: %x\n",_CpuID );
printf( "mp3lib: i586 processor %sdetected.\n",(_i586?"":"not ") );
#ifdef HAVE_3DNOW
printf( "mp3lib: AMD 3dnow! extension %sdetected.\n",(_3dnow?"":"not ") );
#endif
#ifdef HAVE_3DNOWEX
printf( "mp3lib: AMD 3dnow-dsp! extension %sdetected.\n",(_3dnow>1?"":"not ") );
#endif
if(_i586&&!_3dnow&&!_isse)
printf( "mp3lib: Using Pentium%s optimized decore.\n",(_i586>1?"-MMX":""));
else
if(_isse)
/*
Note: This is OK, since K8 will have SSE2 support and will be much faster
than P4 ;)
*/
printf( "mp3lib: Using SSE%s! optimized decore.\n",(_isse>1?"2":""));
else
if(_3dnow)
printf( "mp3lib: Using AMD 3dnow%s! optimized decore.\n",(_3dnow>1?"-dsp(k7)":""));
make_decode_tables(outscale);
/* Use it for any MMX cpu */
if(_has_mmx) make_decode_tables_MMX(outscale);
else make_decode_tables(outscale);
#ifdef USE_FAKE_MONO
if (fakemono == 1)
fr.synth=synth_1to1_l;
@ -381,6 +408,42 @@ void MP3_Init(){
init_layer2();
init_layer3(fr.down_sample_sblimit);
tables_done_flag=1;
dct36_func=dct36;
if(_isse)
{
synth_func=synth_1to1_MMX;
dct64_MMX_func=dct64_MMX;
}
else
if ( _3dnow > 1 )
{
synth_func=synth_1to1_MMX;
dct36_func=dct36_3dnowex;
dct64_MMX_func=dct64_MMX_3dnowex;
}
else
if ( _3dnow )
{
synth_func=synth_1to1_MMX;
dct36_func=dct36_3dnow;
dct64_MMX_func=dct64_MMX_3dnow;
}
else
if ( _i586 > 1)
{
synth_func=synth_1to1_MMX;
dct64_MMX_func=dct64_MMX;
}
else
if ( _i586 )
{
synth_func=synth_1to1_pent;
}
else
{
synth_func = NULL;
}
}
#if 0

View File

@ -1,20 +1,7 @@
real decwin[(512+32)], cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
#ifdef HAVE_3DNOW
real decwin[2*(512+32)] __attribute__((aligned(8)));
real cos64[32] __attribute__((aligned(8)));
real cos32[16] __attribute__((aligned(8)));
real cos16[8] __attribute__((aligned(8)));
real cos8[4] __attribute__((aligned(8)));
real cos4[2] __attribute__((aligned(8)));
real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
#else
real decwin[512+32];
real cos64[16],cos32[8],cos16[4],cos8[2],cos4[1];
real *pnts[] = { cos64,cos32,cos16,cos8,cos4 };
#endif
long intwinbase[] = {
static long intwinbase[] = {
0, -1, -1, -1, -1, -1, -1, -2, -2, -2,
-2, -3, -3, -4, -4, -5, -5, -6, -7, -7,
-8, -9, -10, -11, -13, -14, -16, -17, -19, -21,
@ -42,7 +29,7 @@ long intwinbase[] = {
64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
73415, 73908, 74313, 74630, 74856, 74992, 75038 };
void make_decode_tables(long scaleval)
void make_decode_tables(long scaleval)
{
int i,j,k,kr,divv;
real *table,*costab;
@ -53,17 +40,13 @@ long intwinbase[] = {
kr=0x10>>i; divv=0x40>>i;
costab = pnts[i];
for(k=0;k<kr;k++) costab[k] = 1.0 / (2.0 * cos(M_PI * ((double) k * 2.0 + 1.0) / (double) divv));
#ifdef HAVE_3DNOW
if ( _3dnow ) for(k=0;k<kr;k++) costab[k+kr]=-costab[k];
#endif
}
table = decwin;
scaleval = -scaleval;
for(i=0,j=0;i<256;i++,j++,table+=32)
{
if(table < decwin+512+16)
if(table < decwin+512+16)
table[16] = table[0] = (double) intwinbase[j] / 65536.0 * (double) scaleval;
if(i % 32 == 31)
table -= 1023;
@ -80,14 +63,6 @@ long intwinbase[] = {
if(i % 64 == 63)
scaleval = - scaleval;
}
#ifdef HAVE_3DNOW
if ( _3dnow )
for(i=0;i<512+32;i++)
{
decwin[512+31-i]*=65536.0; // allows faster clipping in 3dnow code
decwin[512+32+i]=decwin[512+31-i];
}
#endif
}

161
mp3lib/tabinit_MMX.s Normal file
View File

@ -0,0 +1,161 @@
# This code was taken from http://www.mpg123.org
# See ChangeLog of mpg123-0.59s-pre.1 for detail
# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
.bss
.align 8
.comm decwin,2176,32
.align 8
.comm decwins,2176,32
.data
.align 8
intwinbase_MMX:
.value 0, -1, -1, -1, -1, -1, -1, -2
.value -2, -2, -2, -3, -3, -4, -4, -5
.value -5, -6, -7, -7, -8, -9, -10, -11
.value -13, -14, -16, -17, -19, -21, -24, -26
.value -29, -31, -35, -38, -41, -45, -49, -53
.value -58, -63, -68, -73, -79, -85, -91, -97
.value -104, -111, -117, -125, -132, -139, -147, -154
.value -161, -169, -176, -183, -190, -196, -202, -208
.value -213, -218, -222, -225, -227, -228, -228, -227
.value -224, -221, -215, -208, -200, -189, -177, -163
.value -146, -127, -106, -83, -57, -29, 2, 36
.value 72, 111, 153, 197, 244, 294, 347, 401
.value 459, 519, 581, 645, 711, 779, 848, 919
.value 991, 1064, 1137, 1210, 1283, 1356, 1428, 1498
.value 1567, 1634, 1698, 1759, 1817, 1870, 1919, 1962
.value 2001, 2032, 2057, 2075, 2085, 2087, 2080, 2063
.value 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535
.value 1414, 1280, 1131, 970, 794, 605, 402, 185
.value -45, -288, -545, -814, -1095, -1388, -1692, -2006
.value -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788
.value -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597
.value -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585
.value -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750
.value -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134
.value -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082
.value -70, 998, 2122, 3300, 4533, 5818, 7154, 8540
.value 9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189
.value 22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360
.value -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863
.value -8147, -6466, -4822, -3222, -1667, -162, 1289, 2684
.value 4019, 5290, 6494, 7629, 8692, 9679, 10590, 11420
.value 12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992
.value 15038
intwindiv:
.long 0x47800000 # 65536.0
.text
.align 32
.globl make_decode_tables_MMX
/* make_decode_tables_MMX(scaleval): fill the float window table decwin and */
/* the 16-bit window table decwins from intwinbase_MMX. Two passes walk the */
/* source table up and back down: the first writes decwin, the second */
/* writes decwins. */
make_decode_tables_MMX:
pushl %edi
pushl %esi
pushl %ebx
/* ECX = destination index, EBX = iteration counter, ESI = constant 32, */
/* EDI = read pointer into intwinbase_MMX. */
xorl %ecx,%ecx
xorl %ebx,%ebx
movl $32,%esi
movl $intwinbase_MMX,%edi
negl 16(%esp) # scaleval
/* The step lives at (%esp); scaleval is at 20(%esp) from here on. */
pushl $2 # intwinbase step
.L00:
/* Pass 1. Destination indices of 528 and above are skipped. */
cmpl $528,%ecx
jnc .L02
movswl (%edi),%eax
/* Entries from byte offset 444 onwards get 60000 added after the 16-bit */
/* load (presumably they are stored reduced by 60000 to fit in 16 bits). */
cmpl $intwinbase_MMX+444,%edi
jc .L01
addl $60000,%eax
.L01:
/* value / 65536.0 * scaleval, computed on the x87 stack via a temporary */
/* stack slot (scaleval is at 24(%esp) while EAX is pushed). */
pushl %eax
fildl (%esp)
fdivs intwindiv
fimull 24(%esp)
popl %eax
/* Store the result at float index ECX and again 16 floats further on. */
fsts decwin(,%ecx,4)
fstps decwin+64(,%ecx,4)
.L02:
/* Every 32nd iteration (counter & 31 == 31) pull the index back by 1023; */
/* when bit 5 of the counter is also set, flip the sign of scaleval. */
leal -1(%esi),%edx
and %ebx,%edx
cmp $31,%edx
jnz .L03
addl $-1023,%ecx
test %esi,%ebx
jz .L03
negl 20(%esp)
.L03:
/* Advance the index by 32 and the read pointer by the current step. */
addl %esi,%ecx
addl (%esp),%edi
incl %ebx
/* The pass ends once the read pointer is back at the table start. */
cmpl $intwinbase_MMX,%edi
jz .L04
/* After 256 iterations the step is negated so the table is read backwards. */
cmp $256,%ebx
jnz .L00
negl (%esp)
jmp .L00
.L04:
/* Pass 2: reset the step, index and counter and fill decwins. */
popl %eax
xorl %ecx,%ecx
xorl %ebx,%ebx
pushl $2
.L05:
cmpl $528,%ecx
jnc .L11
movswl (%edi),%eax
cmpl $intwinbase_MMX+444,%edi
jc .L06
addl $60000,%eax
.L06:
/* EAX = (value * scaleval) >> 17, taken from the 64-bit product EDX:EAX. */
cltd
imull 20(%esp)
shrdl $17,%edx,%eax
/* Clamp to [-32767, 32767]. The movl between cmpl and jle leaves the */
/* flags untouched; it preloads EDX = 1055 for the mirrored store below. */
cmpl $32767,%eax
movl $1055,%edx
jle .L07
movl $32767,%eax
jmp .L08
.L07:
cmpl $-32767,%eax
jge .L08
movl $-32767,%eax
.L08:
/* For indices below 512, also store at the mirrored position 1055 - ECX */
/* and 16 entries before it. */
cmpl $512,%ecx
jnc .L09
subl %ecx,%edx
movw %ax,decwins(,%edx,2)
movw %ax,decwins-32(,%edx,2)
.L09:
/* The forward copy is negated when the index is even. */
testl $1,%ecx
jnz .L10
negl %eax
.L10:
/* Store at index ECX and again 16 entries further on. */
movw %ax,decwins(,%ecx,2)
movw %ax,decwins+32(,%ecx,2)
.L11:
/* Same index/sign/step bookkeeping as in pass 1. */
leal -1(%esi),%edx
and %ebx,%edx
cmp $31,%edx
jnz .L12
addl $-1023,%ecx
test %esi,%ebx
jz .L12
negl 20(%esp)
.L12:
addl %esi,%ecx
addl (%esp),%edi
incl %ebx
cmpl $intwinbase_MMX,%edi
jz .L13
cmp $256,%ebx
jnz .L05
negl (%esp)
jmp .L05
.L13:
/* Drop the step slot and restore the saved registers. */
popl %eax
popl %ebx
popl %esi
popl %edi
ret

View File

@ -1,5 +1,5 @@
// gcc test.c -I.. -L. -lMP3 -lm -o test2 -O4
//gcc test2.c -O2 -I.. -L. ../libvo/aclib.c -lMP3 -lm -o test2
#include <stdio.h>
#include <stdlib.h>