From d7a1862c912ee2df45d409192092947c4819823b Mon Sep 17 00:00:00 2001 From: atmos4 Date: Fri, 27 Jul 2001 17:25:19 +0000 Subject: [PATCH] First development version of dct64, mixed with 3dnow/k7 and fpu code. Phases 1 to 3 seem to be ok already, report if you get strange sound with this version (klicks or distorted sound, that doesn't happen with mmx-only version), I've tested with approx. 20 mp3 files which all sounded ok, speed improvement with this version is still very minimal cause more cpu intensive phases 4 and 5 aren't working so I use fpu code for them. git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1395 b3059339-0415-0410-9bf9-f77b7e298cf2 --- mp3lib/dct64_sse.s | 2217 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2217 insertions(+) create mode 100644 mp3lib/dct64_sse.s diff --git a/mp3lib/dct64_sse.s b/mp3lib/dct64_sse.s new file mode 100644 index 0000000000..922e1c881a --- /dev/null +++ b/mp3lib/dct64_sse.s @@ -0,0 +1,2217 @@ +# This code is a translation of dct64_k7.s from MPlayer. +# Coded by Felix Buenemann +# +# TODO: - fix phases 4 and 5 (sse) +# - optimize scalar FPU code? (interleave with sse code) +# + +//.data +// .align 8 +//x_plus_minus_3dnow: .long 0x00000000, 0x80000000 +//plus_1f: .float 1.0 + +.text + + .align 16 + + .global dct64_MMX_sse + +dct64_MMX_sse: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax + + leal 128(%esp),%edx + movl 272(%esp),%esi + movl 276(%esp),%edi + movl $costab_mmx,%ebx + orl %ecx,%ecx + movl %esp,%ecx + +/* Phase 1 (complete, worx) */ + +// [1] Process Block A1 (16 Bytes) +/ movq (%eax), %mm0 +/ movq 8(%eax), %mm4 + movups (%eax), %xmm0 + +// Copy A1 to another register A2 +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +// Process Block B1 (last 16 bytes) +/ movq 120(%eax), %mm1 +/ movq 112(%eax), %mm5 + movups 112(%eax), %xmm1 + +/* The PSWAPD instruction swaps or reverses the upper and lower + * doublewords of the source operand. 
PSWAPD mmreg1, mmreg2 + * performs the following operations: + * temp = mmreg2 + * mmreg1[63:32] = temp[31:0 ] + * mmreg1[31:0 ] = temp[63:32] + */ +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +// shufps here exchanges a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752) +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +// Add B1 to A1 +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +// Save Block A1 +/ movq %mm0, (%edx) +/ movq %mm4, 8(%edx) + movups %xmm0, (%edx) + +// Sub B1 from A2 +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +// Mul mem with A2 +/ pfmul (%ebx), %mm3 +/ pfmul 8(%ebx), %mm7 + movups (%ebx), %xmm7 + mulps %xmm7, %xmm2 + +// Shuffle A2 +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 +// I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps) + shufps $27, %xmm2, %xmm2 + +// Save A2 to mem (end) +/ movq %mm3, 120(%edx) +/ movq %mm7, 112(%edx) + movups %xmm2, 112(%edx) + +// [2] Process next data block +/ movq 16(%eax), %mm0 +/ movq 24(%eax), %mm4 + movups 16(%eax), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 104(%eax), %mm1 +/ movq 96(%eax), %mm5 + movups 96(%eax), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 16(%edx) +/ movq %mm4, 24(%edx) + movups %xmm0, 16(%edx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul 16(%ebx), %mm3 +/ pfmul 24(%ebx), %mm7 + movups 16(%ebx), %xmm7 + mulps %xmm7, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 104(%edx) +/ movq %mm7, 96(%edx) + movups %xmm2, 96(%edx) + +// [3] +/ movq 32(%eax), %mm0 +/ movq 40(%eax), %mm4 + movups 32(%eax), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 88(%eax), %mm1 +/ movq 80(%eax), %mm5 + movups 80(%eax), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 32(%edx) +/ movq %mm4, 40(%edx) + movups %xmm0, 32(%edx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul 32(%ebx), %mm3 +/ pfmul 40(%ebx), %mm7 + movups 32(%ebx), %xmm7 + mulps %xmm7, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 88(%edx) +/ movq %mm7, 80(%edx) + movups %xmm2, 80(%edx) + +// [4] +/ movq 48(%eax), %mm0 +/ movq 56(%eax), %mm4 + movups 48(%eax), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 72(%eax), %mm1 +/ movq 64(%eax), %mm5 + movups 64(%eax), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 48(%edx) +/ movq %mm4, 56(%edx) + movups %xmm0, 48(%edx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul 48(%ebx), %mm3 +/ pfmul 56(%ebx), %mm7 + movups 48(%ebx), %xmm7 + mulps %xmm7, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 72(%edx) +/ movq %mm7, 64(%edx) + movups %xmm2, 64(%edx) + + +// phase 1 fpu code +/* Phase 1*/ +/* + flds (%eax) + leal 128(%esp),%edx + fadds 124(%eax) + movl 272(%esp),%esi + fstps (%edx) + movl 276(%esp),%edi + + flds 4(%eax) + movl $costab_mmx,%ebx + fadds 120(%eax) + orl %ecx,%ecx + fstps 4(%edx) + + flds (%eax) + movl %esp,%ecx + fsubs 124(%eax) + 
fmuls (%ebx) + fstps 124(%edx) + + flds 4(%eax) + fsubs 120(%eax) + fmuls 4(%ebx) + fstps 120(%edx) + + flds 8(%eax) + fadds 116(%eax) + fstps 8(%edx) + + flds 12(%eax) + fadds 112(%eax) + fstps 12(%edx) + + flds 8(%eax) + fsubs 116(%eax) + fmuls 8(%ebx) + fstps 116(%edx) + + flds 12(%eax) + fsubs 112(%eax) + fmuls 12(%ebx) + fstps 112(%edx) + + flds 16(%eax) + fadds 108(%eax) + fstps 16(%edx) + + flds 20(%eax) + fadds 104(%eax) + fstps 20(%edx) + + flds 16(%eax) + fsubs 108(%eax) + fmuls 16(%ebx) + fstps 108(%edx) + + flds 20(%eax) + fsubs 104(%eax) + fmuls 20(%ebx) + fstps 104(%edx) + + flds 24(%eax) + fadds 100(%eax) + fstps 24(%edx) + + flds 28(%eax) + fadds 96(%eax) + fstps 28(%edx) + + flds 24(%eax) + fsubs 100(%eax) + fmuls 24(%ebx) + fstps 100(%edx) + + flds 28(%eax) + fsubs 96(%eax) + fmuls 28(%ebx) + fstps 96(%edx) + + flds 32(%eax) + fadds 92(%eax) + fstps 32(%edx) + + flds 36(%eax) + fadds 88(%eax) + fstps 36(%edx) + + flds 32(%eax) + fsubs 92(%eax) + fmuls 32(%ebx) + fstps 92(%edx) + + flds 36(%eax) + fsubs 88(%eax) + fmuls 36(%ebx) + fstps 88(%edx) + + flds 40(%eax) + fadds 84(%eax) + fstps 40(%edx) + + flds 44(%eax) + fadds 80(%eax) + fstps 44(%edx) + + flds 40(%eax) + fsubs 84(%eax) + fmuls 40(%ebx) + fstps 84(%edx) + + flds 44(%eax) + fsubs 80(%eax) + fmuls 44(%ebx) + fstps 80(%edx) + + flds 48(%eax) + fadds 76(%eax) + fstps 48(%edx) + + flds 52(%eax) + fadds 72(%eax) + fstps 52(%edx) + + flds 48(%eax) + fsubs 76(%eax) + fmuls 48(%ebx) + fstps 76(%edx) + + flds 52(%eax) + fsubs 72(%eax) + fmuls 52(%ebx) + fstps 72(%edx) + + flds 56(%eax) + fadds 68(%eax) + fstps 56(%edx) + + flds 60(%eax) + fadds 64(%eax) + fstps 60(%edx) + + flds 56(%eax) + fsubs 68(%eax) + fmuls 56(%ebx) + fstps 68(%edx) + + flds 60(%eax) + fsubs 64(%eax) + fmuls 60(%ebx) + fstps 64(%edx) +*/ +// end phase 1 fpu code + +/* Phase 2 (completed, worx) */ + +/ movq (%edx), %mm0 +/ movq 8(%edx), %mm4 + movups (%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 56(%edx), %mm1 +/ movq 48(%edx), %mm5 + movups 48(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, (%ecx) +/ movq %mm4, 8(%ecx) + movups %xmm0, (%ecx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul 64(%ebx), %mm3 +/ pfmul 72(%ebx), %mm7 + movups 64(%ebx), %xmm7 + mulps %xmm7, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 56(%ecx) +/ movq %mm7, 48(%ecx) + movups %xmm2, 48(%ecx) + +/ movq 16(%edx), %mm0 +/ movq 24(%edx), %mm4 + movups 16(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 40(%edx), %mm1 +/ movq 32(%edx), %mm5 + movups 32(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 16(%ecx) +/ movq %mm4, 24(%ecx) + movups %xmm0, 16(%ecx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul 80(%ebx), %mm3 +/ pfmul 88(%ebx), %mm7 + movups 80(%ebx), %xmm7 + mulps %xmm7, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 40(%ecx) +/ movq %mm7, 32(%ecx) + movups %xmm2, 32(%ecx) + + +// phase 2 fpu +/* Phase 2*/ +/* + flds (%edx) + fadds 60(%edx) + fstps (%ecx) + + flds 4(%edx) + fadds 56(%edx) + fstps 4(%ecx) + + flds (%edx) + fsubs 60(%edx) + fmuls 64(%ebx) + fstps 
60(%ecx) + + flds 4(%edx) + fsubs 56(%edx) + fmuls 68(%ebx) + fstps 56(%ecx) + + flds 8(%edx) + fadds 52(%edx) + fstps 8(%ecx) + + flds 12(%edx) + fadds 48(%edx) + fstps 12(%ecx) + + flds 8(%edx) + fsubs 52(%edx) + fmuls 72(%ebx) + fstps 52(%ecx) + + flds 12(%edx) + fsubs 48(%edx) + fmuls 76(%ebx) + fstps 48(%ecx) + + flds 16(%edx) + fadds 44(%edx) + fstps 16(%ecx) + + flds 20(%edx) + fadds 40(%edx) + fstps 20(%ecx) + + flds 16(%edx) + fsubs 44(%edx) + fmuls 80(%ebx) + fstps 44(%ecx) + + flds 20(%edx) + fsubs 40(%edx) + fmuls 84(%ebx) + fstps 40(%ecx) + + flds 24(%edx) + fadds 36(%edx) + fstps 24(%ecx) + + flds 28(%edx) + fadds 32(%edx) + fstps 28(%ecx) + + flds 24(%edx) + fsubs 36(%edx) + fmuls 88(%ebx) + fstps 36(%ecx) + + flds 28(%edx) + fsubs 32(%edx) + fmuls 92(%ebx) + fstps 32(%ecx) +*/ +// end phase 2 fpu + +/* Phase 3 (completed, working) */ + +/ movq 64(%edx), %mm0 +/ movq 72(%edx), %mm4 + movups 64(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 120(%edx), %mm1 +/ movq 112(%edx), %mm5 + movups 112(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 64(%ecx) +/ movq %mm4, 72(%ecx) + movups %xmm0, 64(%ecx) + +/ pfsubr %mm1, %mm3 +/ pfsubr %mm5, %mm7 +// optimized (xmm1<->xmm2) + subps %xmm2, %xmm1 + +/ pfmul 64(%ebx), %mm3 +/ pfmul 72(%ebx), %mm7 + movups 64(%ebx), %xmm7 + mulps %xmm7, %xmm1 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm1, %xmm1 + +/ movq %mm3, 120(%ecx) +/ movq %mm7, 112(%ecx) + movups %xmm1, 112(%ecx) + + +/ movq 80(%edx), %mm0 +/ movq 88(%edx), %mm4 + movups 80(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 104(%edx), %mm1 +/ movq 96(%edx), %mm5 + movups 96(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 80(%ecx) +/ movq %mm4, 88(%ecx) + movups %xmm0, 80(%ecx) + +/ pfsubr %mm1, %mm3 +/ pfsubr %mm5, %mm7 +// optimized (xmm1<->xmm2) + subps %xmm2, %xmm1 + +/ pfmul 80(%ebx), %mm3 +/ pfmul 88(%ebx), %mm7 + movups 80(%ebx), %xmm7 + mulps %xmm7, %xmm1 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm1, %xmm1 + +/ movq %mm3, 104(%ecx) +/ movq %mm7, 96(%ecx) + movups %xmm1, 96(%ecx) + + +// phase 3 fpu +/* Phase 3*/ +/* + flds 64(%edx) + fadds 124(%edx) + fstps 64(%ecx) + + flds 68(%edx) + fadds 120(%edx) + fstps 68(%ecx) + + flds 124(%edx) + fsubs 64(%edx) + fmuls 64(%ebx) + fstps 124(%ecx) + + flds 120(%edx) + fsubs 68(%edx) + fmuls 68(%ebx) + fstps 120(%ecx) + + flds 72(%edx) + fadds 116(%edx) + fstps 72(%ecx) + + flds 76(%edx) + fadds 112(%edx) + fstps 76(%ecx) + + flds 116(%edx) + fsubs 72(%edx) + fmuls 72(%ebx) + fstps 116(%ecx) + + flds 112(%edx) + fsubs 76(%edx) + fmuls 76(%ebx) + fstps 112(%ecx) + + flds 80(%edx) + fadds 108(%edx) + fstps 80(%ecx) + + flds 84(%edx) + fadds 104(%edx) + fstps 84(%ecx) + + flds 108(%edx) + fsubs 80(%edx) + fmuls 80(%ebx) + fstps 108(%ecx) + + flds 104(%edx) + fsubs 84(%edx) + fmuls 84(%ebx) + fstps 104(%ecx) + + flds 88(%edx) + fadds 100(%edx) + fstps 88(%ecx) + + flds 92(%edx) + fadds 96(%edx) + fstps 92(%ecx) + + flds 100(%edx) + fsubs 88(%edx) + fmuls 88(%ebx) + fstps 100(%ecx) + + flds 96(%edx) + fsubs 92(%edx) + fmuls 92(%ebx) + fstps 96(%ecx) +*/ +// end phase 3 fpu + + +/* Phase 4 (completed, buggy) */ +/* +/ movq 96(%ebx), %mm2 +/ movq 
104(%ebx), %mm6 + movups 96(%ebx), %xmm4 + + +/ movq (%ecx), %mm0 +/ movq 8(%ecx), %mm4 + movups (%ecx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 24(%ecx), %mm1 +/ movq 16(%ecx), %mm5 + movups 16(%ecx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, (%edx) +/ movq %mm4, 8(%edx) + movups %xmm0, (%edx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm6, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 24(%edx) +/ movq %mm7, 16(%edx) + movups %xmm2, 16(%edx) + +/ movq 32(%ecx), %mm0 +/ movq 40(%ecx), %mm4 + movups 32(%ecx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 56(%ecx), %mm1 +/ movq 48(%ecx), %mm5 + movups 48(%ecx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 32(%edx) +/ movq %mm4, 40(%edx) + movups %xmm0, 32(%edx) + +/ pfsubr %mm1, %mm3 +/ pfsubr %mm5, %mm7 +// Luckily we can swap this (xmm1<->xmm2) + subps %xmm2, %xmm1 + +/ pfmul %mm2, %mm3 +/ pfmul %mm6, %mm7 + mulps %xmm4, %xmm1 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm1, %xmm1 + +/ movq %mm3, 56(%edx) +/ movq %mm7, 48(%edx) + movups %xmm1, 48(%edx) + + +/ movq 64(%ecx), %mm0 +/ movq 72(%ecx), %mm4 + movups 64(%ecx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 88(%ecx), %mm1 +/ movq 80(%ecx), %mm5 + movups 80(%ecx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 64(%edx) +/ movq %mm4, 72(%edx) + movups %xmm0, 64(%edx) + +/ pfsub %mm1, %mm3 +/ pfsub %mm5, %mm7 + subps %xmm1, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm6, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm2, %xmm2 + +/ movq %mm3, 88(%edx) +/ movq %mm7, 80(%edx) + movups %xmm2, 80(%edx) + + +/ movq 96(%ecx), %mm0 +/ movq 104(%ecx), %mm4 + movups 96(%ecx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 120(%ecx), %mm1 +/ movq 112(%ecx), %mm5 + movups 112(%ecx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 +//// shufps $177, %xmm1, %xmm1 + shufps $27, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 96(%edx) +/ movq %mm4, 104(%edx) + movups %xmm0, 96(%edx) + +/ pfsubr %mm1, %mm3 +/ pfsubr %mm5, %mm7 +// This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase + subps %xmm2, %xmm1 + +/ pfmul %mm2, %mm3 +/ pfmul %mm6, %mm7 + mulps %xmm4, %xmm1 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $27, %xmm1, %xmm1 + +/ movq %mm3, 120(%edx) +/ movq %mm7, 112(%edx) + movups %xmm1, 112(%edx) +*/ + +// phase 4 fpu code +/* Phase 4*/ + + flds (%ecx) + fadds 28(%ecx) + fstps (%edx) + + flds (%ecx) + fsubs 28(%ecx) + fmuls 96(%ebx) + fstps 28(%edx) + + flds 4(%ecx) + fadds 24(%ecx) + fstps 4(%edx) + + flds 4(%ecx) + fsubs 24(%ecx) + fmuls 100(%ebx) + fstps 24(%edx) + + flds 8(%ecx) + fadds 20(%ecx) + fstps 8(%edx) + + flds 8(%ecx) + fsubs 20(%ecx) + fmuls 104(%ebx) + fstps 20(%edx) + + flds 12(%ecx) + fadds 16(%ecx) + fstps 12(%edx) + + flds 12(%ecx) + fsubs 16(%ecx) + fmuls 108(%ebx) + fstps 
16(%edx) + + flds 32(%ecx) + fadds 60(%ecx) + fstps 32(%edx) + + flds 60(%ecx) + fsubs 32(%ecx) + fmuls 96(%ebx) + fstps 60(%edx) + + flds 36(%ecx) + fadds 56(%ecx) + fstps 36(%edx) + + flds 56(%ecx) + fsubs 36(%ecx) + fmuls 100(%ebx) + fstps 56(%edx) + + flds 40(%ecx) + fadds 52(%ecx) + fstps 40(%edx) + + flds 52(%ecx) + fsubs 40(%ecx) + fmuls 104(%ebx) + fstps 52(%edx) + + flds 44(%ecx) + fadds 48(%ecx) + fstps 44(%edx) + + flds 48(%ecx) + fsubs 44(%ecx) + fmuls 108(%ebx) + fstps 48(%edx) + + flds 64(%ecx) + fadds 92(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 92(%ecx) + fmuls 96(%ebx) + fstps 92(%edx) + + flds 68(%ecx) + fadds 88(%ecx) + fstps 68(%edx) + + flds 68(%ecx) + fsubs 88(%ecx) + fmuls 100(%ebx) + fstps 88(%edx) + + flds 72(%ecx) + fadds 84(%ecx) + fstps 72(%edx) + + flds 72(%ecx) + fsubs 84(%ecx) + fmuls 104(%ebx) + fstps 84(%edx) + + flds 76(%ecx) + fadds 80(%ecx) + fstps 76(%edx) + + flds 76(%ecx) + fsubs 80(%ecx) + fmuls 108(%ebx) + fstps 80(%edx) + + flds 96(%ecx) + fadds 124(%ecx) + fstps 96(%edx) + + flds 124(%ecx) + fsubs 96(%ecx) + fmuls 96(%ebx) + fstps 124(%edx) + + flds 100(%ecx) + fadds 120(%ecx) + fstps 100(%edx) + + flds 120(%ecx) + fsubs 100(%ecx) + fmuls 100(%ebx) + fstps 120(%edx) + + flds 104(%ecx) + fadds 116(%ecx) + fstps 104(%edx) + + flds 116(%ecx) + fsubs 104(%ecx) + fmuls 104(%ebx) + fstps 116(%edx) + + flds 108(%ecx) + fadds 112(%ecx) + fstps 108(%edx) + + flds 112(%ecx) + fsubs 108(%ecx) + fmuls 108(%ebx) + fstps 112(%edx) + + flds (%edx) + fadds 12(%edx) + fstps (%ecx) + + flds (%edx) + fsubs 12(%edx) + fmuls 112(%ebx) + fstps 12(%ecx) + + flds 4(%edx) + fadds 8(%edx) + fstps 4(%ecx) + + flds 4(%edx) + fsubs 8(%edx) + fmuls 116(%ebx) + fstps 8(%ecx) + + flds 16(%edx) + fadds 28(%edx) + fstps 16(%ecx) + + flds 28(%edx) + fsubs 16(%edx) + fmuls 112(%ebx) + fstps 28(%ecx) + + flds 20(%edx) + fadds 24(%edx) + fstps 20(%ecx) + + flds 24(%edx) + fsubs 20(%edx) + fmuls 116(%ebx) + fstps 24(%ecx) + + flds 32(%edx) + fadds 44(%edx) + fstps 32(%ecx) + + flds 32(%edx) + fsubs 44(%edx) + fmuls 112(%ebx) + fstps 44(%ecx) + + flds 36(%edx) + fadds 40(%edx) + fstps 36(%ecx) + + flds 36(%edx) + fsubs 40(%edx) + fmuls 116(%ebx) + fstps 40(%ecx) + + flds 48(%edx) + fadds 60(%edx) + fstps 48(%ecx) + + flds 60(%edx) + fsubs 48(%edx) + fmuls 112(%ebx) + fstps 60(%ecx) + + flds 52(%edx) + fadds 56(%edx) + fstps 52(%ecx) + + flds 56(%edx) + fsubs 52(%edx) + fmuls 116(%ebx) + fstps 56(%ecx) + + flds 64(%edx) + fadds 76(%edx) + fstps 64(%ecx) + + flds 64(%edx) + fsubs 76(%edx) + fmuls 112(%ebx) + fstps 76(%ecx) + + flds 68(%edx) + fadds 72(%edx) + fstps 68(%ecx) + + flds 68(%edx) + fsubs 72(%edx) + fmuls 116(%ebx) + fstps 72(%ecx) + + flds 80(%edx) + fadds 92(%edx) + fstps 80(%ecx) + + flds 92(%edx) + fsubs 80(%edx) + fmuls 112(%ebx) + fstps 92(%ecx) + + flds 84(%edx) + fadds 88(%edx) + fstps 84(%ecx) + + flds 88(%edx) + fsubs 84(%edx) + fmuls 116(%ebx) + fstps 88(%ecx) + + flds 96(%edx) + fadds 108(%edx) + fstps 96(%ecx) + + flds 96(%edx) + fsubs 108(%edx) + fmuls 112(%ebx) + fstps 108(%ecx) + + flds 100(%edx) + fadds 104(%edx) + fstps 100(%ecx) + + flds 100(%edx) + fsubs 104(%edx) + fmuls 116(%ebx) + fstps 104(%ecx) + + flds 112(%edx) + fadds 124(%edx) + fstps 112(%ecx) + + flds 124(%edx) + fsubs 112(%edx) + fmuls 112(%ebx) + fstps 124(%ecx) + + flds 116(%edx) + fadds 120(%edx) + fstps 116(%ecx) + + flds 120(%edx) + fsubs 116(%edx) + fmuls 116(%ebx) + fstps 120(%ecx) + +// end of phase 4 fpu + +// below stuff needs to be finished I use FPU code for first +/* Phase 5 
(completed, crashing) */ +/* +/ movq 112(%ebx), %mm2 + // move 8 byte data to (low)high quadword - check this! atmos + movlps 112(%ebx), %xmm4 + // maybe I need movhlps too to get data into correct quadword + movlhps %xmm4, %xmm4 + +/ movq (%edx), %mm0 +/ movq 16(%edx), %mm4 + movups (%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +// hmm? this is strange +/ movq 8(%edx), %mm1 +/ movq 24(%edx), %mm5 + movlps 8(%edx), %xmm1 + movhps 24(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 + pshufd $177, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, (%ecx) +/ movq %mm4, 16(%ecx) + movlps %xmm0, (%ecx) + movhps %xmm0, 16(%ecx) + +/ pfsub %mm1, %mm3 +/ pfsubr %mm5, %mm7 +// I need to emulate pfsubr here + movaps %xmm1, %xmm3 + subps %xmm2, %xmm3 + subps %xmm1, %xmm2 +// now move correct quadword from reverse substration in xmm3 to correct +// quadword in xmm2 and leave other quadword with non-reversed substration untouched +/// shufpd $2, %xmm3, %xmm2 +// (or $1?) (see ia32-ref p.749) +// optimize + movq %xmm2, %xmm3 + movaps %xmm3, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm2, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $177, %xmm2, %xmm2 + +/ movq %mm3, 8(%ecx) +/ movq %mm7, 24(%ecx) + movlps %xmm2, 8(%ecx) + movhps %xmm2, 24(%ecx) + +/ movq 32(%edx), %mm0 +/ movq 48(%edx), %mm4 + movlps 32(%edx), %xmm0 + movhps 48(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 40(%edx), %mm1 +/ movq 56(%edx), %mm5 + movlps 40(%edx), %xmm1 + movhps 56(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 + shufps $177, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 32(%ecx) +/ movq %mm4, 48(%ecx) + movlps %xmm0, 32(%ecx) + movhps %xmm0, 48(%ecx) + +/ pfsub %mm1, %mm3 +/ pfsubr %mm5, %mm7 + movaps %xmm1, %xmm3 + subps %xmm2, %xmm3 + subps %xmm1, %xmm2 +/// shufpd $2, %xmm3, %xmm2 +// (or $1?) +// optimize + movq %xmm2, %xmm3 + movaps %xmm3, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm2, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $177, %xmm2, %xmm2 + +/ movq %mm3, 40(%ecx) +/ movq %mm7, 56(%ecx) + movlps %xmm2, 40(%ecx) + movhps %xmm2, 56(%ecx) + + +/ movq 64(%edx), %mm0 +/ movq 80(%edx), %mm4 + movlps 64(%edx), %xmm0 + movhps 80(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 72(%edx), %mm1 +/ movq 88(%edx), %mm5 + movlps 72(%edx), %xmm1 + movhps 88(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 + shufps $177, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 64(%ecx) +/ movq %mm4, 80(%ecx) + movlps %xmm0, 64(%ecx) + movhps %xmm0, 80(%ecx) + +/ pfsub %mm1, %mm3 +/ pfsubr %mm5, %mm7 + movaps %xmm1, %xmm3 + subps %xmm2, %xmm3 + subps %xmm1, %xmm2 +/// shufpd $2, %xmm3, %xmm2 +// (or $1?) 
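+// note: shufpd is an SSE2 instruction, so it's not usable on SSE-only cpus
+// (same goes for the other shufpd lines and the pshufd in this phase);
+// the merge we want - keep the plain subps result in the low quadword of
+// xmm2 and take the reversed one from the high quadword of xmm3 - should
+// also be possible with SSE1-only "shufps $0xE4, %xmm3, %xmm2" (untested),
+// and the right shufpd immediate for that merge would be $2, not $1
+// also: movq between xmm registers is SSE2 too and zeroes bits 127:64 of
+// the destination, so the movq/movaps "optimize" pair below loses the
+// reversed half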
+// optimize + movq %xmm2, %xmm3 + movaps %xmm3, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm2, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $177, %xmm2, %xmm2 + +/ movq %mm3, 72(%ecx) +/ movq %mm7, 88(%ecx) + movlps %xmm2, 72(%ecx) + movhps %xmm2, 88(%ecx) + +/ movq 96(%edx), %mm0 +/ movq 112(%edx), %mm4 + movups 96(%edx), %xmm0 + +/ movq %mm0, %mm3 +/ movq %mm4, %mm7 + movaps %xmm0, %xmm2 + +/ movq 104(%edx), %mm1 +/ movq 120(%edx), %mm5 + movlps 104(%edx), %xmm1 + movhps 120(%edx), %xmm1 + +/ pswapd %mm1, %mm1 +/ pswapd %mm5, %mm5 + shufps $177, %xmm1, %xmm1 + +/ pfadd %mm1, %mm0 +/ pfadd %mm5, %mm4 + addps %xmm1, %xmm0 + +/ movq %mm0, 96(%ecx) +/ movq %mm4, 112(%ecx) + movups %xmm0, 96(%ecx) + +/ pfsub %mm1, %mm3 +/ pfsubr %mm5, %mm7 + movaps %xmm1, %xmm3 + subps %xmm2, %xmm3 + subps %xmm1, %xmm2 +/// shufpd $2, %xmm3, %xmm2 +// (or $1?) +// optimize + movq %xmm2, %xmm3 + movaps %xmm3, %xmm2 + +/ pfmul %mm2, %mm3 +/ pfmul %mm2, %mm7 + mulps %xmm4, %xmm2 + +/ pswapd %mm3, %mm3 +/ pswapd %mm7, %mm7 + shufps $177, %xmm2, %xmm2 + +/ movq %mm3, 104(%ecx) +/ movq %mm7, 120(%ecx) + movlps %xmm2, 104(%ecx) + movhps %xmm2, 120(%ecx) +*/ + + +/* Phase 6. This is the end of easy road. */ +/* Code below is coded in scalar mode. Should be optimized */ +// +// movd plus_1f, %mm6 +// punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ +// movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */ +/* + movq 32(%ecx), %mm0 + movq 64(%ecx), %mm2 + movq %mm0, %mm1 + movq %mm2, %mm3 + pxor %mm7, %mm1 + pxor %mm7, %mm3 + pfacc %mm1, %mm0 + pfacc %mm3, %mm2 + pfmul %mm6, %mm0 + pfmul %mm6, %mm2 + movq %mm0, 32(%edx) + movq %mm2, 64(%edx) + + movd 44(%ecx), %mm0 + movd 40(%ecx), %mm2 + movd 120(%ebx), %mm3 + punpckldq 76(%ecx), %mm0 + punpckldq 72(%ecx), %mm2 + punpckldq %mm3, %mm3 + movq %mm0, %mm4 + movq %mm2, %mm5 + pfsub %mm2, %mm0 + pfmul %mm3, %mm0 + movq %mm0, %mm1 + pfadd %mm5, %mm0 + pfadd %mm4, %mm0 + movq %mm0, %mm2 + punpckldq %mm1, %mm0 + punpckhdq %mm1, %mm2 + movq %mm0, 40(%edx) + movq %mm2, 72(%edx) + + movd 48(%ecx), %mm3 + movd 60(%ecx), %mm2 + pfsub 52(%ecx), %mm3 + pfsub 56(%ecx), %mm2 + pfmul 120(%ebx), %mm3 + pfmul 120(%ebx), %mm2 + movq %mm2, %mm1 + + pfadd 56(%ecx), %mm1 + pfadd 60(%ecx), %mm1 + movq %mm1, %mm0 + + pfadd 48(%ecx), %mm0 + pfadd 52(%ecx), %mm0 + pfadd %mm3, %mm1 + punpckldq %mm2, %mm1 + pfadd %mm3, %mm2 + punpckldq %mm2, %mm0 + movq %mm1, 56(%edx) + movq %mm0, 48(%edx) +*/ +/*---*/ +/* + movd 92(%ecx), %mm1 + pfsub 88(%ecx), %mm1 + pfmul 120(%ebx), %mm1 + movd %mm1, 92(%edx) + pfadd 92(%ecx), %mm1 + pfadd 88(%ecx), %mm1 + movq %mm1, %mm0 + + pfadd 80(%ecx), %mm0 + pfadd 84(%ecx), %mm0 + movd %mm0, 80(%edx) + + movd 80(%ecx), %mm0 + pfsub 84(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pfadd %mm0, %mm1 + pfadd 92(%edx), %mm0 + punpckldq %mm1, %mm0 + movq %mm0, 84(%edx) + + movq 96(%ecx), %mm0 + movq %mm0, %mm1 + pxor %mm7, %mm1 + pfacc %mm1, %mm0 + pfmul %mm6, %mm0 + movq %mm0, 96(%edx) + + movd 108(%ecx), %mm0 + pfsub 104(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 108(%edx) + pfadd 104(%ecx), %mm0 + pfadd 108(%ecx), %mm0 + movd %mm0, 104(%edx) + + movd 124(%ecx), %mm1 + pfsub 120(%ecx), %mm1 + pfmul 120(%ebx), %mm1 + movd %mm1, 124(%edx) + pfadd 120(%ecx), %mm1 + pfadd 124(%ecx), %mm1 + movq %mm1, %mm0 + + pfadd 112(%ecx), %mm0 + pfadd 116(%ecx), %mm0 + movd %mm0, 112(%edx) + + movd 112(%ecx), %mm0 + pfsub 116(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pfadd %mm0,%mm1 + pfadd 124(%edx), %mm0 + punpckldq %mm1, %mm0 + movq %mm0, 116(%edx) + + jnz .L01 +*/ 
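+// note: the commented-out 3dnow scalar phases (6-10) load plus_1f and
+// x_plus_minus_3dnow into mm6/mm7 at the start of phase 6, and those
+// constants live in the .data section that is commented out at the top of
+// this file - both have to be re-enabled together if this path is ever
+// turned on; pfacc (a horizontal add) also has no single-instruction SSE1
+// equivalent, so for now the FPU code further down handles these phases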
+ + +/* Phase 7*/ +/* Code below is coded in scalar mode. Should be optimized */ +/* + movd (%ecx), %mm0 + pfadd 4(%ecx), %mm0 + movd %mm0, 1024(%esi) + + movd (%ecx), %mm0 + pfsub 4(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, (%esi) + movd %mm0, (%edi) + + movd 12(%ecx), %mm0 + pfsub 8(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 512(%edi) + pfadd 12(%ecx), %mm0 + pfadd 8(%ecx), %mm0 + movd %mm0, 512(%esi) + + movd 16(%ecx), %mm0 + pfsub 20(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movq %mm0, %mm3 + + movd 28(%ecx), %mm0 + pfsub 24(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 768(%edi) + movq %mm0, %mm2 + + pfadd 24(%ecx), %mm0 + pfadd 28(%ecx), %mm0 + movq %mm0, %mm1 + + pfadd 16(%ecx), %mm0 + pfadd 20(%ecx), %mm0 + movd %mm0, 768(%esi) + pfadd %mm3, %mm1 + movd %mm1, 256(%esi) + pfadd %mm3, %mm2 + movd %mm2, 256(%edi) +*/ + + +/* Phase 8*/ +/* + movq 32(%edx), %mm0 + movq 48(%edx), %mm1 + pfadd 48(%edx), %mm0 + pfadd 40(%edx), %mm1 + movd %mm0, 896(%esi) + movd %mm1, 640(%esi) + psrlq $32, %mm0 + psrlq $32, %mm1 + movd %mm0, 128(%edi) + movd %mm1, 384(%edi) + + movd 40(%edx), %mm0 + pfadd 56(%edx), %mm0 + movd %mm0, 384(%esi) + + movd 56(%edx), %mm0 + pfadd 36(%edx), %mm0 + movd %mm0, 128(%esi) + + movd 60(%edx), %mm0 + movd %mm0, 896(%edi) + pfadd 44(%edx), %mm0 + movd %mm0, 640(%edi) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm2 + movq 104(%edx), %mm4 + pfadd 112(%edx), %mm0 + pfadd 104(%edx), %mm2 + pfadd 120(%edx), %mm4 + movq %mm0, %mm1 + movq %mm2, %mm3 + movq %mm4, %mm5 + pfadd 64(%edx), %mm0 + pfadd 80(%edx), %mm2 + pfadd 72(%edx), %mm4 + movd %mm0, 960(%esi) + movd %mm2, 704(%esi) + movd %mm4, 448(%esi) + psrlq $32, %mm0 + psrlq $32, %mm2 + psrlq $32, %mm4 + movd %mm0, 64(%edi) + movd %mm2, 320(%edi) + movd %mm4, 576(%edi) + pfadd 80(%edx), %mm1 + pfadd 72(%edx), %mm3 + pfadd 88(%edx), %mm5 + movd %mm1, 832(%esi) + movd %mm3, 576(%esi) + movd %mm5, 320(%esi) + psrlq $32, %mm1 + psrlq $32, %mm3 + psrlq $32, %mm5 + movd %mm1, 192(%edi) + movd %mm3, 448(%edi) + movd %mm5, 704(%edi) + + movd 120(%edx), %mm0 + pfadd 100(%edx), %mm0 + movq %mm0, %mm1 + pfadd 88(%edx), %mm0 + movd %mm0, 192(%esi) + pfadd 68(%edx), %mm1 + movd %mm1, 64(%esi) + + movd 124(%edx), %mm0 + movd %mm0, 960(%edi) + pfadd 92(%edx), %mm0 + movd %mm0, 832(%edi) + + jmp .L_bye +.L01: +*/ + + +/* Phase 9*/ +/* + movq (%ecx), %mm0 + movq %mm0, %mm1 + pxor %mm7, %mm1 + pfacc %mm1, %mm0 + pfmul %mm6, %mm0 + pf2id %mm0, %mm0 + movd %mm0, %eax + movw %ax, 512(%esi) + psrlq $32, %mm0 + movd %mm0, %eax + movw %ax, (%esi) + + movd 12(%ecx), %mm0 + pfsub 8(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pf2id %mm0, %mm7 + movd %mm7, %eax + movw %ax, 256(%edi) + pfadd 12(%ecx), %mm0 + pfadd 8(%ecx), %mm0 + pf2id %mm0, %mm0 + movd %mm0, %eax + movw %ax, 256(%esi) + + movd 16(%ecx), %mm3 + pfsub 20(%ecx), %mm3 + pfmul 120(%ebx), %mm3 + movq %mm3, %mm2 + + movd 28(%ecx), %mm2 + pfsub 24(%ecx), %mm2 + pfmul 120(%ebx), %mm2 + movq %mm2, %mm1 + + pf2id %mm2, %mm7 + movd %mm7, %eax + movw %ax, 384(%edi) + + pfadd 24(%ecx), %mm1 + pfadd 28(%ecx), %mm1 + movq %mm1, %mm0 + + pfadd 16(%ecx), %mm0 + pfadd 20(%ecx), %mm0 + pf2id %mm0, %mm0 + movd %mm0, %eax + movw %ax, 384(%esi) + pfadd %mm3, %mm1 + pf2id %mm1, %mm1 + movd %mm1, %eax + movw %ax, 128(%esi) + pfadd %mm3, %mm2 + pf2id %mm2, %mm2 + movd %mm2, %eax + movw %ax, 128(%edi) +*/ + + +/* Phase 10*/ +/* + movq 32(%edx), %mm0 + movq 48(%edx), %mm1 + pfadd 48(%edx), %mm0 + pfadd 40(%edx), %mm1 + pf2id %mm0, %mm0 + pf2id %mm1, %mm1 + movd %mm0, %eax + movd %mm1, %ecx + movw %ax, 
448(%esi) + movw %cx, 320(%esi) + psrlq $32, %mm0 + psrlq $32, %mm1 + movd %mm0, %eax + movd %mm1, %ecx + movw %ax, 64(%edi) + movw %cx, 192(%edi) + + movd 40(%edx), %mm3 + movd 56(%edx), %mm4 + movd 60(%edx), %mm0 + movd 44(%edx), %mm2 + movd 120(%edx), %mm5 + punpckldq %mm4, %mm3 + punpckldq 124(%edx), %mm0 + pfadd 100(%edx), %mm5 + punpckldq 36(%edx), %mm4 + punpckldq 92(%edx), %mm2 + movq %mm5, %mm6 + pfadd %mm4, %mm3 + pf2id %mm0, %mm1 + pf2id %mm3, %mm3 + pfadd 88(%edx), %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movw %ax, 448(%edi) + movw %cx, 192(%esi) + pf2id %mm5, %mm5 + psrlq $32, %mm1 + psrlq $32, %mm3 + movd %mm5, %ebx + movd %mm1, %eax + movd %mm3, %ecx + movw %bx, 96(%esi) + movw %ax, 480(%edi) + movw %cx, 64(%esi) + pfadd %mm2, %mm0 + pf2id %mm0, %mm0 + movd %mm0, %eax + pfadd 68(%edx), %mm6 + movw %ax, 320(%edi) + psrlq $32, %mm0 + pf2id %mm6, %mm6 + movd %mm0, %eax + movd %mm6, %ebx + movw %ax, 416(%edi) + movw %bx, 32(%esi) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm2 + movq 104(%edx), %mm4 + pfadd %mm2, %mm0 + pfadd %mm4, %mm2 + pfadd 120(%edx), %mm4 + movq %mm0, %mm1 + movq %mm2, %mm3 + movq %mm4, %mm5 + pfadd 64(%edx), %mm0 + pfadd 80(%edx), %mm2 + pfadd 72(%edx), %mm4 + pf2id %mm0, %mm0 + pf2id %mm2, %mm2 + pf2id %mm4, %mm4 + movd %mm0, %eax + movd %mm2, %ecx + movd %mm4, %ebx + movw %ax, 480(%esi) + movw %cx, 352(%esi) + movw %bx, 224(%esi) + psrlq $32, %mm0 + psrlq $32, %mm2 + psrlq $32, %mm4 + movd %mm0, %eax + movd %mm2, %ecx + movd %mm4, %ebx + movw %ax, 32(%edi) + movw %cx, 160(%edi) + movw %bx, 288(%edi) + pfadd 80(%edx), %mm1 + pfadd 72(%edx), %mm3 + pfadd 88(%edx), %mm5 + pf2id %mm1, %mm1 + pf2id %mm3, %mm3 + pf2id %mm5, %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movd %mm5, %ebx + movw %ax, 416(%esi) + movw %cx, 288(%esi) + movw %bx, 160(%esi) + psrlq $32, %mm1 + psrlq $32, %mm3 + psrlq $32, %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movd %mm5, %ebx + movw %ax, 96(%edi) + movw %cx, 224(%edi) + movw %bx, 352(%edi) + + movsw + +.L_bye: + addl $256,%esp +/ femms + emms + popl %edi + popl %esi + popl %ebx + ret $12 +*/ + +// here comes old fashioned FPU code for the tough parts + +/* Phase 5*/ + + flds 32(%ecx) + fadds 36(%ecx) + fstps 32(%edx) + + flds 32(%ecx) + fsubs 36(%ecx) + fmuls 120(%ebx) + fstps 36(%edx) + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + fsts 44(%edx) + fadds 40(%ecx) + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) + + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 
124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + + +/* Phase 6*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 7*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret +.L01: +/* Phase 8*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 9*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + 
fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) + + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret $12 + +// end of FPU stuff
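+
+// idea for the phase 4/5 TODO (untested sketch, not assembled - assumes a
+// sits in xmm2, b in xmm1 and xmm3 is free as scratch, roughly following
+// the register use above):
+// the mixed 3dnow pairs like "pfsub %mm1, %mm3 / pfsubr %mm5, %mm7" in
+// phase 5 need a-b in the low quadword and b-a in the high one; with SSE1
+// only that can be done with two subps and one shufps instead of the SSE2
+// shufpd:
+//
+//	movaps %xmm2, %xmm3        // xmm3 = a (save a copy)
+//	subps  %xmm1, %xmm2        // xmm2 = a - b   (like pfsub)
+//	subps  %xmm3, %xmm1        // xmm1 = b - a   (like pfsubr)
+//	shufps $0xE4, %xmm1, %xmm2 // low quadword: keep a-b from xmm2,
+//	                           // high quadword: take b-a from xmm1
+//
+// afterwards xmm2 could be multiplied with the costab and stored as usual;
+// this costs one shufps more than the shufpd idea but stays within SSE1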