avcodec/mips: Fix segfault in imdct36_mips_float.

'li.s' is a synthesized (pseudo) instruction; it does not work properly
when compiled with clang on mips, which caused a segfault.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Shiyou Yin 2020-07-29 18:11:01 +08:00 committed by Michael Niedermayer
parent 1563b4b4c6
commit bd4f37f2eb
4 changed files with 264 additions and 267 deletions

View File

@ -293,16 +293,17 @@ static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2],
float phi_fract0 = phi_fract[0];
float phi_fract1 = phi_fract[1];
float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
float f1, f2, f3;
float *p_delay_end = (p_delay + (len << 1));
/* merged 2 loops */
f1 = 0.65143905753106;
f2 = 0.56471812200776;
f3 = 0.48954165955695;
__asm__ volatile(
".set push \n\t"
".set noreorder \n\t"
"li.s %[ag0], 0.65143905753106 \n\t"
"li.s %[ag1], 0.56471812200776 \n\t"
"li.s %[ag2], 0.48954165955695 \n\t"
"mul.s %[ag0], %[ag0], %[g_decay_slope] \n\t"
"mul.s %[ag1], %[ag1], %[g_decay_slope] \n\t"
"mul.s %[ag2], %[ag2], %[g_decay_slope] \n\t"
@ -378,10 +379,10 @@ static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2],
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
[temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
[temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay),
[p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out),
[ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2)
[p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out)
: [phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1),
[p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope)
[p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope),
[ag0]"f"(f1), [ag1]"f"(f2), [ag2]"f"(f3)
: "memory"
);
}

View File

@ -135,11 +135,11 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float
float coeff3 = psy_fir_coeffs[7];
float coeff4 = psy_fir_coeffs[9];
float f1 = 32768.0;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"li.s $f12, 32768 \n\t"
"1: \n\t"
"lwc1 $f0, 40(%[fb]) \n\t"
"lwc1 $f1, 4(%[fb]) \n\t"
@ -203,14 +203,14 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float
"madd.s %[sum2], %[sum2], $f9, %[coeff4] \n\t"
"madd.s %[sum4], %[sum4], $f6, %[coeff4] \n\t"
"madd.s %[sum3], %[sum3], $f3, %[coeff4] \n\t"
"mul.s %[sum1], %[sum1], $f12 \n\t"
"mul.s %[sum2], %[sum2], $f12 \n\t"
"mul.s %[sum1], %[sum1], %[f1] \n\t"
"mul.s %[sum2], %[sum2], %[f1] \n\t"
"madd.s %[sum4], %[sum4], $f11, %[coeff4] \n\t"
"madd.s %[sum3], %[sum3], $f8, %[coeff4] \n\t"
"swc1 %[sum1], 0(%[hp]) \n\t"
"swc1 %[sum2], 4(%[hp]) \n\t"
"mul.s %[sum4], %[sum4], $f12 \n\t"
"mul.s %[sum3], %[sum3], $f12 \n\t"
"mul.s %[sum4], %[sum4], %[f1] \n\t"
"mul.s %[sum3], %[sum3], %[f1] \n\t"
"swc1 %[sum4], 12(%[hp]) \n\t"
"swc1 %[sum3], 8(%[hp]) \n\t"
"bne %[fb], %[fb_end], 1b \n\t"
@ -223,9 +223,9 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float
[fb]"+r"(fb), [hp]"+r"(hp)
: [coeff0]"f"(coeff0), [coeff1]"f"(coeff1),
[coeff2]"f"(coeff2), [coeff3]"f"(coeff3),
[coeff4]"f"(coeff4), [fb_end]"r"(fb_end)
[coeff4]"f"(coeff4), [fb_end]"r"(fb_end), [f1]"f"(f1)
: "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6",
"$f7", "$f8", "$f9", "$f10", "$f11", "$f12",
"$f7", "$f8", "$f9", "$f10", "$f11",
"memory"
);
}

View File

@ -71,6 +71,7 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
float temp, temp1, temp3, temp4;
FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
float f1 = 0.7071067812;
num_transforms = (21845 >> (17 - s->nbits)) | 1;
@ -148,7 +149,6 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
"swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
"lwc1 %[pom1], 16(%[tmpz]) \n\t"
"lwc1 %[pom3], 20(%[tmpz]) \n\t"
"li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
"add.s %[temp1],%[tmp1], %[tmp2] \n\t"
"sub.s %[temp], %[pom1], %[tmp8] \n\t"
"add.s %[pom2], %[pom3], %[tmp7] \n\t"
@ -159,10 +159,10 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
"add.s %[pom1], %[pom1], %[tmp8] \n\t"
"sub.s %[pom3], %[pom3], %[tmp7] \n\t"
"add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
"mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
"mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
"mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
"mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
"mul.s %[tmp5], %[f1], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
"mul.s %[tmp7], %[f1], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
"mul.s %[tmp6], %[f1], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
"mul.s %[tmp8], %[f1], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
"swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
"swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
"add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
@ -193,7 +193,7 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
[tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
[tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
: [tmpz]"r"(tmpz)
: [tmpz]"r"(tmpz), [f1]"f"(f1)
: "memory"
);
}

View File

@ -287,9 +287,16 @@ static void ff_dct32_mips_float(float *out, const float *tab)
val8 , val9 , val10, val11, val12, val13, val14, val15,
val16, val17, val18, val19, val20, val21, val22, val23,
val24, val25, val26, val27, val28, val29, val30, val31;
float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
fTmp9, fTmp10, fTmp11;
float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp8, fTmp9;
float f1, f2, f3, f4, f5, f6, f7;
f1 = 0.50241928618815570551;
f2 = 0.50060299823519630134;
f3 = 10.19000812354805681150;
f4 = 5.10114861868916385802;
f5 = 0.67480834145500574602;
f6 = 0.74453627100229844977;
f7 = 0.50979557910415916894;
/**
* instructions are scheduled to minimize pipeline stall.
*/
@ -298,149 +305,142 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"lwc1 %[fTmp2], 31*4(%[tab]) \n\t"
"lwc1 %[fTmp3], 15*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 16*4(%[tab]) \n\t"
"li.s %[fTmp7], 0.50241928618815570551 \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp10], 0.50060299823519630134 \n\t"
"li.s %[fTmp11], 10.19000812354805681150 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f2] \n\t"
"add.s %[val0], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val15], %[fTmp5], %[fTmp6] \n\t"
"lwc1 %[fTmp1], 7*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 24*4(%[tab]) \n\t"
"madd.s %[val16], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val31], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val15], %[val15], %[fTmp7] \n\t"
"madd.s %[val16], %[fTmp8], %[fTmp9], %[f3] \n\t"
"nmsub.s %[val31], %[fTmp8], %[fTmp9], %[f3] \n\t"
"mul.s %[val15], %[val15], %[f1] \n\t"
"lwc1 %[fTmp3], 8*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 23*4(%[tab]) \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val31], %[val31], %[fTmp7] \n\t"
"mul.s %[val31], %[val31], %[f1] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp7], 5.10114861868916385802 \n\t"
"li.s %[fTmp10], 0.67480834145500574602 \n\t"
"li.s %[fTmp11], 0.74453627100229844977 \n\t"
"add.s %[val7], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val8], %[fTmp5], %[fTmp6] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"li.s %[fTmp1], 0.50979557910415916894 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f5] \n\t"
"sub.s %[fTmp2], %[val0], %[val7] \n\t"
"mul.s %[val8], %[val8], %[fTmp7] \n\t"
"madd.s %[val23], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val24], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val8], %[val8], %[f4] \n\t"
"madd.s %[val23], %[fTmp8], %[fTmp9], %[f6] \n\t"
"nmsub.s %[val24], %[fTmp8], %[fTmp9], %[f6] \n\t"
"add.s %[val0], %[val0], %[val7] \n\t"
"mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val7], %[f7], %[fTmp2] \n\t"
"sub.s %[fTmp2], %[val15], %[val8] \n\t"
"add.s %[val8], %[val15], %[val8] \n\t"
"mul.s %[val24], %[val24], %[fTmp7] \n\t"
"mul.s %[val24], %[val24], %[f4] \n\t"
"sub.s %[fTmp3], %[val16], %[val23] \n\t"
"add.s %[val16], %[val16], %[val23] \n\t"
"mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val15], %[f7], %[fTmp2] \n\t"
"sub.s %[fTmp4], %[val31], %[val24] \n\t"
"mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val23], %[f7], %[fTmp3] \n\t"
"add.s %[val24], %[val31], %[val24] \n\t"
"mul.s %[val31], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val31], %[f7], %[fTmp4] \n\t"
: [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
[fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
[fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
[val0] "=f" (val0), [val7] "=f" (val7),
[val8] "=f" (val8), [val15] "=f" (val15),
[val16] "=f" (val16), [val23] "=f" (val23),
[val24] "=f" (val24), [val31] "=f" (val31)
: [tab] "r" (tab)
[fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[val0] "=&f" (val0), [val7] "=&f" (val7),
[val8] "=&f" (val8), [val15] "=&f" (val15),
[val16] "=&f" (val16), [val23] "=&f" (val23),
[val24] "=&f" (val24), [val31] "=&f" (val31)
: [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3),
[f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7)
: "memory"
);
f1 = 0.64682178335999012954;
f2 = 0.53104259108978417447;
f3 = 1.48416461631416627724;
f4 = 0.78815462345125022473;
f5 = 0.55310389603444452782;
f6 = 1.16943993343288495515;
f7 = 2.56291544774150617881;
__asm__ volatile (
"lwc1 %[fTmp1], 3*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 28*4(%[tab]) \n\t"
"lwc1 %[fTmp3], 12*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 19*4(%[tab]) \n\t"
"li.s %[fTmp7], 0.64682178335999012954 \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp10], 0.53104259108978417447 \n\t"
"li.s %[fTmp11], 1.48416461631416627724 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f2] \n\t"
"add.s %[val3], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val12], %[fTmp5], %[fTmp6] \n\t"
"lwc1 %[fTmp1], 4*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 27*4(%[tab]) \n\t"
"madd.s %[val19], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val28], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val12], %[val12], %[fTmp7] \n\t"
"madd.s %[val19], %[fTmp8], %[fTmp9], %[f3] \n\t"
"nmsub.s %[val28], %[fTmp8], %[fTmp9], %[f3] \n\t"
"mul.s %[val12], %[val12], %[f1] \n\t"
"lwc1 %[fTmp3], 11*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 20*4(%[tab]) \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val28], %[val28], %[fTmp7] \n\t"
"mul.s %[val28], %[val28], %[f1] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"li.s %[fTmp7], 0.78815462345125022473 \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp10], 0.55310389603444452782 \n\t"
"li.s %[fTmp11], 1.16943993343288495515 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f5] \n\t"
"add.s %[val4], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val11], %[fTmp5], %[fTmp6] \n\t"
"li.s %[fTmp1], 2.56291544774150617881 \n\t"
"madd.s %[val20], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val27], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val11], %[val11], %[fTmp7] \n\t"
"madd.s %[val20], %[fTmp8], %[fTmp9], %[f6] \n\t"
"nmsub.s %[val27], %[fTmp8], %[fTmp9], %[f6] \n\t"
"mul.s %[val11], %[val11], %[f4] \n\t"
"sub.s %[fTmp2], %[val3], %[val4] \n\t"
"add.s %[val3], %[val3], %[val4] \n\t"
"sub.s %[fTmp4], %[val19], %[val20] \n\t"
"mul.s %[val27], %[val27], %[fTmp7] \n\t"
"mul.s %[val27], %[val27], %[f4] \n\t"
"sub.s %[fTmp3], %[val12], %[val11] \n\t"
"mul.s %[val4], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val4], %[f7], %[fTmp2] \n\t"
"add.s %[val11], %[val12], %[val11] \n\t"
"add.s %[val19], %[val19], %[val20] \n\t"
"mul.s %[val20], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val12], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val20], %[f7], %[fTmp4] \n\t"
"mul.s %[val12], %[f7], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val28], %[val27] \n\t"
"add.s %[val27], %[val28], %[val27] \n\t"
"mul.s %[val28], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val28], %[f7], %[fTmp2] \n\t"
: [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
[fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
[fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
[val3] "=f" (val3), [val4] "=f" (val4),
[val11] "=f" (val11), [val12] "=f" (val12),
[val19] "=f" (val19), [val20] "=f" (val20),
[val27] "=f" (val27), [val28] "=f" (val28)
: [tab] "r" (tab)
[fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[val3] "=&f" (val3), [val4] "=&f" (val4),
[val11] "=&f" (val11), [val12] "=&f" (val12),
[val19] "=&f" (val19), [val20] "=&f" (val20),
[val27] "=&f" (val27), [val28] "=&f" (val28)
: [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3),
[f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7)
: "memory"
);
f1 = 0.54119610014619698439;
__asm__ volatile (
"li.s %[fTmp1], 0.54119610014619698439 \n\t"
"sub.s %[fTmp2], %[val0], %[val3] \n\t"
"add.s %[val0], %[val0], %[val3] \n\t"
"sub.s %[fTmp3], %[val7], %[val4] \n\t"
"add.s %[val4], %[val7], %[val4] \n\t"
"sub.s %[fTmp4], %[val8], %[val11] \n\t"
"mul.s %[val3], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val3], %[f1], %[fTmp2] \n\t"
"add.s %[val8], %[val8], %[val11] \n\t"
"mul.s %[val7], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val7], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val15], %[val12] \n\t"
"mul.s %[val11], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val11], %[f1], %[fTmp4] \n\t"
"add.s %[val12], %[val15], %[val12] \n\t"
"mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val15], %[f1], %[fTmp2] \n\t"
: [val0] "+f" (val0), [val3] "+f" (val3),
[val4] "+f" (val4), [val7] "+f" (val7),
[val8] "+f" (val8), [val11] "+f" (val11),
[val12] "+f" (val12), [val15] "+f" (val15),
[fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
: [val0] "+&f" (val0), [val3] "+&f" (val3),
[val4] "+&f" (val4), [val7] "+&f" (val7),
[val8] "+&f" (val8), [val11] "+&f" (val11),
[val12] "+&f" (val12), [val15] "+&f" (val15),
[fTmp2] "=&f" (fTmp2),
[fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
:
: [f1] "f" (f1)
);
__asm__ volatile (
@ -449,169 +449,169 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"sub.s %[fTmp3], %[val23], %[val20] \n\t"
"add.s %[val20], %[val23], %[val20] \n\t"
"sub.s %[fTmp4], %[val24], %[val27] \n\t"
"mul.s %[val19], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val19], %[f1], %[fTmp2] \n\t"
"add.s %[val24], %[val24], %[val27] \n\t"
"mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val23], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val31], %[val28] \n\t"
"mul.s %[val27], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val27], %[f1], %[fTmp4] \n\t"
"add.s %[val28], %[val31], %[val28] \n\t"
"mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val31], %[f1], %[fTmp2] \n\t"
: [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
[val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
[val28] "+f" (val28), [val31] "+f" (val31)
: [fTmp1] "f" (fTmp1)
[val16] "+&f" (val16), [val19] "+&f" (val19), [val20] "+&f" (val20),
[val23] "+&f" (val23), [val24] "+&f" (val24), [val27] "+&f" (val27),
[val28] "+&f" (val28), [val31] "+&f" (val31)
: [f1] "f" (f1)
);
f1 = 0.52249861493968888062;
f2 = 0.50547095989754365998;
f3 = 3.40760841846871878570;
f4 = 1.72244709823833392782;
f5 = 0.62250412303566481615;
f6 = 0.83934964541552703873;
f7 = 0.60134488693504528054;
__asm__ volatile (
"lwc1 %[fTmp1], 1*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 30*4(%[tab]) \n\t"
"lwc1 %[fTmp3], 14*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 17*4(%[tab]) \n\t"
"li.s %[fTmp7], 0.52249861493968888062 \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp10], 0.50547095989754365998 \n\t"
"li.s %[fTmp11], 3.40760841846871878570 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f2] \n\t"
"add.s %[val1], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val14], %[fTmp5], %[fTmp6] \n\t"
"lwc1 %[fTmp1], 6*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 25*4(%[tab]) \n\t"
"madd.s %[val17], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val30], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val14], %[val14], %[fTmp7] \n\t"
"madd.s %[val17], %[fTmp8], %[fTmp9], %[f3] \n\t"
"nmsub.s %[val30], %[fTmp8], %[fTmp9], %[f3] \n\t"
"mul.s %[val14], %[val14], %[f1] \n\t"
"lwc1 %[fTmp3], 9*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 22*4(%[tab]) \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val30], %[val30], %[fTmp7] \n\t"
"mul.s %[val30], %[val30], %[f1] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp7], 1.72244709823833392782 \n\t"
"li.s %[fTmp10], 0.62250412303566481615 \n\t"
"li.s %[fTmp11], 0.83934964541552703873 \n\t"
"add.s %[val6], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val9], %[fTmp5], %[fTmp6] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"li.s %[fTmp1], 0.60134488693504528054 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f5] \n\t"
"sub.s %[fTmp2], %[val1], %[val6] \n\t"
"add.s %[val1], %[val1], %[val6] \n\t"
"mul.s %[val9], %[val9], %[fTmp7] \n\t"
"madd.s %[val22], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val25], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val6], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val9], %[val9], %[f4] \n\t"
"madd.s %[val22], %[fTmp8], %[fTmp9], %[f6] \n\t"
"nmsub.s %[val25], %[fTmp8], %[fTmp9], %[f6] \n\t"
"mul.s %[val6], %[f7], %[fTmp2] \n\t"
"sub.s %[fTmp2], %[val14], %[val9] \n\t"
"add.s %[val9], %[val14], %[val9] \n\t"
"mul.s %[val25], %[val25], %[fTmp7] \n\t"
"mul.s %[val25], %[val25], %[f4] \n\t"
"sub.s %[fTmp3], %[val17], %[val22] \n\t"
"add.s %[val17], %[val17], %[val22] \n\t"
"mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val14], %[f7], %[fTmp2] \n\t"
"sub.s %[fTmp2], %[val30], %[val25] \n\t"
"mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val22], %[f7], %[fTmp3] \n\t"
"add.s %[val25], %[val30], %[val25] \n\t"
"mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val30], %[f7], %[fTmp2] \n\t"
: [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
[fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
[fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
[val1] "=f" (val1), [val6] "=f" (val6),
[val9] "=f" (val9), [val14] "=f" (val14),
[val17] "=f" (val17), [val22] "=f" (val22),
[val25] "=f" (val25), [val30] "=f" (val30)
: [tab] "r" (tab)
[fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[val1] "=&f" (val1), [val6] "=&f" (val6),
[val9] "=&f" (val9), [val14] "=&f" (val14),
[val17] "=&f" (val17), [val22] "=&f" (val22),
[val25] "=&f" (val25), [val30] "=&f" (val30)
: [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3),
[f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7)
: "memory"
);
f1 = 0.56694403481635770368;
f2 = 0.51544730992262454697;
f3 = 2.05778100995341155085;
f4 = 1.06067768599034747134;
f5 = 0.58293496820613387367;
f6 = 0.97256823786196069369;
f7 = 0.89997622313641570463;
__asm__ volatile (
"lwc1 %[fTmp1], 2*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 29*4(%[tab]) \n\t"
"lwc1 %[fTmp3], 13*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 18*4(%[tab]) \n\t"
"li.s %[fTmp7], 0.56694403481635770368 \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp10], 0.51544730992262454697 \n\t"
"li.s %[fTmp11], 2.05778100995341155085 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f2] \n\t"
"add.s %[val2], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val13], %[fTmp5], %[fTmp6] \n\t"
"lwc1 %[fTmp1], 5*4(%[tab]) \n\t"
"lwc1 %[fTmp2], 26*4(%[tab]) \n\t"
"madd.s %[val18], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val29], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val13], %[val13], %[fTmp7] \n\t"
"madd.s %[val18], %[fTmp8], %[fTmp9], %[f3] \n\t"
"nmsub.s %[val29], %[fTmp8], %[fTmp9], %[f3] \n\t"
"mul.s %[val13], %[val13], %[f1] \n\t"
"lwc1 %[fTmp3], 10*4(%[tab]) \n\t"
"lwc1 %[fTmp4], 21*4(%[tab]) \n\t"
"mul.s %[val29], %[val29], %[fTmp7] \n\t"
"mul.s %[val29], %[val29], %[f1] \n\t"
"add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
"sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
"add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
"sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
"li.s %[fTmp7], 1.06067768599034747134 \n\t"
"li.s %[fTmp10], 0.58293496820613387367 \n\t"
"li.s %[fTmp11], 0.97256823786196069369 \n\t"
"add.s %[val5], %[fTmp5], %[fTmp6] \n\t"
"sub.s %[val10], %[fTmp5], %[fTmp6] \n\t"
"mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
"li.s %[fTmp1], 0.89997622313641570463 \n\t"
"mul.s %[fTmp8], %[fTmp8], %[f5] \n\t"
"sub.s %[fTmp2], %[val2], %[val5] \n\t"
"mul.s %[val10], %[val10], %[fTmp7] \n\t"
"madd.s %[val21], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"nmsub.s %[val26], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
"mul.s %[val10], %[val10], %[f4] \n\t"
"madd.s %[val21], %[fTmp8], %[fTmp9], %[f6] \n\t"
"nmsub.s %[val26], %[fTmp8], %[fTmp9], %[f6] \n\t"
"add.s %[val2], %[val2], %[val5] \n\t"
"mul.s %[val5], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val5], %[f7], %[fTmp2] \n\t"
"sub.s %[fTmp3], %[val13], %[val10] \n\t"
"add.s %[val10], %[val13], %[val10] \n\t"
"mul.s %[val26], %[val26], %[fTmp7] \n\t"
"mul.s %[val26], %[val26], %[f4] \n\t"
"sub.s %[fTmp4], %[val18], %[val21] \n\t"
"add.s %[val18], %[val18], %[val21] \n\t"
"mul.s %[val13], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val13], %[f7], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val29], %[val26] \n\t"
"add.s %[val26], %[val29], %[val26] \n\t"
"mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val29], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val21], %[f7], %[fTmp4] \n\t"
"mul.s %[val29], %[f7], %[fTmp2] \n\t"
: [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
[fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
[fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
[val2] "=f" (val2), [val5] "=f" (val5),
[val10] "=f" (val10), [val13] "=f" (val13),
[val18] "=f" (val18), [val21] "=f" (val21),
[val26] "=f" (val26), [val29] "=f" (val29)
: [tab] "r" (tab)
[fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
[val2] "=&f" (val2), [val5] "=&f" (val5),
[val10] "=&f" (val10), [val13] "=&f" (val13),
[val18] "=&f" (val18), [val21] "=&f" (val21),
[val26] "=&f" (val26), [val29] "=&f" (val29)
: [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3),
[f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7)
: "memory"
);
f1 = 1.30656296487637652785;
__asm__ volatile (
"li.s %[fTmp1], 1.30656296487637652785 \n\t"
"sub.s %[fTmp2], %[val1], %[val2] \n\t"
"add.s %[val1], %[val1], %[val2] \n\t"
"sub.s %[fTmp3], %[val6], %[val5] \n\t"
"add.s %[val5], %[val6], %[val5] \n\t"
"sub.s %[fTmp4], %[val9], %[val10] \n\t"
"mul.s %[val2], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val2], %[f1], %[fTmp2] \n\t"
"add.s %[val9], %[val9], %[val10] \n\t"
"mul.s %[val6], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val6], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val14], %[val13] \n\t"
"mul.s %[val10], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val10], %[f1], %[fTmp4] \n\t"
"add.s %[val13], %[val14], %[val13] \n\t"
"mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val14], %[f1], %[fTmp2] \n\t"
: [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
: [fTmp2] "=&f" (fTmp2),
[fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val1] "+f" (val1), [val2] "+f" (val2),
[val5] "+f" (val5), [val6] "+f" (val6),
[val9] "+f" (val9), [val10] "+f" (val10),
[val13] "+f" (val13), [val14] "+f" (val14)
:
[val1] "+&f" (val1), [val2] "+&f" (val2),
[val5] "+&f" (val5), [val6] "+&f" (val6),
[val9] "+&f" (val9), [val10] "+&f" (val10),
[val13] "+&f" (val13), [val14] "+&f" (val14)
: [f1]"f"(f1)
);
__asm__ volatile (
@ -620,39 +620,39 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"sub.s %[fTmp3], %[val22], %[val21] \n\t"
"add.s %[val21], %[val22], %[val21] \n\t"
"sub.s %[fTmp4], %[val25], %[val26] \n\t"
"mul.s %[val18], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val18], %[f1], %[fTmp2] \n\t"
"add.s %[val25], %[val25], %[val26] \n\t"
"mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val22], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val30], %[val29] \n\t"
"mul.s %[val26], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val26], %[f1], %[fTmp4] \n\t"
"add.s %[val29], %[val30], %[val29] \n\t"
"mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val30], %[f1], %[fTmp2] \n\t"
: [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
[val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
[val29] "+f" (val29), [val30] "+f" (val30)
: [fTmp1] "f" (fTmp1)
[val17] "+&f" (val17), [val18] "+&f" (val18), [val21] "+&f" (val21),
[val22] "+&f" (val22), [val25] "+&f" (val25), [val26] "+&f" (val26),
[val29] "+&f" (val29), [val30] "+&f" (val30)
: [f1] "f" (f1)
);
f1 = 0.70710678118654752439;
__asm__ volatile (
"li.s %[fTmp1], 0.70710678118654752439 \n\t"
"sub.s %[fTmp2], %[val0], %[val1] \n\t"
"add.s %[val0], %[val0], %[val1] \n\t"
"sub.s %[fTmp3], %[val3], %[val2] \n\t"
"add.s %[val2], %[val3], %[val2] \n\t"
"sub.s %[fTmp4], %[val4], %[val5] \n\t"
"mul.s %[val1], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val1], %[f1], %[fTmp2] \n\t"
"swc1 %[val0], 0(%[out]) \n\t"
"mul.s %[val3], %[fTmp3], %[fTmp1] \n\t"
"mul.s %[val3], %[fTmp3], %[f1] \n\t"
"add.s %[val4], %[val4], %[val5] \n\t"
"mul.s %[val5], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val5], %[f1], %[fTmp4] \n\t"
"swc1 %[val1], 16*4(%[out]) \n\t"
"sub.s %[fTmp2], %[val7], %[val6] \n\t"
"add.s %[val2], %[val2], %[val3] \n\t"
"swc1 %[val3], 24*4(%[out]) \n\t"
"add.s %[val6], %[val7], %[val6] \n\t"
"mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val7], %[f1], %[fTmp2] \n\t"
"swc1 %[val2], 8*4(%[out]) \n\t"
"add.s %[val6], %[val6], %[val7] \n\t"
"swc1 %[val7], 28*4(%[out]) \n\t"
@ -663,13 +663,13 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"swc1 %[val5], 20*4(%[out]) \n\t"
"swc1 %[val6], 12*4(%[out]) \n\t"
: [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
: [fTmp2] "=&f" (fTmp2),
[fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val0] "+f" (val0), [val1] "+f" (val1),
[val2] "+f" (val2), [val3] "+f" (val3),
[val4] "+f" (val4), [val5] "+f" (val5),
[val6] "+f" (val6), [val7] "+f" (val7)
: [out] "r" (out)
[val0] "+&f" (val0), [val1] "+&f" (val1),
[val2] "+&f" (val2), [val3] "+&f" (val3),
[val4] "+&f" (val4), [val5] "+&f" (val5),
[val6] "+&f" (val6), [val7] "+&f" (val7)
: [out] "r" (out), [f1]"f"(f1)
);
__asm__ volatile (
@ -678,14 +678,14 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"sub.s %[fTmp3], %[val11], %[val10] \n\t"
"add.s %[val10], %[val11], %[val10] \n\t"
"sub.s %[fTmp4], %[val12], %[val13] \n\t"
"mul.s %[val9], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val9], %[f1], %[fTmp2] \n\t"
"add.s %[val12], %[val12], %[val13] \n\t"
"mul.s %[val11], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val11], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val15], %[val14] \n\t"
"mul.s %[val13], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val13], %[f1], %[fTmp4] \n\t"
"add.s %[val14], %[val15], %[val14] \n\t"
"add.s %[val10], %[val10], %[val11] \n\t"
"mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val15], %[f1], %[fTmp2] \n\t"
"add.s %[val14], %[val14], %[val15] \n\t"
"add.s %[val12], %[val12], %[val14] \n\t"
"add.s %[val14], %[val14], %[val13] \n\t"
@ -707,10 +707,10 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"swc1 %[val15], 30*4(%[out]) \n\t"
: [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val8] "+f" (val8), [val9] "+f" (val9), [val10] "+f" (val10),
[val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
[val14] "+f" (val14), [val15] "+f" (val15)
: [fTmp1] "f" (fTmp1), [out] "r" (out)
[val8] "+&f" (val8), [val9] "+&f" (val9), [val10] "+&f" (val10),
[val11] "+&f" (val11), [val12] "+&f" (val12), [val13] "+&f" (val13),
[val14] "+&f" (val14), [val15] "+&f" (val15)
: [f1] "f" (f1), [out] "r" (out)
);
__asm__ volatile (
@ -719,24 +719,24 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"sub.s %[fTmp3], %[val19], %[val18] \n\t"
"add.s %[val18], %[val19], %[val18] \n\t"
"sub.s %[fTmp4], %[val20], %[val21] \n\t"
"mul.s %[val17], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val17], %[f1], %[fTmp2] \n\t"
"add.s %[val20], %[val20], %[val21] \n\t"
"mul.s %[val19], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val19], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val23], %[val22] \n\t"
"mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val21], %[f1], %[fTmp4] \n\t"
"add.s %[val22], %[val23], %[val22] \n\t"
"add.s %[val18], %[val18], %[val19] \n\t"
"mul.s %[val23], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val23], %[f1], %[fTmp2] \n\t"
"add.s %[val22], %[val22], %[val23] \n\t"
"add.s %[val20], %[val20], %[val22] \n\t"
"add.s %[val22], %[val22], %[val21] \n\t"
"add.s %[val21], %[val21], %[val23] \n\t"
: [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
[val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
[val22] "+f" (val22), [val23] "+f" (val23)
: [fTmp1] "f" (fTmp1)
[val16] "+&f" (val16), [val17] "+&f" (val17), [val18] "+&f" (val18),
[val19] "+&f" (val19), [val20] "+&f" (val20), [val21] "+&f" (val21),
[val22] "+&f" (val22), [val23] "+&f" (val23)
: [f1] "f" (f1)
);
__asm__ volatile (
@ -745,14 +745,14 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"sub.s %[fTmp3], %[val27], %[val26] \n\t"
"add.s %[val26], %[val27], %[val26] \n\t"
"sub.s %[fTmp4], %[val28], %[val29] \n\t"
"mul.s %[val25], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val25], %[f1], %[fTmp2] \n\t"
"add.s %[val28], %[val28], %[val29] \n\t"
"mul.s %[val27], %[fTmp1], %[fTmp3] \n\t"
"mul.s %[val27], %[f1], %[fTmp3] \n\t"
"sub.s %[fTmp2], %[val31], %[val30] \n\t"
"mul.s %[val29], %[fTmp1], %[fTmp4] \n\t"
"mul.s %[val29], %[f1], %[fTmp4] \n\t"
"add.s %[val30], %[val31], %[val30] \n\t"
"add.s %[val26], %[val26], %[val27] \n\t"
"mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
"mul.s %[val31], %[f1], %[fTmp2] \n\t"
"add.s %[val30], %[val30], %[val31] \n\t"
"add.s %[val28], %[val28], %[val30] \n\t"
"add.s %[val30], %[val30], %[val29] \n\t"
@ -766,10 +766,10 @@ static void ff_dct32_mips_float(float *out, const float *tab)
"add.s %[val27], %[val27], %[val31] \n\t"
: [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
[val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
[val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
[val30] "+f" (val30), [val31] "+f" (val31)
: [fTmp1] "f" (fTmp1)
[val24] "+&f" (val24), [val25] "+&f" (val25), [val26] "+&f" (val26),
[val27] "+&f" (val27), [val28] "+&f" (val28), [val29] "+&f" (val29),
[val30] "+&f" (val30), [val31] "+&f" (val31)
: [f1] "f" (f1)
);
out[ 1] = val16 + val24;
@ -797,7 +797,7 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
/* temporary variables */
float in1, in2, in3, in4, in5, in6;
float out1, out2, out3, out4, out5;
float c1, c2, c3, c4, c5, c6, c7, c8, c9;
float f1, f2, f3, f4, f5, f6, f7, f8, f9;
/**
* all loops are unrolled totally, and instructions are scheduled to
@ -881,33 +881,36 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
);
/* loop 3 */
f1 = 0.5;
f2 = 0.93969262078590838405;
f3 = -0.76604444311897803520;
f4 = -0.17364817766693034885;
f5 = -0.86602540378443864676;
f6 = 0.98480775301220805936;
f7 = -0.34202014332566873304;
f8 = 0.86602540378443864676;
f9 = -0.64278760968653932632;
__asm__ volatile (
"li.s %[c1], 0.5 \t\n"
"lwc1 %[in1], 8*4(%[in]) \t\n"
"lwc1 %[in2], 16*4(%[in]) \t\n"
"lwc1 %[in3], 4*4(%[in]) \t\n"
"lwc1 %[in4], 0(%[in]) \t\n"
"lwc1 %[in5], 12*4(%[in]) \t\n"
"li.s %[c2], 0.93969262078590838405 \t\n"
"add.s %[t2], %[in1], %[in2] \t\n"
"add.s %[t0], %[in1], %[in3] \t\n"
"li.s %[c3], -0.76604444311897803520 \t\n"
"madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
"madd.s %[t3], %[in4], %[in5], %[f1] \t\n"
"sub.s %[t1], %[in4], %[in5] \t\n"
"sub.s %[t2], %[t2], %[in3] \t\n"
"mul.s %[t0], %[t0], %[c2] \t\n"
"li.s %[c4], -0.17364817766693034885 \t\n"
"li.s %[c5], -0.86602540378443864676 \t\n"
"li.s %[c6], 0.98480775301220805936 \t\n"
"nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
"mul.s %[t0], %[t0], %[f2] \t\n"
"nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n"
"add.s %[out2], %[t1], %[t2] \t\n"
"add.s %[t2], %[in2], %[in3] \t\n"
"sub.s %[t1], %[in1], %[in2] \t\n"
"sub.s %[out3], %[t3], %[t0] \t\n"
"swc1 %[out1], 6*4(%[tmp]) \t\n"
"swc1 %[out2], 16*4(%[tmp]) \t\n"
"mul.s %[t2], %[t2], %[c3] \t\n"
"mul.s %[t1], %[t1], %[c4] \t\n"
"mul.s %[t2], %[t2], %[f3] \t\n"
"mul.s %[t1], %[t1], %[f4] \t\n"
"add.s %[out1], %[t3], %[t0] \t\n"
"lwc1 %[in1], 10*4(%[in]) \t\n"
"lwc1 %[in2], 14*4(%[in]) \t\n"
@ -923,19 +926,16 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"add.s %[t2], %[in1], %[in3] \t\n"
"sub.s %[t3], %[in1], %[in2] \t\n"
"swc1 %[out2], 14*4(%[tmp]) \t\n"
"li.s %[c7], -0.34202014332566873304 \t\n"
"sub.s %[out1], %[out1], %[in3] \t\n"
"mul.s %[t2], %[t2], %[c6] \t\n"
"mul.s %[t3], %[t3], %[c7] \t\n"
"li.s %[c8], 0.86602540378443864676 \t\n"
"mul.s %[t0], %[in4], %[c8] \t\n"
"mul.s %[out1], %[out1], %[c5] \t\n"
"mul.s %[t2], %[t2], %[f6] \t\n"
"mul.s %[t3], %[t3], %[f7] \t\n"
"mul.s %[t0], %[in4], %[f8] \t\n"
"mul.s %[out1], %[out1], %[f5] \t\n"
"add.s %[t1], %[in2], %[in3] \t\n"
"li.s %[c9], -0.64278760968653932632 \t\n"
"add.s %[out2], %[t2], %[t3] \t\n"
"lwc1 %[in1], 9*4(%[in]) \t\n"
"swc1 %[out1], 4*4(%[tmp]) \t\n"
"mul.s %[t1], %[t1], %[c9] \t\n"
"mul.s %[t1], %[t1], %[f9] \t\n"
"lwc1 %[in2], 17*4(%[in]) \t\n"
"add.s %[out2], %[out2], %[t0] \t\n"
"lwc1 %[in3], 5*4(%[in]) \t\n"
@ -948,21 +948,21 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"sub.s %[out3], %[out3], %[t0] \t\n"
"sub.s %[out1], %[out1], %[t0] \t\n"
"add.s %[t0], %[in1], %[in3] \t\n"
"madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
"madd.s %[t3], %[in4], %[in5], %[f1] \t\n"
"sub.s %[t2], %[t2], %[in3] \t\n"
"swc1 %[out3], 12*4(%[tmp]) \t\n"
"swc1 %[out1], 8*4(%[tmp]) \t\n"
"sub.s %[t1], %[in4], %[in5] \t\n"
"mul.s %[t0], %[t0], %[c2] \t\n"
"nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
"mul.s %[t0], %[t0], %[f2] \t\n"
"nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n"
"add.s %[out2], %[t1], %[t2] \t\n"
"add.s %[t2], %[in2], %[in3] \t\n"
"sub.s %[t1], %[in1], %[in2] \t\n"
"sub.s %[out3], %[t3], %[t0] \t\n"
"swc1 %[out1], 7*4(%[tmp]) \t\n"
"swc1 %[out2], 17*4(%[tmp]) \t\n"
"mul.s %[t2], %[t2], %[c3] \t\n"
"mul.s %[t1], %[t1], %[c4] \t\n"
"mul.s %[t2], %[t2], %[f3] \t\n"
"mul.s %[t1], %[t1], %[f4] \t\n"
"add.s %[out1], %[t3], %[t0] \t\n"
"lwc1 %[in1], 11*4(%[in]) \t\n"
"lwc1 %[in2], 15*4(%[in]) \t\n"
@ -978,14 +978,14 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"add.s %[t2], %[in1], %[in3] \t\n"
"sub.s %[t3], %[in1], %[in2] \t\n"
"swc1 %[out2], 15*4(%[tmp]) \t\n"
"mul.s %[t0], %[in4], %[c8] \t\n"
"mul.s %[t0], %[in4], %[f8] \t\n"
"sub.s %[out3], %[out3], %[in3] \t\n"
"mul.s %[t2], %[t2], %[c6] \t\n"
"mul.s %[t3], %[t3], %[c7] \t\n"
"mul.s %[t2], %[t2], %[f6] \t\n"
"mul.s %[t3], %[t3], %[f7] \t\n"
"add.s %[t1], %[in2], %[in3] \t\n"
"mul.s %[out3], %[out3], %[c5] \t\n"
"mul.s %[out3], %[out3], %[f5] \t\n"
"add.s %[out1], %[t2], %[t3] \t\n"
"mul.s %[t1], %[t1], %[c9] \t\n"
"mul.s %[t1], %[t1], %[f9] \t\n"
"swc1 %[out3], 5*4(%[tmp]) \t\n"
"add.s %[out1], %[out1], %[t0] \t\n"
"add.s %[out2], %[t2], %[t1] \t\n"
@ -1000,26 +1000,29 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
[t2] "=&f" (t2), [t3] "=&f" (t3),
[in1] "=&f" (in1), [in2] "=&f" (in2),
[in3] "=&f" (in3), [in4] "=&f" (in4),
[in5] "=&f" (in5),
[out1] "=&f" (out1), [out2] "=&f" (out2),
[out3] "=&f" (out3),
[c1] "=&f" (c1), [c2] "=&f" (c2),
[c3] "=&f" (c3), [c4] "=&f" (c4),
[c5] "=&f" (c5), [c6] "=&f" (c6),
[c7] "=&f" (c7), [c8] "=&f" (c8),
[c9] "=&f" (c9)
: [in] "r" (in), [tmp] "r" (tmp)
[in5] "=&f" (in5), [out1] "=&f" (out1),
[out2] "=&f" (out2), [out3] "=&f" (out3)
: [in] "r" (in), [tmp] "r" (tmp), [f1]"f"(f1), [f2]"f"(f2),
[f3]"f"(f3), [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6),
[f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9)
: "memory"
);
/* loop 4 */
f1 = 0.50190991877167369479;
f2 = 5.73685662283492756461;
f3 = 0.51763809020504152469;
f4 = 1.93185165257813657349;
f5 = 0.55168895948124587824;
f6 = 1.18310079157624925896;
f7 = 0.61038729438072803416;
f8 = 0.87172339781054900991;
f9 = 0.70710678118654752439;
__asm__ volatile (
"lwc1 %[in1], 2*4(%[tmp]) \t\n"
"lwc1 %[in2], 0(%[tmp]) \t\n"
"lwc1 %[in3], 3*4(%[tmp]) \t\n"
"lwc1 %[in4], 1*4(%[tmp]) \t\n"
"li.s %[c1], 0.50190991877167369479 \t\n"
"li.s %[c2], 5.73685662283492756461 \t\n"
"add.s %[s0], %[in1], %[in2] \t\n"
"sub.s %[s2], %[in1], %[in2] \t\n"
"add.s %[s1], %[in3], %[in4] \t\n"
@ -1027,15 +1030,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"lwc1 %[in1], 9*4(%[win]) \t\n"
"lwc1 %[in2], 4*9*4(%[buf]) \t\n"
"lwc1 %[in3], 8*4(%[win]) \t\n"
"mul.s %[s1], %[s1], %[c1] \t\n"
"mul.s %[s3], %[s3], %[c2] \t\n"
"mul.s %[s1], %[s1], %[f1] \t\n"
"mul.s %[s3], %[s3], %[f2] \t\n"
"lwc1 %[in4], 4*8*4(%[buf]) \t\n"
"lwc1 %[in5], 29*4(%[win]) \t\n"
"lwc1 %[in6], 28*4(%[win]) \t\n"
"add.s %[t0], %[s0], %[s1] \t\n"
"sub.s %[t1], %[s0], %[s1] \t\n"
"li.s %[c1], 0.51763809020504152469 \t\n"
"li.s %[c2], 1.93185165257813657349 \t\n"
"mul.s %[out3], %[in5], %[t0] \t\n"
"madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
"madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
@ -1071,14 +1072,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"lwc1 %[in1], 10*4(%[win]) \t\n"
"lwc1 %[in2], 4*10*4(%[buf]) \t\n"
"lwc1 %[in3], 7*4(%[win]) \t\n"
"mul.s %[s1], %[s1], %[c1] \t\n"
"mul.s %[s3], %[s3], %[c2] \t\n"
"mul.s %[s1], %[s1], %[f3] \t\n"
"mul.s %[s3], %[s3], %[f4] \t\n"
"add.s %[t0], %[s0], %[s1] \t\n"
"sub.s %[t1], %[s0], %[s1] \t\n"
"lwc1 %[in4], 4*7*4(%[buf]) \t\n"
"lwc1 %[in5], 30*4(%[win]) \t\n"
"lwc1 %[in6], 27*4(%[win]) \t\n"
"li.s %[c1], 0.55168895948124587824 \t\n"
"madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
"madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
"mul.s %[out3], %[t0], %[in5] \t\n"
@ -1105,7 +1105,6 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"swc1 %[out2], 32*4(%[out]) \t\n"
"swc1 %[out3], 4*16*4(%[buf]) \t\n"
"swc1 %[out4], 4*1*4(%[buf]) \t\n"
"li.s %[c2], 1.18310079157624925896 \t\n"
"add.s %[s0], %[in1], %[in2] \t\n"
"sub.s %[s2], %[in1], %[in2] \t\n"
"lwc1 %[in3], 11*4(%[tmp]) \t\n"
@ -1115,8 +1114,8 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"lwc1 %[in1], 11*4(%[win]) \t\n"
"lwc1 %[in2], 4*11*4(%[buf]) \t\n"
"lwc1 %[in3], 6*4(%[win]) \t\n"
"mul.s %[s1], %[s1], %[c1] \t\n"
"mul.s %[s3], %[s3], %[c2] \t\n"
"mul.s %[s1], %[s1], %[f5] \t\n"
"mul.s %[s3], %[s3], %[f6] \t\n"
"lwc1 %[in4], 4*6*4(%[buf]) \t\n"
"lwc1 %[in5], 31*4(%[win]) \t\n"
"lwc1 %[in6], 26*4(%[win]) \t\n"
@ -1152,15 +1151,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"add.s %[s0], %[in1], %[in2] \t\n"
"sub.s %[s2], %[in1], %[in2] \t\n"
"lwc1 %[in4], 13*4(%[tmp]) \t\n"
"li.s %[c1], 0.61038729438072803416 \t\n"
"li.s %[c2], 0.87172339781054900991 \t\n"
"add.s %[s1], %[in3], %[in4] \t\n"
"sub.s %[s3], %[in3], %[in4] \t\n"
"lwc1 %[in1], 12*4(%[win]) \t\n"
"lwc1 %[in2], 4*12*4(%[buf]) \t\n"
"lwc1 %[in3], 5*4(%[win]) \t\n"
"mul.s %[s1], %[s1], %[c1] \t\n"
"mul.s %[s3], %[s3], %[c2] \t\n"
"mul.s %[s1], %[s1], %[f7] \t\n"
"mul.s %[s3], %[s3], %[f8] \t\n"
"lwc1 %[in4], 4*5*4(%[buf]) \t\n"
"lwc1 %[in5], 32*4(%[win]) \t\n"
"lwc1 %[in6], 25*4(%[win]) \t\n"
@ -1168,7 +1165,6 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"sub.s %[t1], %[s0], %[s1] \t\n"
"lwc1 %[s0], 16*4(%[tmp]) \t\n"
"lwc1 %[s1], 17*4(%[tmp]) \t\n"
"li.s %[c1], 0.70710678118654752439 \t\n"
"mul.s %[out3], %[t0], %[in5] \t\n"
"madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
"madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
@ -1186,7 +1182,7 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"lwc1 %[in5], 34*4(%[win]) \t\n"
"lwc1 %[in6], 23*4(%[win]) \t\n"
"madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
"mul.s %[s1], %[s1], %[c1] \t\n"
"mul.s %[s1], %[s1], %[f9] \t\n"
"madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
"mul.s %[out3], %[in5], %[t0] \t\n"
"mul.s %[out4], %[in6], %[t0] \t\n"
@ -1211,18 +1207,18 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
"swc1 %[out3], 4*13*4(%[buf]) \t\n"
"swc1 %[out4], 4*4*4(%[buf]) \t\n"
: [c1] "=&f" (c1), [c2] "=&f" (c2),
[in1] "=&f" (in1), [in2] "=&f" (in2),
: [in1] "=&f" (in1), [in2] "=&f" (in2),
[in3] "=&f" (in3), [in4] "=&f" (in4),
[in5] "=&f" (in5), [in6] "=&f" (in6),
[out1] "=&f" (out1), [out2] "=&f" (out2),
[out3] "=&f" (out3), [out4] "=&f" (out4),
[t0] "=&f" (t0), [t1] "=&f" (t1),
[t2] "=&f" (t2), [t3] "=&f" (t3),
[s0] "=&f" (s0), [s1] "=&f" (s1),
[s2] "=&f" (s2), [s3] "=&f" (s3)
: [tmp] "r" (tmp), [win] "r" (win),
[buf] "r" (buf), [out] "r" (out)
[buf] "r" (buf), [out] "r" (out),
[f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), [f4]"f"(f4),
[f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9)
: "memory"
);
}