From bd4f37f2ebf5870083c2343cbed6846aee63e6a5 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 29 Jul 2020 18:11:01 +0800 Subject: [PATCH] avcodec/mips: Fix segfault in imdct36_mips_float. 'li.s' is a synthesized instruction; it does not work properly when compiled with clang on MIPS, and a segfault occurred. Signed-off-by: Michael Niedermayer --- libavcodec/mips/aacpsdsp_mips.c | 13 +- libavcodec/mips/aacpsy_mips.h | 14 +- libavcodec/mips/fft_mips.c | 12 +- libavcodec/mips/mpegaudiodsp_mips_float.c | 492 +++++++++++----------- 4 files changed, 264 insertions(+), 267 deletions(-) diff --git a/libavcodec/mips/aacpsdsp_mips.c b/libavcodec/mips/aacpsdsp_mips.c index ef47e31a9e..f63541330d 100644 --- a/libavcodec/mips/aacpsdsp_mips.c +++ b/libavcodec/mips/aacpsdsp_mips.c @@ -293,16 +293,17 @@ static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2], float phi_fract0 = phi_fract[0]; float phi_fract1 = phi_fract[1]; float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; + float f1, f2, f3; float *p_delay_end = (p_delay + (len << 1)); /* merged 2 loops */ + f1 = 0.65143905753106; + f2 = 0.56471812200776; + f3 = 0.48954165955695; __asm__ volatile( ".set push \n\t" ".set noreorder \n\t" - "li.s %[ag0], 0.65143905753106 \n\t" - "li.s %[ag1], 0.56471812200776 \n\t" - "li.s %[ag2], 0.48954165955695 \n\t" "mul.s %[ag0], %[ag0], %[g_decay_slope] \n\t" "mul.s %[ag1], %[ag1], %[g_decay_slope] \n\t" "mul.s %[ag2], %[ag2], %[g_decay_slope] \n\t" @@ -378,10 +379,10 @@ static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2], [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), [temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay), - [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out), - [ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2) + [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out) : 
[phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1), - [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope) + [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope), + [ag0]"f"(f1), [ag1]"f"(f2), [ag2]"f"(f3) : "memory" ); } diff --git a/libavcodec/mips/aacpsy_mips.h b/libavcodec/mips/aacpsy_mips.h index a1fe5ccea9..7d27d32f18 100644 --- a/libavcodec/mips/aacpsy_mips.h +++ b/libavcodec/mips/aacpsy_mips.h @@ -135,11 +135,11 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float float coeff3 = psy_fir_coeffs[7]; float coeff4 = psy_fir_coeffs[9]; + float f1 = 32768.0; __asm__ volatile ( ".set push \n\t" ".set noreorder \n\t" - "li.s $f12, 32768 \n\t" "1: \n\t" "lwc1 $f0, 40(%[fb]) \n\t" "lwc1 $f1, 4(%[fb]) \n\t" @@ -203,14 +203,14 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float "madd.s %[sum2], %[sum2], $f9, %[coeff4] \n\t" "madd.s %[sum4], %[sum4], $f6, %[coeff4] \n\t" "madd.s %[sum3], %[sum3], $f3, %[coeff4] \n\t" - "mul.s %[sum1], %[sum1], $f12 \n\t" - "mul.s %[sum2], %[sum2], $f12 \n\t" + "mul.s %[sum1], %[sum1], %[f1] \n\t" + "mul.s %[sum2], %[sum2], %[f1] \n\t" "madd.s %[sum4], %[sum4], $f11, %[coeff4] \n\t" "madd.s %[sum3], %[sum3], $f8, %[coeff4] \n\t" "swc1 %[sum1], 0(%[hp]) \n\t" "swc1 %[sum2], 4(%[hp]) \n\t" - "mul.s %[sum4], %[sum4], $f12 \n\t" - "mul.s %[sum3], %[sum3], $f12 \n\t" + "mul.s %[sum4], %[sum4], %[f1] \n\t" + "mul.s %[sum3], %[sum3], %[f1] \n\t" "swc1 %[sum4], 12(%[hp]) \n\t" "swc1 %[sum3], 8(%[hp]) \n\t" "bne %[fb], %[fb_end], 1b \n\t" @@ -223,9 +223,9 @@ static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float [fb]"+r"(fb), [hp]"+r"(hp) : [coeff0]"f"(coeff0), [coeff1]"f"(coeff1), [coeff2]"f"(coeff2), [coeff3]"f"(coeff3), - [coeff4]"f"(coeff4), [fb_end]"r"(fb_end) + [coeff4]"f"(coeff4), [fb_end]"r"(fb_end), [f1]"f"(f1) : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", - "$f7", "$f8", "$f9", "$f10", "$f11", "$f12", + "$f7", "$f8", 
"$f9", "$f10", "$f11", "memory" ); } diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c index 03dcbad4d8..69abdc8a08 100644 --- a/libavcodec/mips/fft_mips.c +++ b/libavcodec/mips/fft_mips.c @@ -71,6 +71,7 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) float temp, temp1, temp3, temp4; FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4; FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i; + float f1 = 0.7071067812; num_transforms = (21845 >> (17 - s->nbits)) | 1; @@ -148,7 +149,6 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6; "lwc1 %[pom1], 16(%[tmpz]) \n\t" "lwc1 %[pom3], 20(%[tmpz]) \n\t" - "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f; "add.s %[temp1],%[tmp1], %[tmp2] \n\t" "sub.s %[temp], %[pom1], %[tmp8] \n\t" "add.s %[pom2], %[pom3], %[tmp7] \n\t" @@ -159,10 +159,10 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) "add.s %[pom1], %[pom1], %[tmp8] \n\t" "sub.s %[pom3], %[pom3], %[tmp7] \n\t" "add.s %[tmp3], %[tmp3], %[tmp4] \n\t" - "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2); - "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4); - "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1); - "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4); + "mul.s %[tmp5], %[f1], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2); + "mul.s %[tmp7], %[f1], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4); + "mul.s %[tmp6], %[f1], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1); + "mul.s %[tmp8], %[f1], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4); "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8; "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7; "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7; @@ -193,7 +193,7 @@ static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), 
[tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7), [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4) - : [tmpz]"r"(tmpz) + : [tmpz]"r"(tmpz), [f1]"f"(f1) : "memory" ); } diff --git a/libavcodec/mips/mpegaudiodsp_mips_float.c b/libavcodec/mips/mpegaudiodsp_mips_float.c index 481b69c10e..ae130c752e 100644 --- a/libavcodec/mips/mpegaudiodsp_mips_float.c +++ b/libavcodec/mips/mpegaudiodsp_mips_float.c @@ -287,9 +287,16 @@ static void ff_dct32_mips_float(float *out, const float *tab) val8 , val9 , val10, val11, val12, val13, val14, val15, val16, val17, val18, val19, val20, val21, val22, val23, val24, val25, val26, val27, val28, val29, val30, val31; - float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8, - fTmp9, fTmp10, fTmp11; + float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp8, fTmp9; + float f1, f2, f3, f4, f5, f6, f7; + f1 = 0.50241928618815570551; + f2 = 0.50060299823519630134; + f3 = 10.19000812354805681150; + f4 = 5.10114861868916385802; + f5 = 0.67480834145500574602; + f6 = 0.74453627100229844977; + f7 = 0.50979557910415916894; /** * instructions are scheduled to minimize pipeline stall. 
*/ @@ -298,149 +305,142 @@ static void ff_dct32_mips_float(float *out, const float *tab) "lwc1 %[fTmp2], 31*4(%[tab]) \n\t" "lwc1 %[fTmp3], 15*4(%[tab]) \n\t" "lwc1 %[fTmp4], 16*4(%[tab]) \n\t" - "li.s %[fTmp7], 0.50241928618815570551 \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp10], 0.50060299823519630134 \n\t" - "li.s %[fTmp11], 10.19000812354805681150 \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" "add.s %[val0], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val15], %[fTmp5], %[fTmp6] \n\t" "lwc1 %[fTmp1], 7*4(%[tab]) \n\t" "lwc1 %[fTmp2], 24*4(%[tab]) \n\t" - "madd.s %[val16], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "mul.s %[val15], %[val15], %[fTmp7] \n\t" + "madd.s %[val16], %[fTmp8], %[fTmp9], %[f3] \n\t" + "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[f3] \n\t" + "mul.s %[val15], %[val15], %[f1] \n\t" "lwc1 %[fTmp3], 8*4(%[tab]) \n\t" "lwc1 %[fTmp4], 23*4(%[tab]) \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" - "mul.s %[val31], %[val31], %[fTmp7] \n\t" + "mul.s %[val31], %[val31], %[f1] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp7], 5.10114861868916385802 \n\t" - "li.s %[fTmp10], 0.67480834145500574602 \n\t" - "li.s %[fTmp11], 0.74453627100229844977 \n\t" "add.s %[val7], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val8], %[fTmp5], %[fTmp6] \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" - "li.s %[fTmp1], 0.50979557910415916894 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" "sub.s %[fTmp2], %[val0], %[val7] \n\t" - "mul.s %[val8], %[val8], %[fTmp7] \n\t" - "madd.s %[val23], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val8], %[val8], %[f4] \n\t" + "madd.s %[val23], %[fTmp8], 
%[fTmp9], %[f6] \n\t" + "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[f6] \n\t" "add.s %[val0], %[val0], %[val7] \n\t" - "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val7], %[f7], %[fTmp2] \n\t" "sub.s %[fTmp2], %[val15], %[val8] \n\t" "add.s %[val8], %[val15], %[val8] \n\t" - "mul.s %[val24], %[val24], %[fTmp7] \n\t" + "mul.s %[val24], %[val24], %[f4] \n\t" "sub.s %[fTmp3], %[val16], %[val23] \n\t" "add.s %[val16], %[val16], %[val23] \n\t" - "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val15], %[f7], %[fTmp2] \n\t" "sub.s %[fTmp4], %[val31], %[val24] \n\t" - "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val23], %[f7], %[fTmp3] \n\t" "add.s %[val24], %[val31], %[val24] \n\t" - "mul.s %[val31], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val31], %[f7], %[fTmp4] \n\t" : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), - [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), - [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), - [val0] "=f" (val0), [val7] "=f" (val7), - [val8] "=f" (val8), [val15] "=f" (val15), - [val16] "=f" (val16), [val23] "=f" (val23), - [val24] "=f" (val24), [val31] "=f" (val31) - : [tab] "r" (tab) + [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [val0] "=&f" (val0), [val7] "=&f" (val7), + [val8] "=&f" (val8), [val15] "=&f" (val15), + [val16] "=&f" (val16), [val23] "=&f" (val23), + [val24] "=&f" (val24), [val31] "=&f" (val31) + : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), + [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) : "memory" ); + f1 = 0.64682178335999012954; + f2 = 0.53104259108978417447; + f3 = 1.48416461631416627724; + f4 = 0.78815462345125022473; + f5 = 0.55310389603444452782; + f6 = 1.16943993343288495515; + f7 = 2.56291544774150617881; __asm__ volatile ( "lwc1 %[fTmp1], 3*4(%[tab]) \n\t" "lwc1 %[fTmp2], 28*4(%[tab]) \n\t" "lwc1 %[fTmp3], 12*4(%[tab]) \n\t" "lwc1 %[fTmp4], 19*4(%[tab]) \n\t" - "li.s %[fTmp7], 
0.64682178335999012954 \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp10], 0.53104259108978417447 \n\t" - "li.s %[fTmp11], 1.48416461631416627724 \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" "add.s %[val3], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val12], %[fTmp5], %[fTmp6] \n\t" "lwc1 %[fTmp1], 4*4(%[tab]) \n\t" "lwc1 %[fTmp2], 27*4(%[tab]) \n\t" - "madd.s %[val19], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "mul.s %[val12], %[val12], %[fTmp7] \n\t" + "madd.s %[val19], %[fTmp8], %[fTmp9], %[f3] \n\t" + "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[f3] \n\t" + "mul.s %[val12], %[val12], %[f1] \n\t" "lwc1 %[fTmp3], 11*4(%[tab]) \n\t" "lwc1 %[fTmp4], 20*4(%[tab]) \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" - "mul.s %[val28], %[val28], %[fTmp7] \n\t" + "mul.s %[val28], %[val28], %[f1] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" - "li.s %[fTmp7], 0.78815462345125022473 \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp10], 0.55310389603444452782 \n\t" - "li.s %[fTmp11], 1.16943993343288495515 \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" "add.s %[val4], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val11], %[fTmp5], %[fTmp6] \n\t" - "li.s %[fTmp1], 2.56291544774150617881 \n\t" - "madd.s %[val20], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "mul.s %[val11], %[val11], %[fTmp7] \n\t" + "madd.s %[val20], %[fTmp8], %[fTmp9], %[f6] \n\t" + "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[f6] \n\t" + "mul.s %[val11], %[val11], %[f4] \n\t" "sub.s %[fTmp2], %[val3], %[val4] \n\t" "add.s %[val3], %[val3], %[val4] \n\t" "sub.s %[fTmp4], %[val19], %[val20] \n\t" - "mul.s %[val27], %[val27], %[fTmp7] \n\t" + "mul.s 
%[val27], %[val27], %[f4] \n\t" "sub.s %[fTmp3], %[val12], %[val11] \n\t" - "mul.s %[val4], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val4], %[f7], %[fTmp2] \n\t" "add.s %[val11], %[val12], %[val11] \n\t" "add.s %[val19], %[val19], %[val20] \n\t" - "mul.s %[val20], %[fTmp1], %[fTmp4] \n\t" - "mul.s %[val12], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val20], %[f7], %[fTmp4] \n\t" + "mul.s %[val12], %[f7], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val28], %[val27] \n\t" "add.s %[val27], %[val28], %[val27] \n\t" - "mul.s %[val28], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val28], %[f7], %[fTmp2] \n\t" : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), - [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), - [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), - [val3] "=f" (val3), [val4] "=f" (val4), - [val11] "=f" (val11), [val12] "=f" (val12), - [val19] "=f" (val19), [val20] "=f" (val20), - [val27] "=f" (val27), [val28] "=f" (val28) - : [tab] "r" (tab) + [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [val3] "=&f" (val3), [val4] "=&f" (val4), + [val11] "=&f" (val11), [val12] "=&f" (val12), + [val19] "=&f" (val19), [val20] "=&f" (val20), + [val27] "=&f" (val27), [val28] "=&f" (val28) + : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), + [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) : "memory" ); + f1 = 0.54119610014619698439; __asm__ volatile ( - "li.s %[fTmp1], 0.54119610014619698439 \n\t" "sub.s %[fTmp2], %[val0], %[val3] \n\t" "add.s %[val0], %[val0], %[val3] \n\t" "sub.s %[fTmp3], %[val7], %[val4] \n\t" "add.s %[val4], %[val7], %[val4] \n\t" "sub.s %[fTmp4], %[val8], %[val11] \n\t" - "mul.s %[val3], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val3], %[f1], %[fTmp2] \n\t" "add.s %[val8], %[val8], %[val11] \n\t" - "mul.s %[val7], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val7], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val15], %[val12] \n\t" - "mul.s %[val11], %[fTmp1], %[fTmp4] \n\t" + 
"mul.s %[val11], %[f1], %[fTmp4] \n\t" "add.s %[val12], %[val15], %[val12] \n\t" - "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val15], %[f1], %[fTmp2] \n\t" - : [val0] "+f" (val0), [val3] "+f" (val3), - [val4] "+f" (val4), [val7] "+f" (val7), - [val8] "+f" (val8), [val11] "+f" (val11), - [val12] "+f" (val12), [val15] "+f" (val15), - [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2), + : [val0] "+&f" (val0), [val3] "+&f" (val3), + [val4] "+&f" (val4), [val7] "+&f" (val7), + [val8] "+&f" (val8), [val11] "+&f" (val11), + [val12] "+&f" (val12), [val15] "+&f" (val15), + [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4) - : + : [f1] "f" (f1) ); __asm__ volatile ( @@ -449,169 +449,169 @@ static void ff_dct32_mips_float(float *out, const float *tab) "sub.s %[fTmp3], %[val23], %[val20] \n\t" "add.s %[val20], %[val23], %[val20] \n\t" "sub.s %[fTmp4], %[val24], %[val27] \n\t" - "mul.s %[val19], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val19], %[f1], %[fTmp2] \n\t" "add.s %[val24], %[val24], %[val27] \n\t" - "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val23], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val31], %[val28] \n\t" - "mul.s %[val27], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val27], %[f1], %[fTmp4] \n\t" "add.s %[val28], %[val31], %[val28] \n\t" - "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val31], %[f1], %[fTmp2] \n\t" : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20), - [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27), - [val28] "+f" (val28), [val31] "+f" (val31) - : [fTmp1] "f" (fTmp1) + [val16] "+&f" (val16), [val19] "+&f" (val19), [val20] "+&f" (val20), + [val23] "+&f" (val23), [val24] "+&f" (val24), [val27] "+&f" (val27), + [val28] "+&f" (val28), [val31] "+&f" (val31) + : [f1] "f" (f1) ); + f1 = 0.52249861493968888062; + f2 = 0.50547095989754365998; + f3 = 3.40760841846871878570; + f4 = 1.72244709823833392782; + f5 = 
0.62250412303566481615; + f6 = 0.83934964541552703873; + f7 = 0.60134488693504528054; __asm__ volatile ( "lwc1 %[fTmp1], 1*4(%[tab]) \n\t" "lwc1 %[fTmp2], 30*4(%[tab]) \n\t" "lwc1 %[fTmp3], 14*4(%[tab]) \n\t" "lwc1 %[fTmp4], 17*4(%[tab]) \n\t" - "li.s %[fTmp7], 0.52249861493968888062 \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp10], 0.50547095989754365998 \n\t" - "li.s %[fTmp11], 3.40760841846871878570 \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" "add.s %[val1], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val14], %[fTmp5], %[fTmp6] \n\t" "lwc1 %[fTmp1], 6*4(%[tab]) \n\t" "lwc1 %[fTmp2], 25*4(%[tab]) \n\t" - "madd.s %[val17], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "mul.s %[val14], %[val14], %[fTmp7] \n\t" + "madd.s %[val17], %[fTmp8], %[fTmp9], %[f3] \n\t" + "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[f3] \n\t" + "mul.s %[val14], %[val14], %[f1] \n\t" "lwc1 %[fTmp3], 9*4(%[tab]) \n\t" "lwc1 %[fTmp4], 22*4(%[tab]) \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" - "mul.s %[val30], %[val30], %[fTmp7] \n\t" + "mul.s %[val30], %[val30], %[f1] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp7], 1.72244709823833392782 \n\t" - "li.s %[fTmp10], 0.62250412303566481615 \n\t" - "li.s %[fTmp11], 0.83934964541552703873 \n\t" "add.s %[val6], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val9], %[fTmp5], %[fTmp6] \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" - "li.s %[fTmp1], 0.60134488693504528054 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" "sub.s %[fTmp2], %[val1], %[val6] \n\t" "add.s %[val1], %[val1], %[val6] \n\t" - "mul.s %[val9], %[val9], %[fTmp7] \n\t" - "madd.s %[val22], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val25], %[fTmp8], %[fTmp9], 
%[fTmp11] \n\t" - "mul.s %[val6], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val9], %[val9], %[f4] \n\t" + "madd.s %[val22], %[fTmp8], %[fTmp9], %[f6] \n\t" + "nmsub.s %[val25], %[fTmp8], %[fTmp9], %[f6] \n\t" + "mul.s %[val6], %[f7], %[fTmp2] \n\t" "sub.s %[fTmp2], %[val14], %[val9] \n\t" "add.s %[val9], %[val14], %[val9] \n\t" - "mul.s %[val25], %[val25], %[fTmp7] \n\t" + "mul.s %[val25], %[val25], %[f4] \n\t" "sub.s %[fTmp3], %[val17], %[val22] \n\t" "add.s %[val17], %[val17], %[val22] \n\t" - "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val14], %[f7], %[fTmp2] \n\t" "sub.s %[fTmp2], %[val30], %[val25] \n\t" - "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val22], %[f7], %[fTmp3] \n\t" "add.s %[val25], %[val30], %[val25] \n\t" - "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val30], %[f7], %[fTmp2] \n\t" : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), - [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), - [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), - [val1] "=f" (val1), [val6] "=f" (val6), - [val9] "=f" (val9), [val14] "=f" (val14), - [val17] "=f" (val17), [val22] "=f" (val22), - [val25] "=f" (val25), [val30] "=f" (val30) - : [tab] "r" (tab) + [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [val1] "=&f" (val1), [val6] "=&f" (val6), + [val9] "=&f" (val9), [val14] "=&f" (val14), + [val17] "=&f" (val17), [val22] "=&f" (val22), + [val25] "=&f" (val25), [val30] "=&f" (val30) + : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), + [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) : "memory" ); + f1 = 0.56694403481635770368; + f2 = 0.51544730992262454697; + f3 = 2.05778100995341155085; + f4 = 1.06067768599034747134; + f5 = 0.58293496820613387367; + f6 = 0.97256823786196069369; + f7 = 0.89997622313641570463; __asm__ volatile ( "lwc1 %[fTmp1], 2*4(%[tab]) \n\t" "lwc1 %[fTmp2], 29*4(%[tab]) \n\t" "lwc1 %[fTmp3], 13*4(%[tab]) \n\t" "lwc1 
%[fTmp4], 18*4(%[tab]) \n\t" - "li.s %[fTmp7], 0.56694403481635770368 \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp10], 0.51544730992262454697 \n\t" - "li.s %[fTmp11], 2.05778100995341155085 \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" "add.s %[val2], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val13], %[fTmp5], %[fTmp6] \n\t" "lwc1 %[fTmp1], 5*4(%[tab]) \n\t" "lwc1 %[fTmp2], 26*4(%[tab]) \n\t" - "madd.s %[val18], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "mul.s %[val13], %[val13], %[fTmp7] \n\t" + "madd.s %[val18], %[fTmp8], %[fTmp9], %[f3] \n\t" + "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[f3] \n\t" + "mul.s %[val13], %[val13], %[f1] \n\t" "lwc1 %[fTmp3], 10*4(%[tab]) \n\t" "lwc1 %[fTmp4], 21*4(%[tab]) \n\t" - "mul.s %[val29], %[val29], %[fTmp7] \n\t" + "mul.s %[val29], %[val29], %[f1] \n\t" "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" - "li.s %[fTmp7], 1.06067768599034747134 \n\t" - "li.s %[fTmp10], 0.58293496820613387367 \n\t" - "li.s %[fTmp11], 0.97256823786196069369 \n\t" "add.s %[val5], %[fTmp5], %[fTmp6] \n\t" "sub.s %[val10], %[fTmp5], %[fTmp6] \n\t" - "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t" - "li.s %[fTmp1], 0.89997622313641570463 \n\t" + "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" "sub.s %[fTmp2], %[val2], %[val5] \n\t" - "mul.s %[val10], %[val10], %[fTmp7] \n\t" - "madd.s %[val21], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" - "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[fTmp11] \n\t" + "mul.s %[val10], %[val10], %[f4] \n\t" + "madd.s %[val21], %[fTmp8], %[fTmp9], %[f6] \n\t" + "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[f6] \n\t" "add.s %[val2], %[val2], %[val5] \n\t" - "mul.s %[val5], %[fTmp1], %[fTmp2] \n\t" + "mul.s 
%[val5], %[f7], %[fTmp2] \n\t" "sub.s %[fTmp3], %[val13], %[val10] \n\t" "add.s %[val10], %[val13], %[val10] \n\t" - "mul.s %[val26], %[val26], %[fTmp7] \n\t" + "mul.s %[val26], %[val26], %[f4] \n\t" "sub.s %[fTmp4], %[val18], %[val21] \n\t" "add.s %[val18], %[val18], %[val21] \n\t" - "mul.s %[val13], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val13], %[f7], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val29], %[val26] \n\t" "add.s %[val26], %[val29], %[val26] \n\t" - "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t" - "mul.s %[val29], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val21], %[f7], %[fTmp4] \n\t" + "mul.s %[val29], %[f7], %[fTmp2] \n\t" : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), - [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), - [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11), - [val2] "=f" (val2), [val5] "=f" (val5), - [val10] "=f" (val10), [val13] "=f" (val13), - [val18] "=f" (val18), [val21] "=f" (val21), - [val26] "=f" (val26), [val29] "=f" (val29) - : [tab] "r" (tab) + [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), + [val2] "=&f" (val2), [val5] "=&f" (val5), + [val10] "=&f" (val10), [val13] "=&f" (val13), + [val18] "=&f" (val18), [val21] "=&f" (val21), + [val26] "=&f" (val26), [val29] "=&f" (val29) + : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), + [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) : "memory" ); + f1 = 1.30656296487637652785; __asm__ volatile ( - "li.s %[fTmp1], 1.30656296487637652785 \n\t" "sub.s %[fTmp2], %[val1], %[val2] \n\t" "add.s %[val1], %[val1], %[val2] \n\t" "sub.s %[fTmp3], %[val6], %[val5] \n\t" "add.s %[val5], %[val6], %[val5] \n\t" "sub.s %[fTmp4], %[val9], %[val10] \n\t" - "mul.s %[val2], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val2], %[f1], %[fTmp2] \n\t" "add.s %[val9], %[val9], %[val10] \n\t" - "mul.s %[val6], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val6], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val14], %[val13] \n\t" - "mul.s 
%[val10], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val10], %[f1], %[fTmp4] \n\t" "add.s %[val13], %[val14], %[val13] \n\t" - "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val14], %[f1], %[fTmp2] \n\t" - : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2), + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val1] "+f" (val1), [val2] "+f" (val2), - [val5] "+f" (val5), [val6] "+f" (val6), - [val9] "+f" (val9), [val10] "+f" (val10), - [val13] "+f" (val13), [val14] "+f" (val14) - : + [val1] "+&f" (val1), [val2] "+&f" (val2), + [val5] "+&f" (val5), [val6] "+&f" (val6), + [val9] "+&f" (val9), [val10] "+&f" (val10), + [val13] "+&f" (val13), [val14] "+&f" (val14) + : [f1]"f"(f1) ); __asm__ volatile ( @@ -620,39 +620,39 @@ static void ff_dct32_mips_float(float *out, const float *tab) "sub.s %[fTmp3], %[val22], %[val21] \n\t" "add.s %[val21], %[val22], %[val21] \n\t" "sub.s %[fTmp4], %[val25], %[val26] \n\t" - "mul.s %[val18], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val18], %[f1], %[fTmp2] \n\t" "add.s %[val25], %[val25], %[val26] \n\t" - "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val22], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val30], %[val29] \n\t" - "mul.s %[val26], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val26], %[f1], %[fTmp4] \n\t" "add.s %[val29], %[val30], %[val29] \n\t" - "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val30], %[f1], %[fTmp2] \n\t" : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21), - [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26), - [val29] "+f" (val29), [val30] "+f" (val30) - : [fTmp1] "f" (fTmp1) + [val17] "+&f" (val17), [val18] "+&f" (val18), [val21] "+&f" (val21), + [val22] "+&f" (val22), [val25] "+&f" (val25), [val26] "+&f" (val26), + [val29] "+&f" (val29), [val30] "+&f" (val30) + : [f1] "f" (f1) ); + f1 = 0.70710678118654752439; __asm__ volatile ( - "li.s %[fTmp1], 0.70710678118654752439 \n\t" "sub.s %[fTmp2], 
%[val0], %[val1] \n\t" "add.s %[val0], %[val0], %[val1] \n\t" "sub.s %[fTmp3], %[val3], %[val2] \n\t" "add.s %[val2], %[val3], %[val2] \n\t" "sub.s %[fTmp4], %[val4], %[val5] \n\t" - "mul.s %[val1], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val1], %[f1], %[fTmp2] \n\t" "swc1 %[val0], 0(%[out]) \n\t" - "mul.s %[val3], %[fTmp3], %[fTmp1] \n\t" + "mul.s %[val3], %[fTmp3], %[f1] \n\t" "add.s %[val4], %[val4], %[val5] \n\t" - "mul.s %[val5], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val5], %[f1], %[fTmp4] \n\t" "swc1 %[val1], 16*4(%[out]) \n\t" "sub.s %[fTmp2], %[val7], %[val6] \n\t" "add.s %[val2], %[val2], %[val3] \n\t" "swc1 %[val3], 24*4(%[out]) \n\t" "add.s %[val6], %[val7], %[val6] \n\t" - "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val7], %[f1], %[fTmp2] \n\t" "swc1 %[val2], 8*4(%[out]) \n\t" "add.s %[val6], %[val6], %[val7] \n\t" "swc1 %[val7], 28*4(%[out]) \n\t" @@ -663,13 +663,13 @@ static void ff_dct32_mips_float(float *out, const float *tab) "swc1 %[val5], 20*4(%[out]) \n\t" "swc1 %[val6], 12*4(%[out]) \n\t" - : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2), + : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val0] "+f" (val0), [val1] "+f" (val1), - [val2] "+f" (val2), [val3] "+f" (val3), - [val4] "+f" (val4), [val5] "+f" (val5), - [val6] "+f" (val6), [val7] "+f" (val7) - : [out] "r" (out) + [val0] "+&f" (val0), [val1] "+&f" (val1), + [val2] "+&f" (val2), [val3] "+&f" (val3), + [val4] "+&f" (val4), [val5] "+&f" (val5), + [val6] "+&f" (val6), [val7] "+&f" (val7) + : [out] "r" (out), [f1]"f"(f1) ); __asm__ volatile ( @@ -678,14 +678,14 @@ static void ff_dct32_mips_float(float *out, const float *tab) "sub.s %[fTmp3], %[val11], %[val10] \n\t" "add.s %[val10], %[val11], %[val10] \n\t" "sub.s %[fTmp4], %[val12], %[val13] \n\t" - "mul.s %[val9], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val9], %[f1], %[fTmp2] \n\t" "add.s %[val12], %[val12], %[val13] \n\t" - "mul.s %[val11], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val11], %[f1], %[fTmp3] \n\t" "sub.s 
%[fTmp2], %[val15], %[val14] \n\t" - "mul.s %[val13], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val13], %[f1], %[fTmp4] \n\t" "add.s %[val14], %[val15], %[val14] \n\t" "add.s %[val10], %[val10], %[val11] \n\t" - "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val15], %[f1], %[fTmp2] \n\t" "add.s %[val14], %[val14], %[val15] \n\t" "add.s %[val12], %[val12], %[val14] \n\t" "add.s %[val14], %[val14], %[val13] \n\t" @@ -707,10 +707,10 @@ static void ff_dct32_mips_float(float *out, const float *tab) "swc1 %[val15], 30*4(%[out]) \n\t" : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val8] "+f" (val8), [val9] "+f" (val9), [val10] "+f" (val10), - [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13), - [val14] "+f" (val14), [val15] "+f" (val15) - : [fTmp1] "f" (fTmp1), [out] "r" (out) + [val8] "+&f" (val8), [val9] "+&f" (val9), [val10] "+&f" (val10), + [val11] "+&f" (val11), [val12] "+&f" (val12), [val13] "+&f" (val13), + [val14] "+&f" (val14), [val15] "+&f" (val15) + : [f1] "f" (f1), [out] "r" (out) ); __asm__ volatile ( @@ -719,24 +719,24 @@ static void ff_dct32_mips_float(float *out, const float *tab) "sub.s %[fTmp3], %[val19], %[val18] \n\t" "add.s %[val18], %[val19], %[val18] \n\t" "sub.s %[fTmp4], %[val20], %[val21] \n\t" - "mul.s %[val17], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val17], %[f1], %[fTmp2] \n\t" "add.s %[val20], %[val20], %[val21] \n\t" - "mul.s %[val19], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val19], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val23], %[val22] \n\t" - "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val21], %[f1], %[fTmp4] \n\t" "add.s %[val22], %[val23], %[val22] \n\t" "add.s %[val18], %[val18], %[val19] \n\t" - "mul.s %[val23], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val23], %[f1], %[fTmp2] \n\t" "add.s %[val22], %[val22], %[val23] \n\t" "add.s %[val20], %[val20], %[val22] \n\t" "add.s %[val22], %[val22], %[val21] \n\t" "add.s %[val21], %[val21], %[val23] \n\t" : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), 
[fTmp4] "=&f" (fTmp4), - [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18), - [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21), - [val22] "+f" (val22), [val23] "+f" (val23) - : [fTmp1] "f" (fTmp1) + [val16] "+&f" (val16), [val17] "+&f" (val17), [val18] "+&f" (val18), + [val19] "+&f" (val19), [val20] "+&f" (val20), [val21] "+&f" (val21), + [val22] "+&f" (val22), [val23] "+&f" (val23) + : [f1] "f" (f1) ); __asm__ volatile ( @@ -745,14 +745,14 @@ static void ff_dct32_mips_float(float *out, const float *tab) "sub.s %[fTmp3], %[val27], %[val26] \n\t" "add.s %[val26], %[val27], %[val26] \n\t" "sub.s %[fTmp4], %[val28], %[val29] \n\t" - "mul.s %[val25], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val25], %[f1], %[fTmp2] \n\t" "add.s %[val28], %[val28], %[val29] \n\t" - "mul.s %[val27], %[fTmp1], %[fTmp3] \n\t" + "mul.s %[val27], %[f1], %[fTmp3] \n\t" "sub.s %[fTmp2], %[val31], %[val30] \n\t" - "mul.s %[val29], %[fTmp1], %[fTmp4] \n\t" + "mul.s %[val29], %[f1], %[fTmp4] \n\t" "add.s %[val30], %[val31], %[val30] \n\t" "add.s %[val26], %[val26], %[val27] \n\t" - "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t" + "mul.s %[val31], %[f1], %[fTmp2] \n\t" "add.s %[val30], %[val30], %[val31] \n\t" "add.s %[val28], %[val28], %[val30] \n\t" "add.s %[val30], %[val30], %[val29] \n\t" @@ -766,10 +766,10 @@ static void ff_dct32_mips_float(float *out, const float *tab) "add.s %[val27], %[val27], %[val31] \n\t" : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), - [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26), - [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29), - [val30] "+f" (val30), [val31] "+f" (val31) - : [fTmp1] "f" (fTmp1) + [val24] "+&f" (val24), [val25] "+&f" (val25), [val26] "+&f" (val26), + [val27] "+&f" (val27), [val28] "+&f" (val28), [val29] "+&f" (val29), + [val30] "+&f" (val30), [val31] "+&f" (val31) + : [f1] "f" (f1) ); out[ 1] = val16 + val24; @@ -797,7 +797,7 @@ static void imdct36_mips_float(float 
*out, float *buf, float *in, float *win) /* temporary variables */ float in1, in2, in3, in4, in5, in6; float out1, out2, out3, out4, out5; - float c1, c2, c3, c4, c5, c6, c7, c8, c9; + float f1, f2, f3, f4, f5, f6, f7, f8, f9; /** * all loops are unrolled totally, and instructions are scheduled to @@ -881,33 +881,36 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) ); /* loop 3 */ + f1 = 0.5; + f2 = 0.93969262078590838405; + f3 = -0.76604444311897803520; + f4 = -0.17364817766693034885; + f5 = -0.86602540378443864676; + f6 = 0.98480775301220805936; + f7 = -0.34202014332566873304; + f8 = 0.86602540378443864676; + f9 = -0.64278760968653932632; __asm__ volatile ( - "li.s %[c1], 0.5 \t\n" "lwc1 %[in1], 8*4(%[in]) \t\n" "lwc1 %[in2], 16*4(%[in]) \t\n" "lwc1 %[in3], 4*4(%[in]) \t\n" "lwc1 %[in4], 0(%[in]) \t\n" "lwc1 %[in5], 12*4(%[in]) \t\n" - "li.s %[c2], 0.93969262078590838405 \t\n" "add.s %[t2], %[in1], %[in2] \t\n" "add.s %[t0], %[in1], %[in3] \t\n" - "li.s %[c3], -0.76604444311897803520 \t\n" - "madd.s %[t3], %[in4], %[in5], %[c1] \t\n" + "madd.s %[t3], %[in4], %[in5], %[f1] \t\n" "sub.s %[t1], %[in4], %[in5] \t\n" "sub.s %[t2], %[t2], %[in3] \t\n" - "mul.s %[t0], %[t0], %[c2] \t\n" - "li.s %[c4], -0.17364817766693034885 \t\n" - "li.s %[c5], -0.86602540378443864676 \t\n" - "li.s %[c6], 0.98480775301220805936 \t\n" - "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n" + "mul.s %[t0], %[t0], %[f2] \t\n" + "nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n" "add.s %[out2], %[t1], %[t2] \t\n" "add.s %[t2], %[in2], %[in3] \t\n" "sub.s %[t1], %[in1], %[in2] \t\n" "sub.s %[out3], %[t3], %[t0] \t\n" "swc1 %[out1], 6*4(%[tmp]) \t\n" "swc1 %[out2], 16*4(%[tmp]) \t\n" - "mul.s %[t2], %[t2], %[c3] \t\n" - "mul.s %[t1], %[t1], %[c4] \t\n" + "mul.s %[t2], %[t2], %[f3] \t\n" + "mul.s %[t1], %[t1], %[f4] \t\n" "add.s %[out1], %[t3], %[t0] \t\n" "lwc1 %[in1], 10*4(%[in]) \t\n" "lwc1 %[in2], 14*4(%[in]) \t\n" @@ -923,19 +926,16 @@ static void imdct36_mips_float(float 
*out, float *buf, float *in, float *win) "add.s %[t2], %[in1], %[in3] \t\n" "sub.s %[t3], %[in1], %[in2] \t\n" "swc1 %[out2], 14*4(%[tmp]) \t\n" - "li.s %[c7], -0.34202014332566873304 \t\n" "sub.s %[out1], %[out1], %[in3] \t\n" - "mul.s %[t2], %[t2], %[c6] \t\n" - "mul.s %[t3], %[t3], %[c7] \t\n" - "li.s %[c8], 0.86602540378443864676 \t\n" - "mul.s %[t0], %[in4], %[c8] \t\n" - "mul.s %[out1], %[out1], %[c5] \t\n" + "mul.s %[t2], %[t2], %[f6] \t\n" + "mul.s %[t3], %[t3], %[f7] \t\n" + "mul.s %[t0], %[in4], %[f8] \t\n" + "mul.s %[out1], %[out1], %[f5] \t\n" "add.s %[t1], %[in2], %[in3] \t\n" - "li.s %[c9], -0.64278760968653932632 \t\n" "add.s %[out2], %[t2], %[t3] \t\n" "lwc1 %[in1], 9*4(%[in]) \t\n" "swc1 %[out1], 4*4(%[tmp]) \t\n" - "mul.s %[t1], %[t1], %[c9] \t\n" + "mul.s %[t1], %[t1], %[f9] \t\n" "lwc1 %[in2], 17*4(%[in]) \t\n" "add.s %[out2], %[out2], %[t0] \t\n" "lwc1 %[in3], 5*4(%[in]) \t\n" @@ -948,21 +948,21 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "sub.s %[out3], %[out3], %[t0] \t\n" "sub.s %[out1], %[out1], %[t0] \t\n" "add.s %[t0], %[in1], %[in3] \t\n" - "madd.s %[t3], %[in4], %[in5], %[c1] \t\n" + "madd.s %[t3], %[in4], %[in5], %[f1] \t\n" "sub.s %[t2], %[t2], %[in3] \t\n" "swc1 %[out3], 12*4(%[tmp]) \t\n" "swc1 %[out1], 8*4(%[tmp]) \t\n" "sub.s %[t1], %[in4], %[in5] \t\n" - "mul.s %[t0], %[t0], %[c2] \t\n" - "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n" + "mul.s %[t0], %[t0], %[f2] \t\n" + "nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n" "add.s %[out2], %[t1], %[t2] \t\n" "add.s %[t2], %[in2], %[in3] \t\n" "sub.s %[t1], %[in1], %[in2] \t\n" "sub.s %[out3], %[t3], %[t0] \t\n" "swc1 %[out1], 7*4(%[tmp]) \t\n" "swc1 %[out2], 17*4(%[tmp]) \t\n" - "mul.s %[t2], %[t2], %[c3] \t\n" - "mul.s %[t1], %[t1], %[c4] \t\n" + "mul.s %[t2], %[t2], %[f3] \t\n" + "mul.s %[t1], %[t1], %[f4] \t\n" "add.s %[out1], %[t3], %[t0] \t\n" "lwc1 %[in1], 11*4(%[in]) \t\n" "lwc1 %[in2], 15*4(%[in]) \t\n" @@ -978,14 +978,14 @@ static void 
imdct36_mips_float(float *out, float *buf, float *in, float *win) "add.s %[t2], %[in1], %[in3] \t\n" "sub.s %[t3], %[in1], %[in2] \t\n" "swc1 %[out2], 15*4(%[tmp]) \t\n" - "mul.s %[t0], %[in4], %[c8] \t\n" + "mul.s %[t0], %[in4], %[f8] \t\n" "sub.s %[out3], %[out3], %[in3] \t\n" - "mul.s %[t2], %[t2], %[c6] \t\n" - "mul.s %[t3], %[t3], %[c7] \t\n" + "mul.s %[t2], %[t2], %[f6] \t\n" + "mul.s %[t3], %[t3], %[f7] \t\n" "add.s %[t1], %[in2], %[in3] \t\n" - "mul.s %[out3], %[out3], %[c5] \t\n" + "mul.s %[out3], %[out3], %[f5] \t\n" "add.s %[out1], %[t2], %[t3] \t\n" - "mul.s %[t1], %[t1], %[c9] \t\n" + "mul.s %[t1], %[t1], %[f9] \t\n" "swc1 %[out3], 5*4(%[tmp]) \t\n" "add.s %[out1], %[out1], %[t0] \t\n" "add.s %[out2], %[t2], %[t1] \t\n" @@ -1000,26 +1000,29 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) [t2] "=&f" (t2), [t3] "=&f" (t3), [in1] "=&f" (in1), [in2] "=&f" (in2), [in3] "=&f" (in3), [in4] "=&f" (in4), - [in5] "=&f" (in5), - [out1] "=&f" (out1), [out2] "=&f" (out2), - [out3] "=&f" (out3), - [c1] "=&f" (c1), [c2] "=&f" (c2), - [c3] "=&f" (c3), [c4] "=&f" (c4), - [c5] "=&f" (c5), [c6] "=&f" (c6), - [c7] "=&f" (c7), [c8] "=&f" (c8), - [c9] "=&f" (c9) - : [in] "r" (in), [tmp] "r" (tmp) + [in5] "=&f" (in5), [out1] "=&f" (out1), + [out2] "=&f" (out2), [out3] "=&f" (out3) + : [in] "r" (in), [tmp] "r" (tmp), [f1]"f"(f1), [f2]"f"(f2), + [f3]"f"(f3), [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), + [f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9) : "memory" ); /* loop 4 */ + f1 = 0.50190991877167369479; + f2 = 5.73685662283492756461; + f3 = 0.51763809020504152469; + f4 = 1.93185165257813657349; + f5 = 0.55168895948124587824; + f6 = 1.18310079157624925896; + f7 = 0.61038729438072803416; + f8 = 0.87172339781054900991; + f9 = 0.70710678118654752439; __asm__ volatile ( "lwc1 %[in1], 2*4(%[tmp]) \t\n" "lwc1 %[in2], 0(%[tmp]) \t\n" "lwc1 %[in3], 3*4(%[tmp]) \t\n" "lwc1 %[in4], 1*4(%[tmp]) \t\n" - "li.s %[c1], 0.50190991877167369479 \t\n" - "li.s %[c2], 
5.73685662283492756461 \t\n" "add.s %[s0], %[in1], %[in2] \t\n" "sub.s %[s2], %[in1], %[in2] \t\n" "add.s %[s1], %[in3], %[in4] \t\n" @@ -1027,15 +1030,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "lwc1 %[in1], 9*4(%[win]) \t\n" "lwc1 %[in2], 4*9*4(%[buf]) \t\n" "lwc1 %[in3], 8*4(%[win]) \t\n" - "mul.s %[s1], %[s1], %[c1] \t\n" - "mul.s %[s3], %[s3], %[c2] \t\n" + "mul.s %[s1], %[s1], %[f1] \t\n" + "mul.s %[s3], %[s3], %[f2] \t\n" "lwc1 %[in4], 4*8*4(%[buf]) \t\n" "lwc1 %[in5], 29*4(%[win]) \t\n" "lwc1 %[in6], 28*4(%[win]) \t\n" "add.s %[t0], %[s0], %[s1] \t\n" "sub.s %[t1], %[s0], %[s1] \t\n" - "li.s %[c1], 0.51763809020504152469 \t\n" - "li.s %[c2], 1.93185165257813657349 \t\n" "mul.s %[out3], %[in5], %[t0] \t\n" "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" @@ -1071,14 +1072,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "lwc1 %[in1], 10*4(%[win]) \t\n" "lwc1 %[in2], 4*10*4(%[buf]) \t\n" "lwc1 %[in3], 7*4(%[win]) \t\n" - "mul.s %[s1], %[s1], %[c1] \t\n" - "mul.s %[s3], %[s3], %[c2] \t\n" + "mul.s %[s1], %[s1], %[f3] \t\n" + "mul.s %[s3], %[s3], %[f4] \t\n" "add.s %[t0], %[s0], %[s1] \t\n" "sub.s %[t1], %[s0], %[s1] \t\n" "lwc1 %[in4], 4*7*4(%[buf]) \t\n" "lwc1 %[in5], 30*4(%[win]) \t\n" "lwc1 %[in6], 27*4(%[win]) \t\n" - "li.s %[c1], 0.55168895948124587824 \t\n" "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" "mul.s %[out3], %[t0], %[in5] \t\n" @@ -1105,7 +1105,6 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "swc1 %[out2], 32*4(%[out]) \t\n" "swc1 %[out3], 4*16*4(%[buf]) \t\n" "swc1 %[out4], 4*1*4(%[buf]) \t\n" - "li.s %[c2], 1.18310079157624925896 \t\n" "add.s %[s0], %[in1], %[in2] \t\n" "sub.s %[s2], %[in1], %[in2] \t\n" "lwc1 %[in3], 11*4(%[tmp]) \t\n" @@ -1115,8 +1114,8 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "lwc1 %[in1], 
11*4(%[win]) \t\n" "lwc1 %[in2], 4*11*4(%[buf]) \t\n" "lwc1 %[in3], 6*4(%[win]) \t\n" - "mul.s %[s1], %[s1], %[c1] \t\n" - "mul.s %[s3], %[s3], %[c2] \t\n" + "mul.s %[s1], %[s1], %[f5] \t\n" + "mul.s %[s3], %[s3], %[f6] \t\n" "lwc1 %[in4], 4*6*4(%[buf]) \t\n" "lwc1 %[in5], 31*4(%[win]) \t\n" "lwc1 %[in6], 26*4(%[win]) \t\n" @@ -1152,15 +1151,13 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "add.s %[s0], %[in1], %[in2] \t\n" "sub.s %[s2], %[in1], %[in2] \t\n" "lwc1 %[in4], 13*4(%[tmp]) \t\n" - "li.s %[c1], 0.61038729438072803416 \t\n" - "li.s %[c2], 0.87172339781054900991 \t\n" "add.s %[s1], %[in3], %[in4] \t\n" "sub.s %[s3], %[in3], %[in4] \t\n" "lwc1 %[in1], 12*4(%[win]) \t\n" "lwc1 %[in2], 4*12*4(%[buf]) \t\n" "lwc1 %[in3], 5*4(%[win]) \t\n" - "mul.s %[s1], %[s1], %[c1] \t\n" - "mul.s %[s3], %[s3], %[c2] \t\n" + "mul.s %[s1], %[s1], %[f7] \t\n" + "mul.s %[s3], %[s3], %[f8] \t\n" "lwc1 %[in4], 4*5*4(%[buf]) \t\n" "lwc1 %[in5], 32*4(%[win]) \t\n" "lwc1 %[in6], 25*4(%[win]) \t\n" @@ -1168,7 +1165,6 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "sub.s %[t1], %[s0], %[s1] \t\n" "lwc1 %[s0], 16*4(%[tmp]) \t\n" "lwc1 %[s1], 17*4(%[tmp]) \t\n" - "li.s %[c1], 0.70710678118654752439 \t\n" "mul.s %[out3], %[t0], %[in5] \t\n" "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" @@ -1186,7 +1182,7 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "lwc1 %[in5], 34*4(%[win]) \t\n" "lwc1 %[in6], 23*4(%[win]) \t\n" "madd.s %[out1], %[in2], %[in1], %[t1] \t\n" - "mul.s %[s1], %[s1], %[c1] \t\n" + "mul.s %[s1], %[s1], %[f9] \t\n" "madd.s %[out2], %[in4], %[in3], %[t1] \t\n" "mul.s %[out3], %[in5], %[t0] \t\n" "mul.s %[out4], %[in6], %[t0] \t\n" @@ -1211,18 +1207,18 @@ static void imdct36_mips_float(float *out, float *buf, float *in, float *win) "swc1 %[out3], 4*13*4(%[buf]) \t\n" "swc1 %[out4], 4*4*4(%[buf]) \t\n" - : [c1] "=&f" (c1), [c2] "=&f" 
(c2), - [in1] "=&f" (in1), [in2] "=&f" (in2), + : [in1] "=&f" (in1), [in2] "=&f" (in2), [in3] "=&f" (in3), [in4] "=&f" (in4), [in5] "=&f" (in5), [in6] "=&f" (in6), [out1] "=&f" (out1), [out2] "=&f" (out2), [out3] "=&f" (out3), [out4] "=&f" (out4), [t0] "=&f" (t0), [t1] "=&f" (t1), - [t2] "=&f" (t2), [t3] "=&f" (t3), [s0] "=&f" (s0), [s1] "=&f" (s1), [s2] "=&f" (s2), [s3] "=&f" (s3) : [tmp] "r" (tmp), [win] "r" (win), - [buf] "r" (buf), [out] "r" (out) + [buf] "r" (buf), [out] "r" (out), + [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), [f4]"f"(f4), + [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9) : "memory" ); }