sbrdsp.asm: convert all instructions to float/SSE ones.

Since the values are floats, using the float operations makes sense: it improves performance on some CPUs and makes the code SSE-compatible instead of requiring SSE2. Based on a suggestion by Jason.

Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
2024-12-24 16:22:37 +00:00 · 2012-03-07 21:35:13 +01:00 · 2012-03-07 21:35:13 +01:00 · 6eda85e15b
commit 6eda85e15b
parent 3416d0805e
1 changed files with 8 additions and 8 deletions
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -82,14 +82,14 @@ cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r0, [r0 + r3*8]
    neg         r3
 .loop4:
-    movq        m0, [r2 + 4*r3 + 0]
-    movq        m1, [r2 + 4*r3 + 8]
-    movq        m2, [r1 + 0*STEP]
-    movq        m3, [r1 + 2*STEP]
+    movlps      m0, [r2 + 4*r3 + 0]
+    movlps      m1, [r2 + 4*r3 + 8]
+    movlps      m2, [r1 + 0*STEP]
+    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
-    punpckldq   m0, m0
-    punpckldq   m1, m1
+    unpcklps    m0, m0
+    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
@@ -101,8 +101,8 @@ cglobal sbr_hf_g_filt, 5, 6, 5
    jz          .end
 .loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
-    movq        m2, [r1]
-    punpckldq   m0, m0
+    movlps      m2, [r1]
+    unpcklps    m0, m0
    mulps       m2, m0
    movlps    [r0], m2
    add         r0, 8