ffmpeg/libavutil/x86/float_dsp_init.c

/*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"

/*
 * Prototypes of the routines that ff_float_dsp_init_x86() installs under its
 * EXTERNAL_* CPU checks.  They are only declared here.
 * NOTE(review): presumably implemented in the external x86 assembly
 * sources -- confirm against the build files.
 * The mix of declarations with and without 'extern' is cosmetic only: a
 * function declaration has external linkage either way.
 */

/* Candidates for AVFloatDSPContext.vector_fmul (SSE, AVX). */
extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                               int len);
extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                               int len);

/* Candidates for AVFloatDSPContext.vector_fmac_scalar (SSE, AVX). */
extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                                      int len);
extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                                      int len);

/* Candidate for AVFloatDSPContext.vector_fmul_scalar (SSE only). */
extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                                      int len);

/* Candidates for AVFloatDSPContext.vector_dmul_scalar (SSE2, AVX). */
extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                       double mul, int len);
extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                                      double mul, int len);

/* Candidates for AVFloatDSPContext.vector_fmul_add (SSE, AVX). */
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

/* Candidates for AVFloatDSPContext.vector_fmul_reverse (SSE, AVX). */
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

/* Candidate for AVFloatDSPContext.scalarproduct_float (SSE only). */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

#if HAVE_6REGS && HAVE_INLINE_ASM
/**
 * Windowed overlap of two float vectors, 3DNow!ext inline-asm version.
 *
 * With i running over [-len, 0) and j = -i - 1 (float indices), every
 * iteration pair produces
 *     dst[len + i] = src0[len + i] * win[len + j] - src1[j] * win[len + i]
 *     dst[len + j] = src0[len + i] * win[len + i] + src1[j] * win[len + j]
 *
 * @param dst  output, 2 * len floats are written
 * @param src0 first input, len floats are read
 * @param src1 second input, len floats are read (walked from the end)
 * @param win  window, 2 * len floats are read
 * @param len  number of floats in each input; two floats are handled from
 *             each end per iteration and the loop body runs before the
 *             termination test, so len has to be a positive multiple of 2
 *
 * NOTE(review): mm0-mm5 are modified without being listed as clobbers.
 * Left unchanged here because register-name clobbers depend on the compiler
 * target flags; consider the project's clobber helper macros if available.
 */
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;     /* byte offset from the "+ len" bases, counts up to 0 */
    x86_reg j =  len * 4 - 8; /* byte offset of the last float pair, counts down */
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n"
        "femms                          \n" // leave MMX/3DNow! state again
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The asm loads and stores through pointers that are only passed as
         * input addresses; without this clobber the compiler is never told
         * that memory is read or written by the statement. */
        : "memory"
    );
}

/**
 * Windowed overlap of two float vectors, SSE inline-asm version.
 *
 * With i running over [-len, 0) and j = -i - 1 (float indices), every
 * iteration pair produces
 *     dst[len + i] = src0[len + i] * win[len + j] - src1[j] * win[len + i]
 *     dst[len + j] = src0[len + i] * win[len + i] + src1[j] * win[len + j]
 *
 * @param dst  output, 2 * len floats are written
 * @param src0 first input, len floats are read
 * @param src1 second input, len floats are read (walked from the end)
 * @param win  window, 2 * len floats are read
 * @param len  number of floats in each input; four floats are handled from
 *             each end per iteration and the loop body runs before the
 *             termination test, so len has to be a positive multiple of 4
 *
 * All loads and stores use movaps, so every address touched must be
 * 16-byte aligned.
 *
 * NOTE(review): xmm0-xmm5 are modified without being listed as clobbers.
 * Left unchanged here because register-name clobbers depend on the compiler
 * target flags; consider the project's clobber helper macros if available.
 */
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;      /* byte offset from the "+ len" bases, counts up to 0 */
    x86_reg j =  len * 4 - 16; /* byte offset of the last float quad, counts down */
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n" // reverse the 4 floats of win[len + j]
        "shufps $0x1b, %%xmm5, %%xmm5   \n" // reverse the 4 floats of src1[j]
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n" // back to memory order for dst[len + j]
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        /* The asm loads and stores through pointers that are only passed as
         * input addresses; without this clobber the compiler is never told
         * that memory is read or written by the statement. */
        : "memory"
    );
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */

/**
 * Install the x86-specific routines into a float DSP context.
 *
 * The CPU flags are queried once.  The checks run from the oldest to the
 * newest instruction set and each matching block overwrites the pointers
 * set by an earlier one, so the last matching extension wins for a slot.
 * Slots for which no check matches are left untouched.
 *
 * @param fdsp context whose function pointers are updated in place
 */
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_INLINE_ASM
    /* Inline-asm window function: SSE replaces 3DNow!ext when both match. */
    if (INLINE_AMD3DNOWEXT(cpu_flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
    }
    if (INLINE_SSE(cpu_flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_sse;
    }
#endif

    /* Externally implemented single-precision routines. */
    if (EXTERNAL_SSE(cpu_flags)) {
        fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_sse;
        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_sse;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_sse;
        fdsp->vector_fmul         = ff_vector_fmul_sse;
    }

    /* The double-precision multiply is first available with SSE2. */
    if (EXTERNAL_SSE2(cpu_flags)) {
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
    }

    /* AVX overrides the slots it implements; vector_fmul_scalar and
     * scalarproduct_float have no AVX version and keep their earlier value. */
    if (EXTERNAL_AVX(cpu_flags)) {
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_avx;
        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_avx;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_avx;
        fdsp->vector_fmul         = ff_vector_fmul_avx;
    }
}
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`/*`
			`* This file is part of Libav.`
			`*`
			`* Libav is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* Libav is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with Libav; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "config.h"`

			`#include "libavutil/cpu.h"`
			`#include "libavutil/float_dsp.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 17:01:05 +00:00			`#include "cpu.h"`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 04:47:30 +00:00			`#include "asm.h"`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00
			`extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,`
			`int len);`
			`extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,`
			`int len);`

float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 03:20:59 +00:00			`extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,`
			`int len);`
			`extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,`
			`int len);`

x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-22 22:41:25 +00:00			`extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,`
			`int len);`

float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 19:00:53 +00:00			`extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,`
			`double mul, int len);`
			`extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,`
			`double mul, int len);`

floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 06:26:58 +00:00			`void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,`
			`const float *src2, int len);`
			`void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,`
			`const float *src2, int len);`

floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 21:20:30 +00:00			`void ff_vector_fmul_reverse_sse(float *dst, const float *src0,`
			`const float *src1, int len);`
			`void ff_vector_fmul_reverse_avx(float *dst, const float *src0,`
			`const float *src1, int len);`

floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 2013-01-20 23:41:52 +00:00			`float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);`

float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 16:58:25 +00:00			`#if HAVE_6REGS && HAVE_INLINE_ASM`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 04:47:30 +00:00			`static void vector_fmul_window_3dnowext(float *dst, const float *src0,`
			`const float *src1, const float *win,`
			`int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 8;`
			`__asm__ volatile (`
			`"1: \n"`
			`"pswapd (%5, %1), %%mm1 \n"`
			`"movq (%5, %0), %%mm0 \n"`
			`"pswapd (%4, %1), %%mm5 \n"`
			`"movq (%3, %0), %%mm4 \n"`
			`"movq %%mm0, %%mm2 \n"`
			`"movq %%mm1, %%mm3 \n"`
			`"pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]`
			`"pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]`
			`"pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]`
			`"pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]`
			`"pfadd %%mm3, %%mm2 \n"`
			`"pfsub %%mm0, %%mm1 \n"`
			`"pswapd %%mm2, %%mm2 \n"`
			`"movq %%mm1, (%2, %0) \n"`
			`"movq %%mm2, (%2, %1) \n"`
			`"sub $8, %1 \n"`
			`"add $8, %0 \n"`
			`"jl 1b \n"`
			`"femms \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`

			`static void vector_fmul_window_sse(float *dst, const float *src0,`
			`const float *src1, const float *win, int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 16;`
			`__asm__ volatile (`
			`"1: \n"`
			`"movaps (%5, %1), %%xmm1 \n"`
			`"movaps (%5, %0), %%xmm0 \n"`
			`"movaps (%4, %1), %%xmm5 \n"`
			`"movaps (%3, %0), %%xmm4 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm0, %%xmm2 \n"`
			`"movaps %%xmm1, %%xmm3 \n"`
			`"mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]`
			`"mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]`
			`"mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]`
			`"mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]`
			`"addps %%xmm3, %%xmm2 \n"`
			`"subps %%xmm0, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm2, %%xmm2 \n"`
			`"movaps %%xmm1, (%2, %0) \n"`
			`"movaps %%xmm2, (%2, %1) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`
float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 16:58:25 +00:00			`#endif /* HAVE_6REGS && HAVE_INLINE_ASM */`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 04:47:30 +00:00
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)`
			`{`
			`int mm_flags = av_get_cpu_flags();`

float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 2013-01-17 16:58:25 +00:00			`#if HAVE_6REGS && HAVE_INLINE_ASM`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-07 04:47:30 +00:00			`if (INLINE_AMD3DNOWEXT(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_3dnowext;`
			`}`
			`if (INLINE_SSE(mm_flags)) {`
			`fdsp->vector_fmul_window = vector_fmul_window_sse;`
			`}`
			`#endif`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 17:01:05 +00:00			`if (EXTERNAL_SSE(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`fdsp->vector_fmul = ff_vector_fmul_sse;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 03:20:59 +00:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;`
x86: float_dsp: add SSE version of vector_fmul_scalar() 2012-09-22 22:41:25 +00:00			`fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 06:26:58 +00:00			`fdsp->vector_fmul_add = ff_vector_fmul_add_sse;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 21:20:30 +00:00			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;`
floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 2013-01-20 23:41:52 +00:00			`fdsp->scalarproduct_float = ff_scalarproduct_float_sse;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`}`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 19:00:53 +00:00			`if (EXTERNAL_SSE2(mm_flags)) {`
			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;`
			`}`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 2012-08-29 17:01:05 +00:00			`if (EXTERNAL_AVX(mm_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`fdsp->vector_fmul = ff_vector_fmul_avx;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 2012-06-09 03:20:59 +00:00			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 2012-09-24 19:00:53 +00:00			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 2013-01-20 06:26:58 +00:00			`fdsp->vector_fmul_add = ff_vector_fmul_add_avx;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 2013-01-20 21:20:30 +00:00			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 2012-05-21 16:58:41 +00:00			`}`
			`}`