;******************************************************************************
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; ICT coefficients. Each one is replicated 8 times (32 bytes) so that a single
; aligned load fills a whole YMM register; the XMM variants simply read the
; first 16 bytes of the same table.
pf_ict0: times 8 dd 1.402
pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772

SECTION .text

;***********************************************************************
; ff_ict_float_{sse,avx,fma4,fma3}(float *src0, float *src1, float *src2,
;                                  int csize)
;
; In-place transform of three float buffers, one vector per iteration:
;     src0' = src0 + pf_ict0 * src2
;     src1' = src0 - pf_ict1 * src1 - pf_ict2 * src2
;     src2' = src0 + pf_ict3 * src1
;
; Macro argument:
;     %1 = number of vector registers declared to cglobal
;          (10 for SSE, which may use m9; 9 for the others, which stop at m8)
;
; Requirements visible from the code:
;   - every load/store is movaps, so the three pointers must be aligned
;     to mmsize bytes;
;   - the loop body runs before the counter is tested and always handles a
;     full vector, so at least mmsize bytes of each buffer are touched and
;     the final iteration covers a whole vector even when csize is not a
;     multiple of the vector width.
;***********************************************************************
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    ; Turn the element count into a byte count (the 32-bit shift also clears
    ; the upper half of csizeq), point each srcN at the end of its buffer and
    ; walk a negative offset up towards zero.
    shl  csized, 2
    add   src0q, csizeq
    add   src1q, csizeq
    add   src2q, csizeq
    neg  csizeq

    ; ICT0/ICT1 always live in registers.
    movaps   m6, [pf_ict0]
    movaps   m7, [pf_ict1]
    %define ICT0 m6
    %define ICT1 m7

%if ARCH_X86_64
    ; 16 vector registers available: keep every coefficient in a register.
    movaps   m8, [pf_ict2]
    %define ICT2 m8
%if cpuflag(avx)
    ; The 3-operand code paths never need m3 as a temporary, so reuse it.
    movaps   m3, [pf_ict3]
    %define ICT3 m3
%else
    ; The SSE path uses m3 as scratch, so ICT3 needs a register of its own.
    movaps   m9, [pf_ict3]
    %define ICT3 m9
%endif

%else ; ARCH_X86_32
    ; Only m0-m7 exist: ICT2 is always used as a memory operand.
    %define ICT2 [pf_ict2]
%if cpuflag(avx)
    movaps   m3, [pf_ict3]
    %define ICT3 m3
%else
    %define ICT3 [pf_ict3]
%endif

%endif ; ARCH

align 16
.loop:
    movaps   m0, [src0q+csizeq]
    movaps   m1, [src1q+csizeq]
    movaps   m2, [src2q+csizeq]

%if cpuflag(fma4) || cpuflag(fma3)
%if cpuflag(fma4)
    fnmaddps m5, m1, ICT1, m0       ; m5 = src0 - src1*ICT1
    fmaddps  m4, m2, ICT0, m0       ; m4 = src0 + src2*ICT0
%else ; fma3
    ; The destination has to be one of the sources here, so copy first.
    movaps   m5, m1
    movaps   m4, m2
    fnmaddps m5, m5, ICT1, m0       ; m5 = src0 - src1*ICT1
    fmaddps  m4, m4, ICT0, m0       ; m4 = src0 + src2*ICT0
%endif
    fmaddps  m0, m1, ICT3, m0       ; m0 = src0 + src1*ICT3
    fnmaddps m5, m2, ICT2, m5       ; m5 -= src2*ICT2
%else ; non FMA
%if cpuflag(avx)
    mulps    m5, m1, ICT1           ; m5 = src1*ICT1
    mulps    m4, m2, ICT0           ; m4 = src2*ICT0
    mulps    m1, m1, ICT3           ; m1 = src1*ICT3
    mulps    m2, m2, ICT2           ; m2 = src2*ICT2
    subps    m5, m0, m5             ; m5 = src0 - src1*ICT1
%else ; sse
    ; 2-operand forms: work on copies so the original inputs survive.
    movaps   m3, m1
    movaps   m4, m2
    movaps   m5, m0
    mulps    m3, ICT1               ; m3 = src1*ICT1
    mulps    m4, ICT0               ; m4 = src2*ICT0
    mulps    m1, ICT3               ; m1 = src1*ICT3
    mulps    m2, ICT2               ; m2 = src2*ICT2
    subps    m5, m3                 ; m5 = src0 - src1*ICT1
%endif
    addps    m4, m4, m0             ; m4 = src0 + src2*ICT0
    addps    m0, m0, m1             ; m0 = src0 + src1*ICT3
    subps    m5, m5, m2             ; m5 -= src2*ICT2
%endif

    movaps   [src0q+csizeq], m4
    movaps   [src2q+csizeq], m0
    movaps   [src1q+csizeq], m5
    add  csizeq, mmsize
    jl .loop                        ; offset still negative: more data left
    RET
%endmacro

; Every variant newer than the SSE baseline is gated on the matching
; HAVE_*_EXTERNAL flag, so an assembler lacking that instruction set can
; still build this file.
INIT_XMM sse
ICT_FLOAT 10
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
ICT_FLOAT 9
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
ICT_FLOAT 9
%endif

;***************************************************************************
; ff_rct_int_{sse2,avx2}(int32_t *src0, int32_t *src1, int32_t *src2,
;                        int csize)
;
; In-place transform of three int32 buffers, one vector per iteration:
;     t     = src0 - ((src1 + src2) >> 2)      (arithmetic shift)
;     src0' = t + src2
;     src1' = t
;     src2' = t + src1
;
; Same requirements as ict_float: aligned accesses (mova) on all three
; pointers, and the loop always processes whole vectors, running at least
; once.
;***************************************************************************
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    ; Element count -> byte count, pointers to buffer ends, negative offset.
    shl  csized, 2
    add   src0q, csizeq
    add   src1q, csizeq
    add   src2q, csizeq
    neg  csizeq

align 16
.loop:
    mova   m1, [src1q+csizeq]
    mova   m2, [src2q+csizeq]
    mova   m0, [src0q+csizeq]
    paddd  m3, m1, m2               ; m3 = src1 + src2
    psrad  m3, 2                    ; m3 >>= 2 (sign-preserving)
    psubd  m0, m3                   ; m0 = t
    paddd  m1, m0                   ; m1 = t + src1
    paddd  m2, m0                   ; m2 = t + src2
    mova   [src1q+csizeq], m0
    mova   [src2q+csizeq], m1
    mova   [src0q+csizeq], m2
    add  csizeq, mmsize
    jl .loop                        ; offset still negative: more data left
    RET
%endmacro

INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif