;******************************************************************************
;* ALAC DSP SIMD optimizations
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; Undo ALAC's stereo decorrelation, 8 samples (two XMM registers) per
; iteration. Roughly the following C loop, vectorized:
;     for (i = 0; i < len; i++) {
;         a  = buf[0][i];
;         b  = buf[1][i];
;         a -= (b * weight) >> shift;
;         b += a;
;         buf[0][i] = b;
;         buf[1][i] = a;
;     }
; pmulld needs SSE4.1, hence INIT_XMM sse4.
INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define buf1q r2q
%endif
    movd      m6, shiftm
    movd      m7, weightm
    SPLATD    m7                        ; broadcast weight to all four lanes
    shl       lend, 2                   ; sample count -> byte count
    mov       buf1q, [buf0q + gprsize]  ; buf0 arg points to int32_t *buf[2]
    mov       buf0q, [buf0q]
    add       buf1q, lenq               ; point past the end of each channel
    add       buf0q, lenq               ; and count lenq up from -len to 0
    neg       lenq

align 16
.loop:
    mova      m0, [buf0q + lenq]
    mova      m1, [buf0q + lenq + mmsize]
    mova      m2, [buf1q + lenq]
    mova      m3, [buf1q + lenq + mmsize]
    pmulld    m4, m2, m7                ; b * weight
    pmulld    m5, m3, m7
    psrad     m4, m6                    ; >> shift
    psrad     m5, m6
    psubd     m0, m4                    ; a -= (b * weight) >> shift
    psubd     m1, m5
    paddd     m2, m0                    ; b += a
    paddd     m3, m1
    mova      [buf1q + lenq], m0
    mova      [buf1q + lenq + mmsize], m1
    mova      [buf0q + lenq], m2
    mova      [buf0q + lenq + mmsize], m3

    add       lenq, mmsize*2
    jl .loop

    RET

; Shift decoded samples up and OR in the extra bits that ALAC stores
; separately for high-depth streams. Per channel, roughly:
;     buf[ch][i] = (buf[ch][i] << exbits) | exbuf[ch][i];
INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    movifnidn lend, lenm
    movd      m4, r2m                   ; exbits
    shl       lend, 2                   ; sample count -> byte count
    mov       buf1q,   [buf0q + gprsize]
    mov       buf0q,   [buf0q]
    mov       exbuf1q, [exbuf0q + gprsize]
    mov       exbuf0q, [exbuf0q]
    add       buf1q,   lenq
    add       buf0q,   lenq
    add       exbuf1q, lenq
    add       exbuf0q, lenq
    neg       lenq

align 16
.loop:
    mova      m0, [buf0q + lenq]
    mova      m1, [buf0q + lenq + mmsize]
    pslld     m0, m4                    ; buf << exbits
    pslld     m1, m4
    mova      m2, [buf1q + lenq]
    mova      m3, [buf1q + lenq + mmsize]
    pslld     m2, m4
    pslld     m3, m4
    por       m0, [exbuf0q + lenq]      ; | exbuf
    por       m1, [exbuf0q + lenq + mmsize]
    por       m2, [exbuf1q + lenq]
    por       m3, [exbuf1q + lenq + mmsize]
    mova      [buf0q + lenq         ], m0
    mova      [buf0q + lenq + mmsize], m1
    mova      [buf1q + lenq         ], m2
    mova      [buf1q + lenq + mmsize], m3

    add       lenq, mmsize*2
    jl .loop

    RET

; Same operation for a single channel; the ch argument of the shared
; prototype is unused here:
;     buf[i] = (buf[i] << exbits) | exbuf[i];
%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    movifnidn lend, r4m
    movd      m2, exbitsm
    shl       lend, 2                   ; sample count -> byte count
    mov       bufq,   [bufq]            ; load channel 0 from int32_t *buf[2]
    mov       exbufq, [exbufq]
    add       bufq,   lenq
    add       exbufq, lenq
    neg       lenq

align 16
.loop:
    mova      m0, [bufq + lenq]
    mova      m1, [bufq + lenq + mmsize]
    pslld     m0, m2                    ; buf << exbits
    pslld     m1, m2
    por       m0, [exbufq + lenq]       ; | exbuf
    por       m1, [exbufq + lenq + mmsize]
    mova      [bufq + lenq], m0
    mova      [bufq + lenq + mmsize], m1

    add       lenq, mmsize*2
    jl .loop

    RET