; ffmpeg/libavcodec/x86/lpc.asm
; Commit 0922c6b01b (James Almer): x86/lpc: use fused negative multiply-add
; instructions where useful
; Signed-off-by: James Almer <jamrial@gmail.com>
; 2022-09-22 18:17:26 -03:00
;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; Constants for evaluating the Welch window weight 1.0 - w*w.  The position
; accumulator w advances by +/-1 per sample, i.e. +/-4 per 4-sample AVX2
; iteration, +/-2 per 2-sample SSE2 iteration, +/-1 in the scalar tails.
one_tab: times 4 dq 1.0 ; the constant 1.0 of "1.0 - w*w"
seq_tab_avx2: dq 3.0, 2.0, 1.0, 0.0 ; per-lane starting w offsets, even-length path (AVX2)
sub_tab: dq -1.0, -2.0, -3.0, -4.0 ; per-lane w offsets (odd-length start; also even scalar-tail rebias)
add_tab_avx2: times 4 dq 4.0 ; w step per AVX2 iteration, walking outwards (even path)
dec_tab_avx2: times 4 dq -4.0 ; w step per AVX2 iteration, walking inwards (odd path)
add_tab_sse2: times 2 dq 2.0 ; w step per SSE2 iteration, walking outwards (even path)
dec_tab_sse2: times 2 dq -2.0 ; w step per SSE2 iteration, walking inwards (odd path)
dec_tab_scalar: times 2 dq -1.0 ; w step per sample in the scalar tails
seq_tab_sse2: dq 1.0, 0.0 ; per-lane starting w offsets, even-length path (SSE2)
SECTION .text
%macro APPLY_WELCH_FN 0
;------------------------------------------------------------------------------
; void lpc_apply_welch_window(const int32_t *data, ptrdiff_t len, double *out)
;
; Multiply each int32 sample by a Welch window weight and widen to double:
;     out[i] = data[i] * (1.0 - w*w)
; where w advances by +/-1 per sample, offset by c = 2.0/(len-1) (equivalent
; to the reference scalar lpc_apply_welch_window_c in libavcodec/lpc.c).
; The window is symmetric, so each iteration computes one vector of weights
; and applies it to a block near one end and, lane-reversed, to the mirrored
; block near the other end.
;
; Register roles: m0 = w accumulator, m6 = 1.0 (kept live), m7 = per-step w
; increment; off1/off2 = byte offsets into data for the two mirrored halves.
; Stores into out use off*2 because out elements (8 B) are twice as wide as
; data elements (4 B).
;------------------------------------------------------------------------------
cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
cmp lenq, 0
je .end_e ; len == 0: nothing to do
cmp lenq, 2
je .two ; len == 2: both weights are 0.0 (see .two)
cmp lenq, 1
je .one ; len == 1: single weight is 0.0
movapd m6, [one_tab] ; m6 = 1.0 in every lane
movd xm1, lend
cvtdq2pd xm1, xm1 ; len
%if cpuflag(avx2)
vbroadcastsd m1, xm1 ; (double)len in all 4 lanes
%else
shufpd m1, m1, 00b ; (double)len in both lanes
%endif
addpd m0, m6, m6 ; 2.0
subpd m1, m6 ; len - 1
divpd m0, m1 ; 2.0 / (len - 1)
mov off1q, lenq
and off1q, 1 ; ZF set iff len is even
je .even
; --- Odd length: walk inwards from both ends; the centre sample is
; written separately at .end_o. ---
movapd m5, m0 ; NOTE(review): copy of 2/(len-1); appears unused below
addpd m0, [sub_tab] ; w = {c-1, c-2, ...} for data[0..], c = 2/(len-1)
lea off2q, [lenq*4 - mmsize/2] ; byte offset of the last vector-sized sample block
sub lenq, mmsize/4 ; avoid overwriting
xor off1q, off1q ; front half starts at data[0]
cmp lenq, mmsize/4
jl .scalar_o ; too few samples for a full vector iteration
%if cpuflag(avx2)
movapd m7, [dec_tab_avx2] ; w -= 4.0 per 4-sample iteration
%else
movapd m7, [dec_tab_sse2] ; w -= 2.0 per 2-sample iteration
%endif
.loop_o:
movapd m1, m6
%if cpuflag(avx2)
fnmaddpd m1, m0, m0, m1 ; m1 = 1.0 - w*w (fused negative multiply-add)
vpermpd m2, m1, q0123 ; reversed weights for the mirrored back block
%else
mulpd m2, m0, m0
subpd m1, m2 ; m1 = 1.0 - w*w
shufpd m2, m1, m1, 01b ; swapped weights for the mirrored back block
%endif
cvtdq2pd m3, [dataq + off1q] ; front samples, int32 -> double
cvtdq2pd m4, [dataq + off2q] ; back samples, int32 -> double
mulpd m1, m3
mulpd m2, m4
movupd [outq + off1q*2], m1 ; *2: int32 byte offset -> double byte offset
movupd [outq + off2q*2], m2
addpd m0, m7 ; advance w inwards
add off1q, mmsize/2
sub off2q, mmsize/2
sub lenq, mmsize/4
jg .loop_o
; Decide (flags only) whether samples besides the centre one remain for the
; scalar tail; lenq is restored right after.
add lend, (mmsize/4 - 1)
cmp lend, 0
je .end_o
sub lenq, (mmsize/4 - 1)
.scalar_o:
movapd xm7, [dec_tab_scalar] ; scalar tail: w -= 1.0 per sample
; Set offsets
add off2q, (mmsize/4) + 4*cpuflag(avx2) ; rebias back offset for 1-sample steps
add lenq, mmsize/4 - 2
.loop_o_scalar:
; One sample from each end per iteration; only lane 0 of each product is
; stored (movlpd), both ends using the same weight.
movapd xm1, xm6
%if cpuflag(avx2)
fnmaddpd xm1, xm0, xm0, xm1 ; xm1 = 1.0 - w*w
%else
mulpd xm2, xm0, xm0
subpd xm1, xm2 ; xm1 = 1.0 - w*w
%endif
cvtdq2pd xm3, [dataq + off1q]
cvtdq2pd xm4, [dataq + off2q]
mulpd xm3, xm1
mulpd xm4, xm1
movlpd [outq + off1q*2], xm3
movlpd [outq + off2q*2], xm4
addpd xm0, xm7 ; w -= 1
add off1q, 4
sub off2q, 4
sub lenq, 2
jg .loop_o_scalar
.end_o:
xorpd xm3, xm3
movlpd [outq + off1q*2], xm3 ; centre sample of an odd-length window is 0.0
RET
.even:
; --- Even length: no centre sample; walk outwards from the middle. ---
%if cpuflag(avx2)
addpd m0, [seq_tab_avx2] ; w = c + {3,2,1,0}
%else
addpd m0, [seq_tab_sse2] ; w = c + {1,0}
%endif
mov off1d, lend
shr off1d, 1
movd xm1, off1d
cvtdq2pd xm1, xm1 ; len/2
%if cpuflag(avx2)
vbroadcastsd m1, xm1 ; (double)(len/2) in all lanes
%else
shufpd m1, m1, 00b
%endif
subpd m0, m1 ; w = c + lane - len/2: weights for the block left of centre
%if cpuflag(avx2)
movapd m7, [add_tab_avx2] ; w += 4.0 per 4-sample iteration
%else
movapd m7, [add_tab_sse2] ; w += 2.0 per 2-sample iteration
%endif
lea off2q, [lenq*2] ; byte offset of data[len/2]: right half, grows upwards
lea off1q, [lenq*2 - mmsize/2] ; block just left of centre, shrinks downwards
sub lenq, mmsize/4
cmp lenq, mmsize/4
jl .scalar_e
.loop_e:
movapd m1, m6
%if cpuflag(avx2)
fnmaddpd m1, m0, m0, m1 ; m1 = 1.0 - w*w (fused negative multiply-add)
%else
mulpd m2, m0, m0
subpd m1, m2 ; m1 = 1.0 - w*w
%endif
%if cpuflag(avx2)
vpermpd m2, m1, q0123 ; reversed weights for the mirrored right block
%else
shufpd m2, m1, m1, 01b ; swapped weights for the mirrored right block
%endif
cvtdq2pd m3, [dataq + off1q] ; left-half samples, int32 -> double
cvtdq2pd m4, [dataq + off2q] ; right-half samples, int32 -> double
mulpd m1, m3
mulpd m2, m4
movupd [outq + off1q*2], m1
movupd [outq + off2q*2], m2
addpd m0, m7 ; advance w outwards
add off2q, mmsize/2
sub off1q, mmsize/2
sub lenq, mmsize/4
jge .loop_e
.scalar_e:
; Rewind w and the offsets past the vector loop's overshoot, then finish
; one sample per side per iteration.
subpd xm0, xm7 ; undo the final vector w step
movapd xm7, [dec_tab_scalar]
subpd xm0, xm7 ; w += 1 (step constant is -1.0)
add off1q, (mmsize/2)
sub off2q, (mmsize/2) - 8*cpuflag(avx2)
add lenq, 6 + 4*cpuflag(avx2)
addpd xm0, [sub_tab] ; rebias w lanes for the scalar loop
.loop_e_scalar:
movapd xm1, xm6
%if cpuflag(avx2)
fnmaddpd xm1, xm0, xm0, xm1 ; xm1 = 1.0 - w*w
%else
mulpd xm2, xm0, xm0
subpd xm1, xm2 ; xm1 = 1.0 - w*w
%endif
cvtdq2pd xm3, [dataq + off1q]
cvtdq2pd xm4, [dataq + off2q]
mulpd xm3, xm1 ; left sample: weight lane 0 stored via movlpd below
shufpd xm1, xm1, 00b ; broadcast lane 0 for the mirrored right sample
mulpd xm4, xm1
movlpd [outq + off1q*2], xm3
movhpd [outq + off2q*2 + 8], xm4 ; high lane -> next right-half slot
subpd xm0, xm7 ; w += 1 (subtracting the -1.0 step)
add off2q, 4
sub off1q, 4
sub lenq, 2
jg .loop_e_scalar
RET
.two:
; len == 2: c = 2/(len-1) = 2, so both weights are 1 - (c-1)^2 = 0.0.
xorpd xm0, xm0
movhpd [outq + 8], xm0 ; out[1] = 0.0; fall through to zero out[0]
.one:
xorpd xm0, xm0
movhpd [outq], xm0 ; out[0] = 0.0
.end_e:
RET
%endmacro
; Instantiate the window function once per SIMD level (x86inc appends the
; cpuflag suffix to the cglobal name, e.g. ..._sse2 / ..._avx2).
INIT_XMM sse2
APPLY_WELCH_FN
%if HAVE_AVX2_EXTERNAL ; only when the assembler supports AVX2
INIT_YMM avx2 ; 256-bit path; uses fnmaddpd (FMA) inside the macro
APPLY_WELCH_FN
%endif