; Web-view metadata (not part of the source): mirror of https://git.ffmpeg.org/ffmpeg.git; NASM, 262 lines, 4.9 KiB
;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32

; Packed double-precision constants for lpc_apply_welch_window.
; Order and sizes are kept exactly as declared so every table keeps its
; offset (and therefore its alignment) inside this 32-byte aligned section.
one_tab:        dq  1.0,  1.0,  1.0,  1.0   ; 1.0 in every lane
seq_tab_avx2:   dq  3.0,  2.0,  1.0,  0.0   ; even length: initial lane offsets (4 lanes)
sub_tab:        dq -1.0, -2.0, -3.0, -4.0   ; odd length: initial lane offsets
add_tab_avx2:   dq  4.0,  4.0,  4.0,  4.0   ; even length: per-iteration step, 4 lanes
dec_tab_avx2:   dq -4.0, -4.0, -4.0, -4.0   ; odd length: per-iteration step, 4 lanes
add_tab_sse2:   dq  2.0,  2.0               ; even length: per-iteration step, 2 lanes
dec_tab_sse2:   dq -2.0, -2.0               ; odd length: per-iteration step, 2 lanes
dec_tab_scalar: dq -1.0, -1.0               ; step of the one-pair-per-pass tail loops
seq_tab_sse2:   dq  1.0,  0.0               ; even length: initial lane offsets (2 lanes)

SECTION .text

;------------------------------------------------------------------------------
; lpc_apply_welch_window(data, len, out)
;
; Converts `len` 32-bit integers at `data` to doubles, scales each one by a
; window weight 1.0 - w*w and stores the results at `out`.  Samples are
; handled as mirrored pairs that share one weight, with c = 2.0 / (len - 1):
;   odd  len: out[i] and out[len-1-i]          use w = c - 1 - i; the middle
;             sample is written as 0.0
;   even len: out[len/2-1-i] and out[len/2+i]  use w = c - len/2 + i
; len == 0 writes nothing; len == 1 and len == 2 write only zeros.
;
; Register roles:
;   dataq, outq   input / output base pointers
;   lenq          element count, reused as the loop counter
;   off1q, off2q  byte offsets into data for the two halves (doubled for out,
;                 since outputs are 8 bytes and inputs 4 bytes wide)
;   m0            w for the lanes being processed
;   m6            1.0 in every lane
;   m7            per-iteration step applied to m0
;   m1-m4         temporaries
;
; The macro is expanded once per instruction set (INIT_XMM sse2 / INIT_YMM
; avx2 at the end of the file); mmsize selects the vector width and
; cpuflag(avx2) the fused-multiply / lane-permute variants.
;------------------------------------------------------------------------------
%macro APPLY_WELCH_FN 0
cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
    ; Lengths 0, 1 and 2 are handled by the stubs at the end of the function.
    test lenq, lenq
    je .end_e
    cmp lenq, 2
    je .two
    cmp lenq, 1
    je .one

    movapd m6, [one_tab]            ; 1.0 in every lane

    movd xm1, lend
    cvtdq2pd xm1, xm1               ; len
%if cpuflag(avx2)
    vbroadcastsd m1, xm1
%else
    shufpd m1, m1, 00b
%endif

    addpd m0, m6, m6                ; 2.0
    subpd m1, m6                    ; len - 1
    divpd m0, m1                    ; c = 2.0 / (len - 1)

    mov off1q, lenq
    and off1q, 1
    je .even                        ; low bit clear: even length

    ; ------------------------------ odd length ------------------------------
    addpd m0, [sub_tab]             ; lane j: w = c - 1 - j

    lea off2q, [lenq*4 - mmsize/2]  ; byte offset of the last input vector
    sub lenq, mmsize/4              ; avoid overwriting
    xor off1q, off1q                ; front half starts at element 0

    cmp lenq, mmsize/4
    jl .scalar_o

%if cpuflag(avx2)
    movapd m7, [dec_tab_avx2]
%else
    movapd m7, [dec_tab_sse2]
%endif

    ; One vector from the front and one from the back per iteration.
.loop_o:
    movapd m1, m6
%if cpuflag(avx2)
    fnmaddpd m1, m0, m0, m1         ; 1.0 - w*w
    vpermpd m2, m1, q0123           ; lane-reversed weights for the back half
%else
    mulpd m2, m0, m0
    subpd m1, m2                    ; 1.0 - w*w
    shufpd m2, m1, m1, 01b          ; lane-swapped weights for the back half
%endif

    cvtdq2pd m3, [dataq + off1q]
    cvtdq2pd m4, [dataq + off2q]

    mulpd m1, m3
    mulpd m2, m4

    movupd [outq + off1q*2], m1
    movupd [outq + off2q*2], m2

    addpd m0, m7                    ; advance w to the next group of lanes
    add off1q, mmsize/2
    sub off2q, mmsize/2
    sub lenq, mmsize/4
    jg .loop_o

    ; lenq is now in (-mmsize/4, 0].  The 32-bit add yields a non-negative
    ; value, so its zero-extension leaves lenq correct; zero means only the
    ; middle sample is left.
    add lend, (mmsize/4 - 1)
    cmp lend, 0
    je .end_o
    sub lenq, (mmsize/4 - 1)

.scalar_o:
    movapd xm7, [dec_tab_scalar]

    ; Set offsets: off2q moves from the first to the last element of the
    ; pending back vector, lenq becomes the count for a one-pair-per-pass loop.
    add off2q, (mmsize/4) + 4*cpuflag(avx2)
    add lenq, mmsize/4 - 2

    ; One pair per pass; only lane 0 of each result is stored.
.loop_o_scalar:
    movapd xm1, xm6
%if cpuflag(avx2)
    fnmaddpd xm1, xm0, xm0, xm1
%else
    mulpd xm2, xm0, xm0
    subpd xm1, xm2
%endif

    cvtdq2pd xm3, [dataq + off1q]
    ; Load a single dword for the back sample: a 64-bit load here would read
    ; past data[len - 1] when off2q addresses the last element (which happens
    ; whenever no vector iteration ran).  Lane 0 is the only lane stored.
    movd xm4, [dataq + off2q]
    cvtdq2pd xm4, xm4

    mulpd xm3, xm1
    mulpd xm4, xm1

    movlpd [outq + off1q*2], xm3
    movlpd [outq + off2q*2], xm4

    addpd xm0, xm7

    add off1q, 4
    sub off2q, 4

    sub lenq, 2
    jg .loop_o_scalar

.end_o:
    ; off1q now addresses the middle sample; it is written as 0.0.
    xorpd xm3, xm3
    movlpd [outq + off1q*2], xm3
    RET

    ; ------------------------------ even length -----------------------------
.even:
%if cpuflag(avx2)
    addpd m0, [seq_tab_avx2]
%else
    addpd m0, [seq_tab_sse2]
%endif

    mov off1d, lend
    shr off1d, 1
    movd xm1, off1d
    cvtdq2pd xm1, xm1               ; len/2
%if cpuflag(avx2)
    vbroadcastsd m1, xm1
%else
    shufpd m1, m1, 00b
%endif
    subpd m0, m1                    ; w = c - len/2 + i, highest i in lane 0

%if cpuflag(avx2)
    movapd m7, [add_tab_avx2]
%else
    movapd m7, [add_tab_sse2]
%endif

    lea off2q, [lenq*2]             ; first element of the upper half
    lea off1q, [lenq*2 - mmsize/2]  ; last vector of the lower half
    sub lenq, mmsize/4

    cmp lenq, mmsize/4
    jl .scalar_e

    ; Work outwards from the middle, one vector per half per iteration.
.loop_e:
    movapd m1, m6
%if cpuflag(avx2)
    fnmaddpd m1, m0, m0, m1         ; 1.0 - w*w
    vpermpd m2, m1, q0123           ; lane-reversed weights for the upper half
%else
    mulpd m2, m0, m0
    subpd m1, m2                    ; 1.0 - w*w
    shufpd m2, m1, m1, 01b          ; lane-swapped weights for the upper half
%endif

    cvtdq2pd m3, [dataq + off1q]
    cvtdq2pd m4, [dataq + off2q]

    mulpd m1, m3
    mulpd m2, m4

    movupd [outq + off1q*2], m1
    movupd [outq + off2q*2], m2

    addpd m0, m7
    add off2q, mmsize/2
    sub off1q, mmsize/2
    sub lenq, mmsize/4
    jge .loop_e

.scalar_e:
    ; Rewind w by one vector step, then bias it so that lane 0 holds the
    ; weight of the pair just inside the next unprocessed one.
    subpd xm0, xm7
    movapd xm7, [dec_tab_scalar]
    subpd xm0, xm7

    ; Step both offsets back by one pair and undo the counter bias; the first
    ; pass therefore redoes one pair before moving outwards.
    add off1q, (mmsize/2)
    sub off2q, (mmsize/2) - 8*cpuflag(avx2)
    add lenq, 6 + 4*cpuflag(avx2)

    addpd xm0, [sub_tab]

    ; One pair per pass: lane 0 of xm3 goes to the lower half, lane 1 of xm4
    ; to the upper half.
.loop_e_scalar:
    movapd xm1, xm6
%if cpuflag(avx2)
    fnmaddpd xm1, xm0, xm0, xm1
%else
    mulpd xm2, xm0, xm0
    subpd xm1, xm2
%endif

    cvtdq2pd xm3, [dataq + off1q]
    cvtdq2pd xm4, [dataq + off2q]

    mulpd xm3, xm1
    shufpd xm1, xm1, 00b            ; same weight in both lanes
    mulpd xm4, xm1

    movlpd [outq + off1q*2], xm3
    movhpd [outq + off2q*2 + 8], xm4

    subpd xm0, xm7                  ; xm7 is -1.0, so w grows by 1.0

    add off2q, 4
    sub off1q, 4
    sub lenq, 2
    jg .loop_e_scalar
    RET

    ; --------------------------- len == 2 / 1 / 0 ---------------------------
.two:
    xorpd xm0, xm0
    movhpd [outq + 8], xm0          ; out[1] = 0.0
.one:
    xorpd xm0, xm0
    movhpd [outq], xm0              ; out[0] = 0.0
.end_e:
    RET
%endmacro

; Expand the window function once per supported instruction set.

; 128-bit build.
INIT_XMM sse2
APPLY_WELCH_FN

; 256-bit build.
INIT_YMM avx2
APPLY_WELCH_FN