;; mirror of https://git.ffmpeg.org/ffmpeg.git
;*****************************************************************************
;* x86-optimized functions for lut3d filter
;*
;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA

; Float constants replicated to fill a full 256-bit register (8 dwords),
; so the same symbols serve both the XMM and YMM code paths.
pd_1f: times 8 dd 1.0
pd_3f: times 8 dd 3.0
pd_65535f: times 8 dd 65535.0
pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0

; pshufb control: widen 4 packed u16 to 4 u32 lanes (0x80 selects a zero
; byte).  Used by LOAD16 on the AVX (non-AVX2) ymm path.
pb_shuffle16: db 0, 1, 0x80, 0x80, \
2, 3, 0x80, 0x80, \
4, 5, 0x80, 0x80, \
6, 7, 0x80, 0x80

; pshufb controls: pack 4 u32 lanes back to u16 into the low (resp. high)
; 8 bytes of an XMM register; the two results are OR-combined in STORE16.
pb_lo_pack_shuffle16: db 0, 1, 4, 5, \
8, 9, 12, 13, \
0x80, 0x80, 0x80, 0x80, \
0x80, 0x80, 0x80, 0x80

pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
0x80, 0x80, 0x80, 0x80, \
0, 1, 4, 5, \
8, 9, 12, 13

SECTION .text
; Offsets into the C struct Lut3DPreLut.
; NOTE(review): must stay in sync with the C declaration in
; libavfilter/vf_lut3d.h — verify when that header changes.
struc Lut3DPreLut
    .size: resd 1   ; number of entries per channel
    .min: resd 3    ; per-channel input minimum (r,g,b)
    .max: resd 3    ; per-channel input maximum (unused here)
    .scale: resd 3  ; per-channel scale into prelut index space
    .lut: resq 3    ; per-channel float* lookup tables
endstruc
; Offsets into the leading fields of the C struct LUT3DContext.
; NOTE(review): must stay in sync with libavfilter/vf_lut3d.h.
struc LUT3DContext
    .class: resq 1    ; AVClass* (skipped)
    .lut: resq 1      ; interleaved rgb float lut data
    .lutsize: resd 1  ; edge length of the cube
    .lutsize2: resd 1 ; lutsize * lutsize
    .scale: resd 3    ; per-channel input scale (r,g,b)
endstruc
%define AV_NUM_DATA_POINTERS 8

; Partial mirror of the C AVFrame layout; only the leading fields are
; accessed here.
; NOTE(review): offsets assume this matches libavutil/frame.h exactly —
; confirm whenever AVFrame's leading members change.
struc AVFrame
    .data: resq AV_NUM_DATA_POINTERS     ; plane pointers
    .linesize: resd AV_NUM_DATA_POINTERS ; bytes per row, per plane
    .extended_data: resq 1
    .width: resd 1
    .height: resd 1
endstruc
; Stack frame layout (allocated by cglobal: mmsize*16 + 8*8 bytes).
; First three vector rows double as per-lane gather scratch for the
; scalar (non-AVX2) gather fallbacks.
%define rm rsp
%define gm rsp+mmsize
%define bm rsp+(mmsize*2)

; Broadcast vector constants, one row each.
%define lut3dsizem [rsp+mmsize*3]   ; lutsize * 3 (floats per entry)
%define lut3dsize2m [rsp+mmsize*4]  ; lutsize^2 * 3
%define lut3dmaxm [rsp+mmsize*5]    ; lutsize - 1
%define prelutmaxm [rsp+mmsize*6]   ; prelut size - 1

; Per-channel scales into 3D lut index space.
%define scalerm [rsp+mmsize*7]
%define scalegm [rsp+mmsize*8]
%define scalebm [rsp+mmsize*9]

; Prelut per-channel input minima.
%define prelutminrm [rsp+mmsize*10]
%define prelutmingm [rsp+mmsize*11]
%define prelutminbm [rsp+mmsize*12]

; Prelut per-channel scales.
%define prelutscalerm [rsp+mmsize*13]
%define prelutscalegm [rsp+mmsize*14]
%define prelutscalebm [rsp+mmsize*15]

; data pointers (the 8*8-byte tail of the frame): current row pointer
; for each src/dst plane, advanced once per scanline.
%define srcrm [rsp+mmsize*16 + 0]
%define srcgm [rsp+mmsize*16 + 8]
%define srcbm [rsp+mmsize*16 + 16]
%define srcam [rsp+mmsize*16 + 24]

%define dstrm [rsp+mmsize*16 + 32]
%define dstgm [rsp+mmsize*16 + 40]
%define dstbm [rsp+mmsize*16 + 48]
%define dstam [rsp+mmsize*16 + 56]
; Scalar gather step for one lane of the prelut lookup: reads the
; 32-bit prev/next indices previously spilled to [rm]/[gm] at byte
; offset %3, loads the corresponding floats from the lut at tmpq, and
; overwrites the index slots with the fetched values.
; 1 - prev  (register number receiving the prev sample)
; 2 - next  (register number receiving the next sample)
; 3 - offset (byte offset of the lane in the scratch rows)
; clobbers tmp2/tmp3
%macro FETCH_PRELUT_PN 3
    mov tmp2d, [rm + %3]         ; prev index of this lane
    mov tmp3d, [gm + %3]         ; next index of this lane
    movss xm%1, [tmpq + tmp2q*4]
    movss xm%2, [tmpq + tmp3q*4]
    movss [rm + %3], xm%1        ; write samples back over the indices
    movss [gm + %3], xm%2
%endmacro
; Gather the prev/next prelut samples for every lane.  AVX2 uses
; hardware gathers; otherwise the integer indices are spilled to the
; rm/gm scratch rows and fetched lane by lane.
; 1 - p (dst register number, prev samples)
; 2 - n (dst register number, next samples)
; 3 - p indices (integer vector, register number)
; 4 - n indices (integer vector, register number)
; assumes tmpq = float* prelut table; clobbers m7/m9 on AVX2,
; tmp2/tmp3 and the rm/gm scratch rows otherwise
%macro GATHER_PRELUT 4
%if cpuflag(avx2)
    vpcmpeqb m7, m7                    ; all-ones: gather every lane
    vgatherdps m%1, [tmpq + m%3*4], m7 ; p
    vpcmpeqb m9, m9                    ; gather clears its mask; rebuild
    vgatherdps m%2, [tmpq + m%4*4], m9 ; n
%else
    mova [rm], m%3
    mova [gm], m%4
    FETCH_PRELUT_PN %1, %2, 0
    FETCH_PRELUT_PN %1, %2, 4
    FETCH_PRELUT_PN %1, %2, 8
    FETCH_PRELUT_PN %1, %2, 12
%if mmsize > 16
    FETCH_PRELUT_PN %1, %2, 16
    FETCH_PRELUT_PN %1, %2, 20
    FETCH_PRELUT_PN %1, %2, 24
    FETCH_PRELUT_PN %1, %2, 28
%endif
    movu m%1, [rm]
    movu m%2, [gm]
%endif
%endmacro
; %1 = floor(%2), elementwise, as floats.
; NOTE(review): the SSE fallback truncates toward zero, which equals
; floor only for non-negative inputs — all callers clamp to >= 0 first.
%macro FLOORPS 2
%if mmsize > 16
    vroundps %1, %2, 0x01 ; 0x01 = round toward -infinity
%else
    cvttps2dq %1, %2      ; truncate to int32
    cvtdq2ps %1, %1       ; convert back to float
%endif
%endmacro
; %1 = %2 * %3 + %1
; NOTE(review): the non-FMA fallback overwrites %2 (the FMA form does
; not); callers only ever pass scratch registers as %2.
%macro MADD3 3
%if cpuflag(avx2)
    vfmadd231ps %1, %2, %3
%else
    mulps %2, %2, %3
    addps %1, %1, %2
%endif
%endmacro
; Apply the 1D prelut to one channel: map the value into prelut index
; space, clamp, and linearly interpolate between the two neighbouring
; lut entries.
; 1 - dst (register number; channel value in, prelut-mapped value out)
; 2 - index (0/1/2, selects the per-channel lut pointer)
; 3 - min (broadcast channel minimum, memory operand)
; 4 - scale (broadcast channel scale, memory operand)
; assumes lut max m13, m14 1.0f, zero m15
; clobbers m3-m6, m8, m10, tmpq (plus GATHER_PRELUT's clobbers)
%macro APPLY_PRELUT 4
    ; scale
    subps m5, m%1, %3 ; v - min
    mulps m5, m5, %4  ; v * scale
    ; clamp to [0, prelut max]
    maxps m5, m5, m15 ; max zero, Max first, NAN set to zero
    minps m5, m5, m13 ; min lut max

    FLOORPS m3, m5    ; prev index (float)
    subps m5, m5, m3  ; d = fractional part
    addps m4, m3, m14 ; p+1 = n index
    minps m4, m4, m13 ; clamp n index

    mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] ; per-channel float* lut
    cvttps2dq m6, m3
    cvttps2dq m10, m4
    GATHER_PRELUT %1, 4, 6, 10 ; m%1 = p samples, m4 = n samples

    ; lerp: result = p + (n - p) * d
    subps m8, m4, m%1
    MADD3 m%1, m8, m5

%endmacro
; Scale one channel into 3D lut index space and clamp to [0, lutmax].
; 1 - dst (register number)
; 2 - scale (broadcast scale, memory operand)
; assumes lut max m13, zero m15
%macro APPLY_SCALE 2
    mulps m%1, m%1, %2
    maxps m%1, m%1, m15 ; Max first, NAN set to zero
    minps m%1, m%1, m13
%endmacro
; Per-lane select: %1 = %4 ? %3 : %2, where %4 holds all-ones/all-zeros
; lane masks (cmpps output).  The SSE fallback uses the xor-blend
; identity a ^ ((a ^ b) & mask), which needs %1 distinct from %2/%3.
%macro BLEND 4
%if mmsize > 16
    vblendvps %1, %2, %3, %4
%else
%ifidni %1,%2
%error operand 1 must not equal operand 2
%endif
%ifidni %1,%3
%error operand 1 must not equal operand 3
%endif
    mova %1, %2
    xorps %1, %3 ; %2 ^ %3
    andps %1, %4 ; (%2 ^ %3) & mask
    xorps %1, %2 ; %2 ^ ((%2 ^ %3) & mask)
%endif
%endmacro
; %1 = %2 + %3 + %4 (packed single)
%macro ADD3 4
    addps %1, %2, %3
    addps %1, %1, %4
%endmacro
; Scalar gather step for one lane of the 3D lut lookup: reads the
; lane's element index from [rm + %4], loads the interleaved r,g,b
; floats at that index from the lut at tmpq, and writes them to the
; rm/gm/bm scratch rows at the same offset.
; 1/2/3 - r/g/b dst register numbers, 4 - lane byte offset
; clobbers tmp2
%macro FETCH_LUT3D_RGB 4
    mov tmp2d, [rm + %4]
    movss xm%1, [tmpq + tmp2q*4 + 0] ; r
    movss xm%2, [tmpq + tmp2q*4 + 4] ; g
    movss xm%3, [tmpq + tmp2q*4 + 8] ; b
    movss [rm + %4], xm%1
    movss [gm + %4], xm%2
    movss [bm + %4], xm%3
%endmacro
; Gather r,g,b samples from the interleaved 3D lut for every lane.
; Indices in m%4 are element offsets (the three components of one
; entry live at [idx*4 + 0/4/8]).
; 1 - dstr
; 2 - dstg
; 3 - dstb
; 4 - indices (integer vector, register number)
; assumes tmpq = float* lut; clobbers m3/m14/m15 on AVX2 (gather
; masks), tmp2 and the rm/gm/bm scratch rows otherwise
%macro GATHER_LUT3D_INDICES 4
%if cpuflag(avx2)
    vpcmpeqb m3, m3                         ; all-ones gather mask
    vgatherdps m%1, [tmpq + m%4*4 + 0], m3
    vpcmpeqb m14, m14                       ; gather clears its mask
    vgatherdps m%2, [tmpq + m%4*4 + 4], m14
    vpcmpeqb m15, m15
    vgatherdps m%3, [tmpq + m%4*4 + 8], m15
%else
    movu [rm], m%4
    FETCH_LUT3D_RGB %1, %2, %3, 0
    FETCH_LUT3D_RGB %1, %2, %3, 4
    FETCH_LUT3D_RGB %1, %2, %3, 8
    FETCH_LUT3D_RGB %1, %2, %3, 12
%if mmsize > 16
    FETCH_LUT3D_RGB %1, %2, %3, 16
    FETCH_LUT3D_RGB %1, %2, %3, 20
    FETCH_LUT3D_RGB %1, %2, %3, 24
    FETCH_LUT3D_RGB %1, %2, %3, 28
%endif
    movu m%1, [rm]
    movu m%2, [gm]
    movu m%3, [bm]
%endif
%endmacro
; Tetrahedral interpolation of the 3D lut at the per-lane coordinates
; (m0,m1,m2) = (r,g,b), already scaled and clamped to [0, lutmax].
; In:  m0-m2 = r,g,b indices; m13 = lutmax; m14 = 1.0f broadcast
; Out: m0-m2 = interpolated r,g,b
; Clobbers m3-m15 and tmpq (m13/m14/m15 are consumed — the caller
; reloads them every pixel group).
%macro interp_tetrahedral 0
%define d_r m0
%define d_g m1
%define d_b m2

%define prev_r m3
%define prev_g m4
%define prev_b m5

%define next_r m6
%define next_g m7
%define next_b m8

; sorted deltas; aliases reuse prev_g/prev_b/next_r after those die
%define x0 m4
%define x1 m5
%define x2 m6

    ; setup prev index
    FLOORPS prev_r, m0
    FLOORPS prev_g, m1
    FLOORPS prev_b, m2

    ; setup deltas (fractional position inside the cube cell)
    subps d_r, m0, prev_r
    subps d_g, m1, prev_g
    subps d_b, m2, prev_b

    ; setup next index
    addps next_r, prev_r, m14 ; +1
    minps next_r, next_r, m13 ; clamp lutmax

    addps next_g, prev_g, m14 ; +1
    minps next_g, next_g, m13 ; clamp lutmax

    addps next_b, prev_b, m14 ; +1
    minps next_b, next_b, m13 ; clamp lutmax

    ; prescale indices to element offsets in the interleaved lut
    mulps prev_r, prev_r, lut3dsize2m
    mulps next_r, next_r, lut3dsize2m

    mulps prev_g, prev_g, lut3dsizem
    mulps next_g, next_g, lut3dsizem

    mulps prev_b, prev_b, [pd_3f]
    mulps next_b, next_b, [pd_3f]

    ; cxxxa m10
    ; 1 is the delta that is the largest
    ; r> == c100 == (r>g && r>b)
    ; g> == c010 == (g>r && g>b)
    ; b> == c001 == (b>r && b>g)
    ; if delta > other 2 use next else prev

    ; cxxxb m11;
    ; 0 is the delta that is the smallest
    ; r< == c011 == (r<=g && r<=b)
    ; g< == c101 == (g<=r && g<=b)
    ; b< == c110 == (b<=r && b<=g)
    ; if delta <= other 2 use prev else next

    ; pairwise greater-than masks (0x1E = GT_OQ); overwrites m13-m15
    cmpps m13, d_r, d_g, 0x1E ; r>g
    cmpps m14, d_g, d_b, 0x1E ; g>b
    cmpps m15, d_b, d_r, 0x1E ; b>r

    ; r> !b>r && r>g
    andnps m9, m15, m13
    BLEND m10, prev_r, next_r, m9

    ; r< !r>g && b>r
    andnps m9, m13, m15
    BLEND m11, next_r, prev_r, m9

    ; g> !r>g && g>b
    andnps m9, m13, m14
    BLEND m12, prev_g, next_g, m9
    addps m10, m10, m12

    ; g< !g>b && r>g
    andnps m9, m14, m13
    BLEND m12, next_g, prev_g, m9
    addps m11, m11, m12

    ; b> !g>b && b>r
    andnps m9, m14, m15
    BLEND m12, prev_b, next_b, m9
    addps m10, m10, m12

    ; b< !b>r && g>b
    andnps m9, m15, m14
    BLEND m12, next_b, prev_b, m9
    addps m11, m11, m12

    ; c000 m12; cell origin corner
    ADD3 m12, prev_r, prev_g, prev_b

    ; c111 m13; cell far corner
    ADD3 m13, next_r, next_g, next_b

    ; sort delta r,g,b x0 >= x1 >= x2 (barycentric weights)
    minps m7, d_r, d_g
    maxps m8, d_r, d_g

    minps x2, m7, d_b ; smallest of all three
    maxps m7, m7, d_b

    maxps x0, m8, d_b ; largest of all three
    minps x1, m8, m7  ; middle value

    ; convert indices to integer
    cvttps2dq m12, m12
    cvttps2dq m10, m10
    cvttps2dq m11, m11
    cvttps2dq m13, m13

    ; now the gathering festival
    mov tmpq, [ctxq + LUT3DContext.lut]

    ; result = c000*(1-x0) + cxxxa*(x0-x1) + cxxxb*(x1-x2) + c111*x2
    GATHER_LUT3D_INDICES 0, 1, 2, 12
    movu m14, [pd_1f]
    subps m14, m14, x0; 1 - x0

    mulps m0, m0, m14
    mulps m1, m1, m14
    mulps m2, m2, m14

    GATHER_LUT3D_INDICES 7, 8, 9, 10
    subps m14, x0, x1; x0 - x1
    MADD3 m0, m7, m14
    MADD3 m1, m8, m14
    MADD3 m2, m9, m14

    GATHER_LUT3D_INDICES 7, 8, 9, 11
    subps m14, x1, x2; x1 - x2
    MADD3 m0, m7, m14
    MADD3 m1, m8, m14
    MADD3 m2, m9, m14

    GATHER_LUT3D_INDICES 7, 8, 9, 13
    MADD3 m0, m7, x2
    MADD3 m1, m8, x2
    MADD3 m2, m9, x2

%endmacro
; %1 = frame->data[%3] + slice_start * frame->linesize[%3]
; 1 - dst stack slot, 2 - AVFrame pointer, 3 - plane index
; clobbers ptrq/tmpd
; NOTE(review): the 32-bit product is zero-extended into tmpq, so this
; assumes non-negative linesize — confirm callers never pass flipped
; frames.
%macro INIT_DATA_PTR 3
    mov ptrq, [%2 + AVFrame.data + %3 * 8]
    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
    imul tmpd, slice_startd
    add ptrq, tmpq
    mov %1, ptrq
%endmacro
; Advance the stack-held row pointer %1 by one scanline of plane %3.
; 1 - stack slot, 2 - AVFrame pointer, 3 - plane index
; clobbers ptrq/tmpd; same non-negative-linesize assumption as
; INIT_DATA_PTR
%macro INC_DATA_PTR 3
    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
    mov ptrq, %1
    add ptrq, tmpq
    mov %1, ptrq
%endmacro
; Load mmsize/4 u16 samples at column xq of the plane whose row pointer
; is in stack slot %2, widen to u32, and convert to floats scaled to
; [0,1].
; 1 - dst register number, 2 - stack slot with the plane row pointer
; assumes m7 = pd_65535_invf; the AVX (non-AVX2) ymm path also assumes
; xm6 = pb_shuffle16; clobbers ptrq (and xm4 on that path)
%macro LOAD16 2
    mov ptrq, %2
%if mmsize > 16
    movu xm%1, [ptrq + xq*2]  ; 8 u16 samples
%else
    movsd xm%1, [ptrq + xq*2] ; 4 u16 samples, upper half zeroed
%endif
%if cpuflag(avx2)
    vpmovzxwd m%1, xm%1
%else
%if mmsize > 16
    ; AVX without AVX2: zero-extend each 128-bit half via pshufb
    pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) ; high 4 samples first
    pshufb xm%1, xm6 ; pb_shuffle16
    pshufb xm4, xm6 ; pb_shuffle16
    vinsertf128 m%1, m%1, xm4, 1
%else
    ; SSE2: spread words apart, then zero the odd words via shuffles
    pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
    pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
    pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
%endif
%endif
    cvtdq2ps m%1, m%1
    mulps m%1, m%1, m7 ; pd_65535_invf: normalize to [0,1]
%endmacro
; Convert floats in m%2 to u16 (scaled by 65535, clamped to
; [0,65535]) and store mmsize/4 samples at column xq of the plane
; whose row pointer is in stack slot %1.
; 1 - stack slot with the plane row pointer, 2 - src register number
; assumes m5 = pd_65535f, m15 = 0; the ymm path also assumes
; xm6/xm7 = lo/hi pack shuffles; clobbers ptrq, m%2 (and xm4 on ymm)
%macro STORE16 2
    mulps m%2, m%2, m5  ; [pd_65535f]
    minps m%2, m%2, m5  ; [pd_65535f]
    maxps m%2, m%2, m15 ; zero
    cvttps2dq m%2, m%2
%if mmsize > 16
    ; pack 8 u32 lanes into 8 u16 in one XMM register
    vextractf128 xm4, m%2, 1
    pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
    pshufb xm4, xm7 ; [pb_hi_pack_shuffle16]
    por xm%2, xm4
%else
    ; pack 4 u32 lanes into the low 8 bytes via word shuffles
    pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
    pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
    pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
%endif
    mov ptrq, %1
%if mmsize > 16
    movu [ptrq + xq*2], xm%2
%else
    movsd [ptrq + xq*2], xm%2
%endif
%endmacro
; Generate one public interpolation entry point:
;   void ff_interp_<method>_<fmt>_<cpu>(LUT3DContext *ctx,
;           Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst,
;           int slice_start, int slice_end, int has_alpha)
; Processes rows [slice_start, slice_end) of planar gbra data,
; mmsize/4 pixels per iteration.
; 1 - interp method
; 2 - format_name
; 3 - depth
; 4 - is float format
; Idiom fix vs. the previous revision: zero tests use
; `test reg, reg` / `jz` instead of `cmp reg, 0` / `je` (identical ZF
; behavior, shorter encoding).
%macro DEFINE_INTERP_FUNC 4
cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
    ; store lut max and lutsize (scaled by 3 floats per lut entry)
    mov tmpd, dword [ctxq + LUT3DContext.lutsize]
    cvtsi2ss xm0, tmpd
    mulss xm0, xm0, [pd_3f]
    VBROADCASTSS m0, xm0
    mova lut3dsizem, m0
    sub tmpd, 1
    cvtsi2ss xm0, tmpd ; xm0 = lutsize - 1 (kept for the scales below)
    VBROADCASTSS m0, xm0
    mova lut3dmaxm, m0

    ; scale_r = (lutsize - 1) * ctx->scale[0]
    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
    VBROADCASTSS m1, xm1
    mova scalerm, m1

    ; scale_g
    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
    VBROADCASTSS m1, xm1
    mova scalegm, m1

    ; scale_b
    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
    VBROADCASTSS m1, xm1
    mova scalebm, m1

    ; store lutsize2 (also scaled by 3)
    cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
    mulss xm0, xm0, [pd_3f]
    VBROADCASTSS m0, xm0
    mova lut3dsize2m, m0

    ; init prelut values (prelut may be NULL)
    test prelutq, prelutq
    jz %%skip_init_prelut
    mov tmpd, dword [prelutq + Lut3DPreLut.size]
    sub tmpd, 1
    cvtsi2ss xm0, tmpd
    VBROADCASTSS m0, xm0
    mova prelutmaxm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
    mova prelutminrm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
    mova prelutmingm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
    mova prelutminbm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
    mova prelutscalerm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
    mova prelutscalegm, m0
    VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
    mova prelutscalebm, m0
%%skip_init_prelut:

    mov widthd, [src_imageq + AVFrame.width]

    ; gbra pixel order
    INIT_DATA_PTR srcrm, src_imageq, 2
    INIT_DATA_PTR srcgm, src_imageq, 0
    INIT_DATA_PTR srcbm, src_imageq, 1
    INIT_DATA_PTR srcam, src_imageq, 3

    INIT_DATA_PTR dstrm, dst_imageq, 2
    INIT_DATA_PTR dstgm, dst_imageq, 0
    INIT_DATA_PTR dstbm, dst_imageq, 1
    INIT_DATA_PTR dstam, dst_imageq, 3

%%loop_y:
    xor xq, xq
%%loop_x:
    ; per-group constants (interp clobbers m13-m15 every iteration)
    movu m14, [pd_1f]
    xorps m15, m15, m15
%if %4 ; float
    mov ptrq, srcrm
    movu m0, [ptrq + xq*4]
    mov ptrq, srcgm
    movu m1, [ptrq + xq*4]
    mov ptrq, srcbm
    movu m2, [ptrq + xq*4]
%else
    ; constants for LOAD16
    movu m7, [pd_65535_invf]
%if notcpuflag(avx2) && mmsize >= 32
    movu xm6, [pb_shuffle16]
%endif
    LOAD16 0, srcrm
    LOAD16 1, srcgm
    LOAD16 2, srcbm
%endif

    test prelutq, prelutq
    jz %%skip_prelut
    mova m13, prelutmaxm
    APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
    APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
    APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
%%skip_prelut:

    mova m13, lut3dmaxm
    APPLY_SCALE 0, scalerm
    APPLY_SCALE 1, scalegm
    APPLY_SCALE 2, scalebm

    interp_%1

%if %4 ; float
    mov ptrq, dstrm
    movu [ptrq + xq*4], m0
    mov ptrq, dstgm
    movu [ptrq + xq*4], m1
    mov ptrq, dstbm
    movu [ptrq + xq*4], m2
    test has_alphad, has_alphad
    jz %%skip_alphaf
    ; alpha is passed through untouched
    mov ptrq, srcam
    movu m0, [ptrq + xq*4]
    mov ptrq, dstam
    movu [ptrq + xq*4], m0
%%skip_alphaf:
%else
    ; constants for STORE16
    movu m5, [pd_65535f]
%if mmsize > 16
    movu xm6, [pb_lo_pack_shuffle16]
    movu xm7, [pb_hi_pack_shuffle16]
%endif

    xorps m15, m15, m15
    STORE16 dstrm, 0
    STORE16 dstgm, 1
    STORE16 dstbm, 2

    test has_alphad, has_alphad
    jz %%skip_alpha
%if mmsize > 16
    mov ptrq, srcam
    movu xm0, [ptrq + xq*2]
    mov ptrq, dstam
    movu [ptrq + xq*2], xm0
%else
    mov ptrq, srcam
    movsd xm0, [ptrq + xq*2]
    mov ptrq, dstam
    movsd [ptrq + xq*2], xm0
%endif

%%skip_alpha:
%endif

    add xq, mmsize/4
    cmp xd, widthd
    jl %%loop_x

    ; advance all row pointers to the next scanline
    INC_DATA_PTR srcrm, src_imageq, 2
    INC_DATA_PTR srcgm, src_imageq, 0
    INC_DATA_PTR srcbm, src_imageq, 1
    INC_DATA_PTR srcam, src_imageq, 3

    INC_DATA_PTR dstrm, dst_imageq, 2
    INC_DATA_PTR dstgm, dst_imageq, 0
    INC_DATA_PTR dstbm, dst_imageq, 1
    INC_DATA_PTR dstam, dst_imageq, 3

    inc slice_startd
    cmp slice_startd, slice_endd
    jl %%loop_y

    RET
%endmacro
; Instantiate the tetrahedral interpolators per instruction set.
; x86-64 only: the functions need 13 GPRs and 16 SIMD registers.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif
INIT_XMM sse2
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif