mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-30 19:32:13 +00:00
338f8fd232
x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2) for x64. So given that the only systems that benefit from these functions are truely ancient 32bit x86s they are removed. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
853 lines
20 KiB
NASM
853 lines
20 KiB
NASM
;*******************************************************************************
|
|
;* SIMD-optimized IDCT functions for HEVC decoding
|
|
;* Copyright (c) 2014 Pierre-Edouard LEPERE
|
|
;* Copyright (c) 2014 James Almer
|
|
;* Copyright (c) 2016 Alexandra Hájková
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION_RODATA
|
|
|
|
pd_64: times 4 dd 64
|
|
pd_2048: times 4 dd 2048
|
|
pd_512: times 4 dd 512
|
|
|
|
; 4x4 transform coeffs
|
|
cextern pw_64
|
|
pw_64_m64: times 4 dw 64, -64
|
|
pw_83_36: times 4 dw 83, 36
|
|
pw_36_m83: times 4 dw 36, -83
|
|
|
|
; 8x8 transform coeffs
|
|
pw_89_75: times 4 dw 89, 75
|
|
pw_50_18: times 4 dw 50, 18
|
|
|
|
pw_75_m18: times 4 dw 75, -18
|
|
pw_m89_m50: times 4 dw -89, -50
|
|
|
|
pw_50_m89: times 4 dw 50, -89
|
|
pw_18_75: times 4 dw 18, 75
|
|
|
|
pw_18_m50: times 4 dw 18, -50
|
|
pw_75_m89: times 4 dw 75, -89
|
|
|
|
; 16x16 transformation coeffs
|
|
trans_coeffs16: times 4 dw 90, 87
|
|
times 4 dw 80, 70
|
|
times 4 dw 57, 43
|
|
times 4 dw 25, 9
|
|
|
|
times 4 dw 87, 57
|
|
times 4 dw 9, -43
|
|
times 4 dw -80, -90
|
|
times 4 dw -70, -25
|
|
|
|
times 4 dw 80, 9
|
|
times 4 dw -70, -87
|
|
times 4 dw -25, 57
|
|
times 4 dw 90, 43
|
|
|
|
times 4 dw 70, -43
|
|
times 4 dw -87, 9
|
|
times 4 dw 90, 25
|
|
times 4 dw -80, -57
|
|
|
|
times 4 dw 57, -80
|
|
times 4 dw -25, 90
|
|
times 4 dw -9, -87
|
|
times 4 dw 43, 70
|
|
|
|
times 4 dw 43, -90
|
|
times 4 dw 57, 25
|
|
times 4 dw -87, 70
|
|
times 4 dw 9, -80
|
|
|
|
times 4 dw 25, -70
|
|
times 4 dw 90, -80
|
|
times 4 dw 43, 9
|
|
times 4 dw -57, 87
|
|
|
|
times 4 dw 9, -25
|
|
times 4 dw 43, -57
|
|
times 4 dw 70, -80
|
|
times 4 dw 87, -90
|
|
|
|
; 32x32 transform coeffs
|
|
trans_coeff32: times 8 dw 90
|
|
times 4 dw 88, 85
|
|
times 4 dw 82, 78
|
|
times 4 dw 73, 67
|
|
times 4 dw 61, 54
|
|
times 4 dw 46, 38
|
|
times 4 dw 31, 22
|
|
times 4 dw 13, 4
|
|
|
|
times 4 dw 90, 82
|
|
times 4 dw 67, 46
|
|
times 4 dw 22, -4
|
|
times 4 dw -31, -54
|
|
times 4 dw -73, -85
|
|
times 4 dw -90, -88
|
|
times 4 dw -78, -61
|
|
times 4 dw -38, -13
|
|
|
|
times 4 dw 88, 67
|
|
times 4 dw 31, -13
|
|
times 4 dw -54, -82
|
|
times 4 dw -90, -78
|
|
times 4 dw -46, -4
|
|
times 4 dw 38, 73
|
|
times 4 dw 90, 85
|
|
times 4 dw 61, 22
|
|
|
|
times 4 dw 85, 46
|
|
times 4 dw -13, -67
|
|
times 4 dw -90, -73
|
|
times 4 dw -22, 38
|
|
times 4 dw 82, 88
|
|
times 4 dw 54, -4
|
|
times 4 dw -61, -90
|
|
times 4 dw -78, -31
|
|
|
|
times 4 dw 82, 22
|
|
times 4 dw -54, -90
|
|
times 4 dw -61, 13
|
|
times 4 dw 78, 85
|
|
times 4 dw 31, -46
|
|
times 4 dw -90, -67
|
|
times 4 dw 4, 73
|
|
times 4 dw 88, 38
|
|
|
|
times 4 dw 78, -4
|
|
times 4 dw -82, -73
|
|
times 4 dw 13, 85
|
|
times 4 dw 67, -22
|
|
times 4 dw -88, -61
|
|
times 4 dw 31, 90
|
|
times 4 dw 54, -38
|
|
times 4 dw -90, -46
|
|
|
|
times 4 dw 73, -31
|
|
times 4 dw -90, -22
|
|
times 4 dw 78, 67
|
|
times 4 dw -38, -90
|
|
times 4 dw -13, 82
|
|
times 4 dw 61, -46
|
|
times 4 dw -88, -4
|
|
times 4 dw 85, 54
|
|
|
|
times 4 dw 67, -54
|
|
times 4 dw -78, 38
|
|
times 4 dw 85, -22
|
|
times 4 dw -90, 4
|
|
times 4 dw 90, 13
|
|
times 4 dw -88, -31
|
|
times 4 dw 82, 46
|
|
times 4 dw -73, -61
|
|
|
|
times 4 dw 61, -73
|
|
times 4 dw -46, 82
|
|
times 4 dw 31, -88
|
|
times 4 dw -13, 90
|
|
times 4 dw -4, -90
|
|
times 4 dw 22, 85
|
|
times 4 dw -38, -78
|
|
times 4 dw 54, 67
|
|
|
|
times 4 dw 54, -85
|
|
times 4 dw -4, 88
|
|
times 4 dw -46, -61
|
|
times 4 dw 82, 13
|
|
times 4 dw -90, 38
|
|
times 4 dw 67, -78
|
|
times 4 dw -22, 90
|
|
times 4 dw -31, -73
|
|
|
|
times 4 dw 46, -90
|
|
times 4 dw 38, 54
|
|
times 4 dw -90, 31
|
|
times 4 dw 61, -88
|
|
times 4 dw 22, 67
|
|
times 4 dw -85, 13
|
|
times 4 dw 73, -82
|
|
times 4 dw 4, 78
|
|
|
|
times 4 dw 38, -88
|
|
times 4 dw 73, -4
|
|
times 4 dw -67, 90
|
|
times 4 dw -46, -31
|
|
times 4 dw 85, -78
|
|
times 4 dw 13, 61
|
|
times 4 dw -90, 54
|
|
times 4 dw 22, -82
|
|
|
|
times 4 dw 31, -78
|
|
times 4 dw 90, -61
|
|
times 4 dw 4, 54
|
|
times 4 dw -88, 82
|
|
times 4 dw -38, -22
|
|
times 4 dw 73, -90
|
|
times 4 dw 67, -13
|
|
times 4 dw -46, 85
|
|
|
|
times 4 dw 22, -61
|
|
times 4 dw 85, -90
|
|
times 4 dw 73, -38
|
|
times 4 dw -4, 46
|
|
times 4 dw -78, 90
|
|
times 4 dw -82, 54
|
|
times 4 dw -13, -31
|
|
times 4 dw 67, -88
|
|
|
|
times 4 dw 13, -38
|
|
times 4 dw 61, -78
|
|
times 4 dw 88, -90
|
|
times 4 dw 85, -73
|
|
times 4 dw 54, -31
|
|
times 4 dw 4, 22
|
|
times 4 dw -46, 67
|
|
times 4 dw -82, 90
|
|
|
|
times 4 dw 4, -13
|
|
times 4 dw 22, -31
|
|
times 4 dw 38, -46
|
|
times 4 dw 54, -61
|
|
times 4 dw 67, -73
|
|
times 4 dw 78, -82
|
|
times 4 dw 85, -88
|
|
times 4 dw 90, -90
|
|
|
|
SECTION .text
|
|
|
|
; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
|
|
; %1 = HxW
|
|
; %2 = number of loops
|
|
; %3 = bitdepth
|
|
%macro IDCT_DC 3
|
|
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
|
|
movsx tmpd, word [coeffq]
|
|
add tmpd, (1 << (14 - %3)) + 1
|
|
sar tmpd, (15 - %3)
|
|
movd xm0, tmpd
|
|
SPLATW m0, xm0
|
|
DEFINE_ARGS coeff, cnt
|
|
mov cntd, %2
|
|
.loop:
|
|
mova [coeffq+mmsize*0], m0
|
|
mova [coeffq+mmsize*1], m0
|
|
mova [coeffq+mmsize*2], m0
|
|
mova [coeffq+mmsize*3], m0
|
|
add coeffq, mmsize*8
|
|
mova [coeffq+mmsize*-4], m0
|
|
mova [coeffq+mmsize*-3], m0
|
|
mova [coeffq+mmsize*-2], m0
|
|
mova [coeffq+mmsize*-1], m0
|
|
dec cntd
|
|
jg .loop
|
|
RET
|
|
%endmacro
|
|
|
|
; %1 = HxW
|
|
; %2 = bitdepth
|
|
%macro IDCT_DC_NL 2 ; No loop
|
|
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
|
|
movsx tmpd, word [coeffq]
|
|
add tmpd, (1 << (14 - %2)) + 1
|
|
sar tmpd, (15 - %2)
|
|
movd m0, tmpd
|
|
SPLATW m0, xm0
|
|
mova [coeffq+mmsize*0], m0
|
|
mova [coeffq+mmsize*1], m0
|
|
mova [coeffq+mmsize*2], m0
|
|
mova [coeffq+mmsize*3], m0
|
|
%if mmsize == 16
|
|
mova [coeffq+mmsize*4], m0
|
|
mova [coeffq+mmsize*5], m0
|
|
mova [coeffq+mmsize*6], m0
|
|
mova [coeffq+mmsize*7], m0
|
|
%endif
|
|
RET
|
|
%endmacro
|
|
|
|
; IDCT 4x4, expects input in m0, m1
|
|
; %1 - shift
|
|
; %2 - 1/0 - SCALE and Transpose or not
|
|
; %3 - 1/0 add constant or not
|
|
%macro TR_4x4 3
|
|
; interleaves src0 with src2 to m0
|
|
; and src1 with scr3 to m2
|
|
; src0: 00 01 02 03 m0: 00 20 01 21 02 22 03 23
|
|
; src1: 10 11 12 13 -->
|
|
; src2: 20 21 22 23 m1: 10 30 11 31 12 32 13 33
|
|
; src3: 30 31 32 33
|
|
|
|
SBUTTERFLY wd, 0, 1, 2
|
|
|
|
pmaddwd m2, m0, [pw_64] ; e0
|
|
pmaddwd m3, m1, [pw_83_36] ; o0
|
|
pmaddwd m0, [pw_64_m64] ; e1
|
|
pmaddwd m1, [pw_36_m83] ; o1
|
|
|
|
%if %3 == 1
|
|
%assign %%add 1 << (%1 - 1)
|
|
mova m4, [pd_ %+ %%add]
|
|
paddd m2, m4
|
|
paddd m0, m4
|
|
%endif
|
|
|
|
SUMSUB_BADC d, 3, 2, 1, 0, 4
|
|
|
|
%if %2 == 1
|
|
psrad m3, %1 ; e0 + o0
|
|
psrad m1, %1 ; e1 + o1
|
|
psrad m2, %1 ; e0 - o0
|
|
psrad m0, %1 ; e1 - o1
|
|
;clip16
|
|
packssdw m3, m1
|
|
packssdw m0, m2
|
|
; Transpose
|
|
SBUTTERFLY wd, 3, 0, 1
|
|
SBUTTERFLY wd, 3, 0, 1
|
|
SWAP 3, 1, 0
|
|
%else
|
|
SWAP 3, 2, 0
|
|
%endif
|
|
%endmacro
|
|
|
|
%macro DEFINE_BIAS 1
|
|
%assign shift (20 - %1)
|
|
%assign c_add (1 << (shift - 1))
|
|
%define arr_add pd_ %+ c_add
|
|
%endmacro
|
|
|
|
; %1 - bit_depth
|
|
; %2 - register add constant
|
|
; is loaded to
|
|
; shift = 20 - bit_depth
|
|
%macro LOAD_BIAS 2
|
|
DEFINE_BIAS %1
|
|
mova %2, [arr_add]
|
|
%endmacro
|
|
|
|
; %1, %2 - registers to load packed 16 bit values to
|
|
; %3, %4, %5, %6 - vertical offsets
|
|
; %7 - horizontal offset
|
|
%macro LOAD_BLOCK 7
|
|
movq %1, [r0 + %3 + %7]
|
|
movhps %1, [r0 + %5 + %7]
|
|
movq %2, [r0 + %4 + %7]
|
|
movhps %2, [r0 + %6 + %7]
|
|
%endmacro
|
|
|
|
; void ff_hevc_idct_4x4__{8,10}_<opt>(int16_t *coeffs, int col_limit)
|
|
; %1 = bitdepth
|
|
%macro IDCT_4x4 1
|
|
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
|
|
mova m0, [coeffsq]
|
|
mova m1, [coeffsq + 16]
|
|
|
|
TR_4x4 7, 1, 1
|
|
TR_4x4 20 - %1, 1, 1
|
|
|
|
mova [coeffsq], m0
|
|
mova [coeffsq + 16], m1
|
|
RET
|
|
%endmacro
|
|
|
|
; scale, pack (clip16) and store the residuals 0 e8[0] + o8[0] --> + %1
|
|
; 4 at one time (4 columns) 1 e8[1] + o8[1]
|
|
; from %5: e8/16 + o8/16, with %1 offset ...
|
|
; and %3: e8/16 - o8/16, with %2 offset 6 e8[1] - o8[1]
|
|
; %4 - shift 7 e8[0] - o8[0] --> + %2
|
|
%macro STORE_8 7
|
|
psrad %5, %4
|
|
psrad %3, %4
|
|
packssdw %5, %3
|
|
movq [coeffsq + %1], %5
|
|
movhps [coeffsq + %2], %5
|
|
%endmacro
|
|
|
|
; %1 - horizontal offset
|
|
; %2 - shift
|
|
; %3, %4 - transform coeffs
|
|
; %5 - vertical offset for e8 + o8
|
|
; %6 - vertical offset for e8 - o8
|
|
; %7 - register with e8 inside
|
|
; %8 - block_size
|
|
; %9 - register to store e8 +o8
|
|
; %10 - register to store e8 - o8
|
|
%macro E8_O8 10
|
|
pmaddwd m6, m4, %3
|
|
pmaddwd m7, m5, %4
|
|
|
|
paddd m6, m7
|
|
paddd m7, m6, %7 ; o8 + e8
|
|
psubd %7, m6 ; e8 - o8
|
|
%if %8 == 8
|
|
STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
|
|
%else
|
|
SWAP m7, %9
|
|
SWAP %7, %10
|
|
%endif
|
|
%endmacro
|
|
|
|
; 8x4 residuals are processed and stored
|
|
; %1 - horizontal offset
|
|
; %2 - shift
|
|
; %3 - offset of the even row
|
|
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
|
|
; %5 - offset of the odd row
|
|
; %6 - block size
|
|
; %7 - 1/0 add a constant in TR_4x4 or not
|
|
; I want to add a constant for 8x8 transform but not for 16x16 and 32x32
|
|
%macro TR_8x4 7
|
|
; load 4 columns of even rows
|
|
LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
|
|
|
|
TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
|
|
|
|
; load 4 columns of odd rows
|
|
LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
|
|
|
|
; 00 01 02 03
|
|
; 10 11 12 13 m4: 10 30 11 31 12 32 13 33
|
|
|
|
; ... -- >
|
|
; m5: 50 70 51 71 52 72 53 73
|
|
; 70 71 72 73
|
|
SBUTTERFLY wd, 4, 5, 6
|
|
|
|
E8_O8 %1, %2, [pw_89_75], [pw_50_18], 0, %5 * 7, m0, %6, m8, m15
|
|
E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5, %5 * 6, m1, %6, m9, m14
|
|
E8_O8 %1, %2, [pw_50_m89], [pw_18_75], %5 * 2, %5 * 5, m2, %6, m10, m13
|
|
E8_O8 %1, %2, [pw_18_m50], [pw_75_m89], %5 * 3, %5 * 4, m3, %6, m11, m12
|
|
%endmacro
|
|
|
|
%macro STORE_PACKED 7
|
|
movq [r0 + %3 + %7], %1
|
|
movhps [r0 + %4 + %7], %1
|
|
movq [r0 + %5 + %7], %2
|
|
movhps [r0 + %6 + %7], %2
|
|
%endmacro
|
|
|
|
; transpose 4x4 block packed
|
|
; in %1 and %2 registers
|
|
; %3 - temporary register
|
|
%macro TRANSPOSE_4x4 3
|
|
SBUTTERFLY wd, %1, %2, %3
|
|
SBUTTERFLY dq, %1, %2, %3
|
|
%endmacro
|
|
|
|
; %1 - horizontal offset of the block i
|
|
; %2 - vertical offset of the block i
|
|
; %3 - width in bytes
|
|
; %4 - vertical offset for the block j
|
|
; %5 - horizontal offset for the block j
|
|
%macro SWAP_BLOCKS 5
|
|
; M_j
|
|
LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
|
|
TRANSPOSE_4x4 4, 5, 6
|
|
|
|
; M_i
|
|
LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
|
|
|
|
STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
|
|
|
|
; transpose and store M_i
|
|
SWAP m6, m4
|
|
SWAP m7, m5
|
|
TRANSPOSE_4x4 4, 5, 6
|
|
STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
|
|
%endmacro
|
|
|
|
; %1 - horizontal offset
|
|
; %2 - vertical offset of the block
|
|
; %3 - width in bytes
|
|
%macro TRANSPOSE_BLOCK 3
|
|
LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
|
|
TRANSPOSE_4x4 4, 5, 6
|
|
STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
|
|
%endmacro
|
|
|
|
%macro TRANSPOSE_8x8 0
|
|
cglobal hevc_idct_transpose_8x8, 0, 0, 0
|
|
; M1 M2 ^T = M1^t M3^t
|
|
; M3 M4 M2^t M4^t
|
|
|
|
; M1 4x4 block
|
|
TRANSPOSE_BLOCK 0, 0, 16
|
|
|
|
; M2 and M3
|
|
SWAP_BLOCKS 0, 64, 16, 0, 8
|
|
|
|
; M4
|
|
TRANSPOSE_BLOCK 8, 64, 16
|
|
|
|
ret
|
|
%endmacro
|
|
|
|
; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
|
|
; %1 = bitdepth
|
|
%macro IDCT_8x8 1
|
|
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
|
|
TR_8x4 0, 7, 32, 1, 16, 8, 1
|
|
TR_8x4 8, 7, 32, 1, 16, 8, 1
|
|
|
|
call hevc_idct_transpose_8x8_ %+ cpuname
|
|
|
|
DEFINE_BIAS %1
|
|
TR_8x4 0, shift, 32, 1, 16, 8, 1
|
|
TR_8x4 8, shift, 32, 1, 16, 8, 1
|
|
|
|
TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
|
|
%endmacro
|
|
|
|
; store intermedite e32 coeffs on stack
|
|
; as 16x4 matrix
|
|
; from m10: e8 + o8, with %6 offset
|
|
; and %3: e8 - o8, with %7 offset
|
|
; %4 - shift, unused here
|
|
%macro STORE_16 7
|
|
mova [rsp + %6], %5
|
|
mova [rsp + %7], %3
|
|
%endmacro
|
|
|
|
; %1, %2 - transform constants
|
|
; %3, %4 - regs with interleaved coeffs
|
|
; %5 - 1/0 SWAP or add
|
|
; %6, %7 - registers for intermidiate sums
|
|
; %8 - accumulator register
|
|
%macro ADD_ROWS 8
|
|
pmaddwd %6, %3, %1
|
|
pmaddwd %7, %4, %2
|
|
paddd %6, %7
|
|
%if %5 == 1
|
|
SWAP %6, %8
|
|
%else
|
|
paddd %8, %6
|
|
%endif
|
|
%endmacro
|
|
|
|
; %1 - transform coeffs
|
|
; %2, %3 offsets for storing e+o/e-o back to coeffsq
|
|
; %4 - shift
|
|
; %5 - add
|
|
; %6 - block_size
|
|
; %7 - register with e16
|
|
; %8, %9 - stack offsets for storing e+o/e-o
|
|
%macro E16_O16 9
|
|
ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m5, m6, m7
|
|
ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
|
|
|
|
%if %6 == 8
|
|
paddd %7, %5
|
|
%endif
|
|
|
|
paddd m4, m7, %7 ; o16 + e16
|
|
psubd %7, m7 ; e16 - o16
|
|
STORE_%6 %2, %3, %7, %4, m4, %8, %9
|
|
%endmacro
|
|
|
|
%macro TR_16x4 10
|
|
; produce 8x4 matrix of e16 coeffs
|
|
; for 4 first rows and store it on stack (128 bytes)
|
|
TR_8x4 %1, 7, %4, %5, %6, %8, 0
|
|
|
|
; load 8 even rows
|
|
LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
|
|
LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
|
|
|
|
SBUTTERFLY wd, 0, 1, 4
|
|
SBUTTERFLY wd, 2, 3, 4
|
|
|
|
E16_O16 trans_coeffs16, 0 + %1, 15 * %6 + %1, %2, %3, %7, m8, 0, 15 * 16
|
|
mova m8, %3
|
|
E16_O16 trans_coeffs16 + 64, %6 + %1, 14 * %6 + %1, %2, m8, %7, m9, 16, 14 * 16
|
|
E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
|
|
E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
|
|
E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
|
|
E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
|
|
E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1, %2, m8, %7, m14, 6 * 16, 9 * 16
|
|
E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1, %2, m8, %7, m15, 7 * 16, 8 * 16
|
|
%endmacro
|
|
|
|
%macro TRANSPOSE_16x16 0
|
|
cglobal hevc_idct_transpose_16x16, 0, 0, 0
|
|
; M1 M2 M3 M4 ^T m1 m5 m9 m13 M_i^T = m_i
|
|
; M5 M6 M7 M8 --> m2 m6 m10 m14
|
|
; M9 M10 M11 M12 m3 m7 m11 m15
|
|
; M13 M14 M15 M16 m4 m8 m12 m16
|
|
|
|
; M1 4x4 block
|
|
TRANSPOSE_BLOCK 0, 0, 32
|
|
|
|
; M5, M2
|
|
SWAP_BLOCKS 0, 128, 32, 0, 8
|
|
; M9, M3
|
|
SWAP_BLOCKS 0, 256, 32, 0, 16
|
|
; M13, M4
|
|
SWAP_BLOCKS 0, 384, 32, 0, 24
|
|
|
|
;M6
|
|
TRANSPOSE_BLOCK 8, 128, 32
|
|
|
|
; M10, M7
|
|
SWAP_BLOCKS 8, 256, 32, 128, 16
|
|
; M14, M8
|
|
SWAP_BLOCKS 8, 384, 32, 128, 24
|
|
|
|
;M11
|
|
TRANSPOSE_BLOCK 16, 256, 32
|
|
|
|
; M15, M12
|
|
SWAP_BLOCKS 16, 384, 32, 256, 24
|
|
|
|
;M16
|
|
TRANSPOSE_BLOCK 24, 384, 32
|
|
|
|
ret
|
|
%endmacro
|
|
|
|
; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
|
|
; %1 = bitdepth
|
|
%macro IDCT_16x16 1
|
|
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
|
|
mov r1d, 3
|
|
.loop16:
|
|
TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
|
|
dec r1d
|
|
jge .loop16
|
|
|
|
call hevc_idct_transpose_16x16_ %+ cpuname
|
|
|
|
DEFINE_BIAS %1
|
|
mov r1d, 3
|
|
.loop16_2:
|
|
TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
|
|
dec r1d
|
|
jge .loop16_2
|
|
|
|
TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
|
|
%endmacro
|
|
|
|
; scale, pack (clip16) and store the residuals 0 e32[0] + o32[0] --> %1
|
|
; 4 at one time (4 columns) 1 e32[1] + o32[1]
|
|
; %1 - address to store e32 + o32
|
|
; %2 - address to store e32 - e32
|
|
; %5 - reg with e32 + o32 ...
|
|
; %3 - reg with e32 - o32 30 e32[1] - o32[1]
|
|
; %4 - shift 31 e32[0] - o32[0] --> %2
|
|
%macro STORE_32 5
|
|
psrad %5, %4
|
|
psrad %3, %4
|
|
packssdw %5, %3
|
|
movq [%1], %5
|
|
movhps [%2], %5
|
|
%endmacro
|
|
|
|
; %1 - transform coeffs
|
|
; %2 - stack offset for e32
|
|
; %2, %3 offsets for storing e+o/e-o back to coeffsq
|
|
; %4 - shift
|
|
; %5 - stack offset of e32
|
|
%macro E32_O32 5
|
|
ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m8, m9, m10
|
|
ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
|
|
ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
|
|
ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
|
|
|
|
paddd m11, m14, [rsp + %5]
|
|
paddd m12, m10, m11 ; o32 + e32
|
|
psubd m11, m10 ; e32 - o32
|
|
STORE_32 %2, %3, m11, %4, m12
|
|
%endmacro
|
|
|
|
; %1 - horizontal offset
|
|
; %2 - bitdepth
|
|
%macro TR_32x4 3
|
|
TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
|
|
|
|
LOAD_BLOCK m0, m1, 64, 3 * 64, 5 * 64, 7 * 64, %1
|
|
LOAD_BLOCK m2, m3, 9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
|
|
LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
|
|
LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
|
|
|
|
SBUTTERFLY wd, 0, 1, 8
|
|
SBUTTERFLY wd, 2, 3, 8
|
|
SBUTTERFLY wd, 4, 5, 8
|
|
SBUTTERFLY wd, 6, 7, 8
|
|
|
|
%if %3 == 1
|
|
%assign shift 7
|
|
mova m14, [pd_64]
|
|
%else
|
|
LOAD_BIAS %2, m14
|
|
%endif
|
|
|
|
lea r2, [trans_coeff32 + 15 * 128]
|
|
lea r3, [coeffsq + %1]
|
|
lea r4, [r3 + 16 * 64]
|
|
mov r5d, 15 * 16
|
|
%%loop:
|
|
E32_O32 r2, r3 + r5 * 4, r4, shift, r5
|
|
sub r2, 128
|
|
add r4, 64
|
|
sub r5d, 16
|
|
jge %%loop
|
|
%endmacro
|
|
|
|
%macro TRANSPOSE_32x32 0
|
|
cglobal hevc_idct_transpose_32x32, 0, 0, 0
|
|
; M0 M1 ... M7
|
|
; M8 M15
|
|
;
|
|
; ...
|
|
;
|
|
; M56 M63
|
|
|
|
TRANSPOSE_BLOCK 0, 0, 64 ; M1
|
|
mov r1d, 7
|
|
mov r2d, 7 * 256
|
|
.loop_transpose:
|
|
SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
|
|
sub r2d, 256
|
|
dec r1d
|
|
jg .loop_transpose
|
|
|
|
TRANSPOSE_BLOCK 8, 256, 64 ; M9
|
|
mov r1d, 6
|
|
mov r2d, 512
|
|
mov r3d, 16
|
|
.loop_transpose2:
|
|
SWAP_BLOCKS 8, r2, 64, 256, r3
|
|
add r3d, 8
|
|
add r2d, 256
|
|
dec r1d
|
|
jg .loop_transpose2
|
|
|
|
TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M9
|
|
mov r1d, 5
|
|
mov r2d, 768
|
|
mov r3d, 24
|
|
.loop_transpose3:
|
|
SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
|
|
add r3d, 8
|
|
add r2d, 256
|
|
dec r1d
|
|
jg .loop_transpose3
|
|
|
|
TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
|
|
mov r1d, 4
|
|
mov r2d, 1024
|
|
mov r3d, 32
|
|
.loop_transpose4:
|
|
SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
|
|
add r3d, 8
|
|
add r2d, 256
|
|
dec r1d
|
|
jg .loop_transpose4
|
|
|
|
TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
|
|
mov r1d, 3
|
|
mov r2d, 1280
|
|
mov r3d, 40
|
|
.loop_transpose5:
|
|
SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
|
|
add r3d, 8
|
|
add r2d, 256
|
|
dec r1d
|
|
jg .loop_transpose5
|
|
|
|
TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
|
|
SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
|
|
SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
|
|
|
|
TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
|
|
SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
|
|
|
|
TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
|
|
|
|
ret
|
|
%endmacro
|
|
|
|
; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
|
|
; %1 = bitdepth
|
|
%macro IDCT_32x32 1
|
|
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
|
|
mov r1d, 7
|
|
.loop32:
|
|
TR_32x4 8 * r1, %1, 1
|
|
dec r1d
|
|
jge .loop32
|
|
|
|
call hevc_idct_transpose_32x32_ %+ cpuname
|
|
|
|
mov r1d, 7
|
|
.loop32_2:
|
|
TR_32x4 8 * r1, %1, 0
|
|
dec r1d
|
|
jge .loop32_2
|
|
|
|
TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
|
|
%endmacro
|
|
|
|
%macro INIT_IDCT_DC 1
|
|
INIT_MMX mmxext
|
|
IDCT_DC_NL 4, %1
|
|
|
|
INIT_XMM sse2
|
|
IDCT_DC_NL 8, %1
|
|
IDCT_DC 16, 4, %1
|
|
IDCT_DC 32, 16, %1
|
|
|
|
%if HAVE_AVX2_EXTERNAL
|
|
INIT_YMM avx2
|
|
IDCT_DC 16, 2, %1
|
|
IDCT_DC 32, 8, %1
|
|
%endif ;HAVE_AVX2_EXTERNAL
|
|
%endmacro
|
|
|
|
%macro INIT_IDCT 2
|
|
INIT_XMM %2
|
|
%if %1 == 8
|
|
TRANSPOSE_8x8
|
|
%if ARCH_X86_64
|
|
TRANSPOSE_16x16
|
|
TRANSPOSE_32x32
|
|
%endif
|
|
%endif
|
|
%if ARCH_X86_64
|
|
IDCT_32x32 %1
|
|
IDCT_16x16 %1
|
|
%endif
|
|
IDCT_8x8 %1
|
|
IDCT_4x4 %1
|
|
%endmacro
|
|
|
|
INIT_IDCT_DC 8
|
|
INIT_IDCT_DC 10
|
|
INIT_IDCT_DC 12
|
|
INIT_IDCT 8, sse2
|
|
INIT_IDCT 8, avx
|
|
INIT_IDCT 10, sse2
|
|
INIT_IDCT 10, avx
|
|
;INIT_IDCT 12, sse2
|
|
;INIT_IDCT 12, avx
|