ffmpeg/libswscale/x86/range_convert.asm
Ramiro Polla be108ebcf4 swscale/x86/range_convert: update sse2 and avx2 range_convert functions to new API
chrRangeFromJpeg8_1920_c:    2127.4 (1.00x)
chrRangeFromJpeg8_1920_sse2:  816.0 (2.61x)  813.5 (2.62x)
chrRangeFromJpeg8_1920_avx2:  408.9 (5.20x)  405.4 (5.25x)
chrRangeToJpeg8_1920_c:      3166.9 (1.00x)
chrRangeToJpeg8_1920_sse2:    815.0 (3.89x)  815.0 (3.89x)
chrRangeToJpeg8_1920_avx2:    404.5 (7.83x)  405.5 (7.81x)
lumRangeFromJpeg8_1920_c:    1263.0 (1.00x)
lumRangeFromJpeg8_1920_sse2:  411.0 (3.07x)  413.2 (3.06x)
lumRangeFromJpeg8_1920_avx2:  200.5 (6.30x)  201.9 (6.26x)
lumRangeToJpeg8_1920_c:      1886.8 (1.00x)
lumRangeToJpeg8_1920_sse2:    412.0 (4.58x)  408.9 (4.61x)
lumRangeToJpeg8_1920_avx2:    208.5 (9.05x)  205.7 (9.17x)
2024-12-05 21:10:29 +01:00

132 lines
3.9 KiB
NASM

;******************************************************************************
;* Copyright (c) 2024 Ramiro Polla
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
;                              uint32_t coeff, int64_t offset);
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
;                                uint32_t coeff, int64_t offset);
;
; Rescales a row of 16-bit samples in place. Per sample the code computes
;     dst[i] = packssdw((dst[i] * coeff + offset) >> 14)
; using a signed 16x16 multiply, a 32-bit add, an arithmetic right shift and
; a signed-saturating pack back to 16 bits. Only the low 16 bits of coeff
; (the high word is multiplied by a zero word) and the low 32 bits of offset
; (that is all the broadcast copies) take part in the computation.
;
; The macro argument (To / From) is only used to build the symbol name; the
; generated body is the same for both directions.
;
; The loop always runs at least once and consumes mmsize bytes per pass, so
; the last pass may touch bytes beyond dst + 2*width when 2*width is not a
; multiple of mmsize.
; NOTE(review): presumably callers guarantee a padded buffer - confirm.
;
; Vector register roles:
;     m0 = coeff  (broadcast to every dword)
;     m1 = offset (low dword, broadcast to every dword)
;     m2 = zero
;     m3 = samples / low-half products / result
;     m4 = high-half products
;-----------------------------------------------------------------------------
%macro LUMCONVERTRANGE 1
cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
    ; --- loop-invariant vectors -------------------------------------------
    pxor                 m2, m2             ; zero words for the unpacks below
    movd                xm0, coeffd
%if ARCH_X86_64
    movq                xm1, offsetq        ; 64-bit argument lives in a GPR
%else
    movq                xm1, offsetm        ; 64-bit argument lives on the stack
%endif
    VBROADCASTSS         m0, xm0
    VBROADCASTSS         m1, xm1

    ; --- turn (dst, width) into (end pointer, negative byte index) ---------
    shl              widthd, 1              ; samples -> bytes
    add                dstq, widthq         ; dstq = one past the last sample
    neg              widthq                 ; index counts up towards zero

.loop:
    movu                 m3, [dstq+widthq]
    punpckhwd            m4, m3, m2         ; upper samples as {sample, 0} word pairs
    punpcklwd            m3, m2             ; lower samples as {sample, 0} word pairs
    pmaddwd              m3, m0             ; sample * coeff.lo16 + 0 * coeff.hi16
    pmaddwd              m4, m0
    paddd                m3, m1
    paddd                m4, m1
    psrad                m3, 14
    psrad                m4, 14
    packssdw             m3, m4             ; back to int16 with signed saturation
    movu      [dstq+widthq], m3
    add              widthq, mmsize
    jl .loop                                ; continue while the index is negative
    RET
%endmacro
;-----------------------------------------------------------------------------
; chrConvertRange
;
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
;                              uint32_t coeff, int64_t offset);
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
;                                uint32_t coeff, int64_t offset);
;
; Rescales two rows of 16-bit samples (dstU and dstV) in place, applying the
; same coeff and offset to both. Per sample the code computes
;     dst[i] = packssdw((dst[i] * coeff + offset) >> 14)
; using a signed 16x16 multiply, a 32-bit add, an arithmetic right shift and
; a signed-saturating pack back to 16 bits. Only the low 16 bits of coeff
; (the high word is multiplied by a zero word) and the low 32 bits of offset
; (that is all the broadcast copies) take part in the computation.
;
; The macro argument (To / From) is only used to build the symbol name; the
; generated body is the same for both directions.
;
; The loop always runs at least once and consumes mmsize bytes of each row
; per pass, so the last pass may touch bytes beyond 2*width in both rows when
; 2*width is not a multiple of mmsize.
; NOTE(review): presumably callers guarantee padded buffers - confirm.
;
; Vector register roles:
;     m0 = coeff  (broadcast to every dword)
;     m1 = offset (low dword, broadcast to every dword)
;     m2 = zero
;     m3 = U samples / low-half products / result,  m4 = U high-half products
;     m5 = V samples / low-half products / result,  m6 = V high-half products
;-----------------------------------------------------------------------------
%macro CHRCONVERTRANGE 1
cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
    ; --- loop-invariant vectors -------------------------------------------
    pxor                 m2, m2             ; zero words for the unpacks below
    movd                xm0, coeffd
%if ARCH_X86_64
    movq                xm1, offsetq        ; 64-bit argument lives in a GPR
%else
    movq                xm1, offsetm        ; 64-bit argument lives on the stack
%endif
    VBROADCASTSS         m0, xm0
    VBROADCASTSS         m1, xm1

    ; --- turn (dstU, dstV, width) into (end pointers, negative byte index) --
    shl              widthd, 1              ; samples -> bytes
    add               dstUq, widthq         ; one past the last U sample
    add               dstVq, widthq         ; one past the last V sample
    neg              widthq                 ; index counts up towards zero

.loop:
    movu                 m3, [dstUq+widthq]
    movu                 m5, [dstVq+widthq]
    punpckhwd            m4, m3, m2         ; upper U samples as {sample, 0} pairs
    punpckhwd            m6, m5, m2         ; upper V samples as {sample, 0} pairs
    punpcklwd            m3, m2             ; lower U samples as {sample, 0} pairs
    punpcklwd            m5, m2             ; lower V samples as {sample, 0} pairs
    pmaddwd              m3, m0             ; sample * coeff.lo16 + 0 * coeff.hi16
    pmaddwd              m4, m0
    pmaddwd              m5, m0
    pmaddwd              m6, m0
    paddd                m3, m1
    paddd                m4, m1
    paddd                m5, m1
    paddd                m6, m1
    psrad                m3, 14
    psrad                m4, 14
    psrad                m5, 14
    psrad                m6, 14
    packssdw             m3, m4             ; back to int16 with signed saturation
    packssdw             m5, m6
    movu     [dstUq+widthq], m3
    movu     [dstVq+widthq], m5
    add              widthq, mmsize
    jl .loop                                ; continue while the index is negative
    RET
%endmacro
; Expands to the four range-conversion entry points (luma/chroma, To/From)
; for whichever vector width and CPU flags the preceding INIT_* selected.
%macro DEFINE_RANGE_CONVERT_FUNCS 0
    LUMCONVERTRANGE To
    CHRCONVERTRANGE To
    LUMCONVERTRANGE From
    CHRCONVERTRANGE From
%endmacro

; 128-bit SSE2 versions.
INIT_XMM sse2
DEFINE_RANGE_CONVERT_FUNCS

; 256-bit AVX2 versions, only when the build enables external AVX2 assembly.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DEFINE_RANGE_CONVERT_FUNCS
%endif