;*****************************************************************************
;* x86-optimized functions for blend filter
;*
;* Copyright (C) 2015 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;* Copyright (C) 2018 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Broadcast constants. Naming: ps = packed single float, pd = packed dword,
; pw = packed word, pb = packed byte.
ps_255:   times 4  dd 255.0
pd_32768: times 4  dd 32768
pd_65535: times 4  dd 65535
pw_1:     times 8  dw 1
pw_128:   times 8  dw 128
pw_255:   times 8  dw 255
pb_127:   times 16 db 127
pb_128:   times 16 db 128
pb_255:   times 16 db 255

SECTION .text

;------------------------------------------------------------------------------
; BLEND_INIT name, vector_regs [, is_16bit = 0]
;
; Declares blend_<name> through cglobal and prepares the row loop state:
;   - end   = dword argument r7m (row counter, decremented by BLEND_END)
;   - top, bottom and dst are advanced by width so they point at the row end
;   - width is negated, so x can count from -width up towards zero
; On x86_64 width is loaded from its dword argument and doubled when is_16bit
; is set. On x86_32 dst_linesize and width are not loaded into registers; they
; alias the stack arguments r5mp / r6mp, and width is negated in place there.
; vector_regs is forwarded unchanged as the fourth cglobal argument.
;------------------------------------------------------------------------------
%macro BLEND_INIT 2-3 0
%if ARCH_X86_64
cglobal blend_%1, 6, 9, %2, top, top_linesize, bottom, bottom_linesize, dst, dst_linesize, width, end, x
    mov        widthd, dword widthm
%if %3 ; 16 bit
    add        widthq, widthq             ; doesn't compile on x86_32
%endif
%else
cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end, x
%define dst_linesizeq r5mp
%define widthq r6mp
%endif
    mov          endd, dword r7m
    add          topq, widthq
    add       bottomq, widthq
    add          dstq, widthq
    neg        widthq
%endmacro

;------------------------------------------------------------------------------
; BLEND_END
;
; Steps the three row pointers by their line sizes, decrements the row counter
; and jumps back to .nextrow while it is still positive; otherwise returns.
;------------------------------------------------------------------------------
%macro BLEND_END 0
    add          topq, top_linesizeq
    add       bottomq, bottom_linesizeq
    add          dstq, dst_linesizeq
    sub          endd, 1
    jg .nextrow
    RET
%endmacro

;------------------------------------------------------------------------------
; BLEND_LOOP_BEGIN
;
; Emits the .nextrow label, reloads x with the negative row width, and emits
; the .loop label that starts the per-vector body.
;------------------------------------------------------------------------------
%macro BLEND_LOOP_BEGIN 0
.nextrow:
    mov            xq, widthq
.loop:
%endmacro

;------------------------------------------------------------------------------
; BLEND_LOOP_END
;
; Advances x by one full vector (mmsize bytes), repeats .loop while x is still
; negative, then falls into BLEND_END for the row step / return.
;------------------------------------------------------------------------------
%macro BLEND_LOOP_END 0
        add            xq, mmsize
    jl .loop
BLEND_END
%endmacro

;------------------------------------------------------------------------------
; BLEND_LOAD_WIDEN src (b or w), inter (w or d)
;
; Loads one vector of top and one of bottom at offset x and interleaves each
; with the zero register m4:
;   top    -> m0 (punpckl), m1 (punpckh)
;   bottom -> m2 (punpckl), m3 (punpckh)
; Requires m4 == 0.
;------------------------------------------------------------------------------
%macro BLEND_LOAD_WIDEN 2
        movu             m1, [topq + xq]
        movu             m3, [bottomq + xq]
        punpckl%1%2      m0, m1, m4
        punpckh%1%2      m1, m4
        punpckl%1%2      m2, m3, m4
        punpckh%1%2      m3, m4
%endmacro

;------------------------------------------------------------------------------
; BLEND_ABS_PAIR is_16bit, a, b, tmp_a, tmp_b
;
; Absolute value of two registers: pabsd on both when is_16bit is set,
; otherwise the ABS2 helper (which receives tmp_a / tmp_b as scratch).
;------------------------------------------------------------------------------
%macro BLEND_ABS_PAIR 5
%if %1 ; 16 bit
        pabsd           %2, %2
        pabsd           %3, %3
%else
        ABS2            %2, %3, %4, %5
%endif
%endmacro

;------------------------------------------------------------------------------
; BLEND_SIMPLE name, op [, is_16bit = 0]
;
; Kernel whose per-vector work is the single packed instruction p<op>:
;   dst = top p<op> bottom
;------------------------------------------------------------------------------
%macro BLEND_SIMPLE 2-3 0
BLEND_INIT %1, 2, %3
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        p%2             m0, m1
        movu   [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; GRAINEXTRACT name, src (b or w), inter (w or d) [, is_16bit = 0]
;
; dst = top + half - bottom, computed in the wider "inter" lanes and packed
; back with unsigned saturation. half is 128 (8 bit) or 32768 (16 bit).
;------------------------------------------------------------------------------
%macro GRAINEXTRACT 3-4 0
BLEND_INIT %1, 6, %4
    pxor           m4, m4
%if %4 ; 16 bit
    VBROADCASTI128 m5, [pd_32768]
%else
    VBROADCASTI128 m5, [pw_128]
%endif
BLEND_LOOP_BEGIN
        BLEND_LOAD_WIDEN %2, %3

        padd%3           m0, m5
        padd%3           m1, m5
        psub%3           m0, m2
        psub%3           m1, m3

        packus%3%2       m0, m1

        movu    [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; MULTIPLY a, b, pw_1
;
; Word lanes holding 8-bit values: with t = a * b + 1,
;   a = (t + (t >> 8)) >> 8                     (used as a * b / 255)
; Overwrites both a and b.
;------------------------------------------------------------------------------
%macro MULTIPLY 3
    pmullw          %1, %2               ; xxxxxxxx  a * b
    paddw           %1, %3
    psrlw           %2, %1, 8
    paddw           %1, %2
    psrlw           %1, 8                ; 00xx00xx  a * b / 255
%endmacro

;------------------------------------------------------------------------------
; SCREEN a, b, pw_1, pw_255
;
; a = 255 - MULTIPLY(255 - a, 255 - b). The xor with pw_255 performs the
; "255 - v" step on word lanes that hold 8-bit values. Overwrites a and b.
;------------------------------------------------------------------------------
%macro SCREEN 4
    pxor            %1, %4               ; 00xx00xx  255 - a
    pxor            %2, %4
    MULTIPLY        %1, %2, %3
    pxor            %1, %4               ; 00xx00xx  255 - x / 255
%endmacro

;------------------------------------------------------------------------------
; BLEND_MULTIPLY
;
; 8-bit kernel blend_multiply: widen to words, MULTIPLY, pack back to bytes.
;------------------------------------------------------------------------------
%macro BLEND_MULTIPLY 0
BLEND_INIT multiply, 6
    pxor           m4, m4
    VBROADCASTI128 m5, [pw_1]
BLEND_LOOP_BEGIN
        BLEND_LOAD_WIDEN b, w

        MULTIPLY         m0, m2, m5
        MULTIPLY         m1, m3, m5

        packuswb         m0, m1
        movu    [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; BLEND_SCREEN
;
; 8-bit kernel blend_screen: widen to words, SCREEN, pack back to bytes.
;------------------------------------------------------------------------------
%macro BLEND_SCREEN 0
BLEND_INIT screen, 7
    pxor           m4, m4
    VBROADCASTI128 m5, [pw_1]
    VBROADCASTI128 m6, [pw_255]
BLEND_LOOP_BEGIN
        BLEND_LOAD_WIDEN b, w

        SCREEN           m0, m2, m5, m6
        SCREEN           m1, m3, m5, m6

        packuswb         m0, m1
        movu    [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; AVERAGE name, size (b or w) [, is_16bit = 0]
;
; dst = ~pavg(~top, ~bottom), with m2 holding all ones for the complements.
; pavg rounds its result up; complementing the inputs and the output turns
; that into a rounded-down average.
;------------------------------------------------------------------------------
%macro AVERAGE 2-3 0
BLEND_INIT %1, 3, %3
    pcmpeqb        m2, m2
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        pxor            m0, m2
        pxor            m1, m2
        pavg%2          m0, m1
        pxor            m0, m2
        movu   [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; GRAINMERGE name, src (b or w), inter (w or d) [, is_16bit = 0]
;
; dst = top + bottom - half, computed in the wider "inter" lanes and packed
; back with unsigned saturation. half is 128 (8 bit) or 32768 (16 bit).
;------------------------------------------------------------------------------
%macro GRAINMERGE 3-4 0
BLEND_INIT %1, 6, %4
    pxor           m4, m4
%if %4 ; 16 bit
    VBROADCASTI128 m5, [pd_32768]
%else
    VBROADCASTI128 m5, [pw_128]
%endif
BLEND_LOOP_BEGIN
        BLEND_LOAD_WIDEN %2, %3

        padd%3           m0, m2
        padd%3           m1, m3
        psub%3           m0, m5
        psub%3           m1, m5

        packus%3%2       m0, m1

        movu    [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; HARDMIX
;
; 8-bit kernel blend_hardmix: dst = 255 where top >= 255 - bottom, else 0.
; bottom ^ 127 equals (255 - bottom) ^ 128 and top ^ 128 biases top into the
; signed range, so the signed pcmpgtb acts as the unsigned test
; (255 - bottom) > top; the final xor with 255 inverts that mask.
;------------------------------------------------------------------------------
%macro HARDMIX 0
BLEND_INIT hardmix, 5
    VBROADCASTI128 m2, [pb_255]
    VBROADCASTI128 m3, [pb_128]
    VBROADCASTI128 m4, [pb_127]
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        pxor            m1, m4
        pxor            m0, m3
        pcmpgtb         m1, m0
        pxor            m1, m2
        movu   [dstq + xq], m1
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; DIVIDE
;
; 8-bit kernel blend_divide: dst = min(top / bottom * 255, 255) evaluated in
; single precision and truncated back to an integer. Each iteration moves
; mmsize / 4 bytes through movd, so this kernel has its own loop tail instead
; of the full-vector BLEND_LOOP_END.
;------------------------------------------------------------------------------
%macro DIVIDE 0
BLEND_INIT divide, 4
    pxor           m2, m2
    mova           m3, [ps_255]
BLEND_LOOP_BEGIN
        movd            m0, [topq + xq]      ; 000000xx
        movd            m1, [bottomq + xq]
        punpcklbw       m0, m2               ; 00000x0x
        punpcklbw       m1, m2
        punpcklwd       m0, m2               ; 000x000x
        punpcklwd       m1, m2

        cvtdq2ps        m0, m0
        cvtdq2ps        m1, m1
        divps           m0, m1               ; a / b
        mulps           m0, m3               ; a / b * 255
        minps           m0, m3
        cvttps2dq       m0, m0

        packssdw        m0, m0               ; 00000x0x
        packuswb        m0, m0               ; 000000xx
        movd   [dstq + xq], m0
        add             xq, mmsize / 4
    jl .loop
BLEND_END
%endmacro

;------------------------------------------------------------------------------
; PHOENIX name, size (b or w) [, is_16bit = 0]
;
; dst = (all_ones - max(top, bottom)) + min(top, bottom), using unsigned
; saturating subtract and add. m3 is loaded from pb_255, i.e. every bit set,
; for both sample sizes.
;------------------------------------------------------------------------------
%macro PHOENIX 2-3 0
BLEND_INIT %1, 4, %3
    VBROADCASTI128 m3, [pb_255]
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        mova            m2, m0
        pminu%2         m0, m1               ; m0 = min(top, bottom)
        pmaxu%2         m1, m2               ; m1 = max(top, bottom)
        mova            m2, m3
        psubus%2        m2, m1
        paddus%2        m2, m0
        movu   [dstq + xq], m2
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; DIFFERENCE name, src (b or w), inter (w or d) [, is_16bit = 0]
;
; dst = |top - bottom|, computed in the wider "inter" lanes (m2 is the zero
; register used for widening) and packed back with unsigned saturation.
;------------------------------------------------------------------------------
%macro DIFFERENCE 3-4 0
BLEND_INIT %1, 5, %4
    pxor           m2, m2
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        punpckh%2%3     m3, m0, m2
        punpckl%2%3     m0, m2
        punpckh%2%3     m4, m1, m2
        punpckl%2%3     m1, m2
        psub%3          m0, m1
        psub%3          m3, m4
        BLEND_ABS_PAIR  %4, m0, m3, m1, m4
        packus%3%2      m0, m3
        movu   [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; EXTREMITY name, src (b or w), inter (w or d) [, is_16bit = 0]
;
; dst = |max - top - bottom|, where max is 255 (8 bit) or 65535 (16 bit),
; computed in the wider "inter" lanes and packed with unsigned saturation.
;------------------------------------------------------------------------------
%macro EXTREMITY 3-4 0
BLEND_INIT %1, 8, %4
    pxor           m2, m2
%if %4 ; 16 bit
    VBROADCASTI128 m4, [pd_65535]
%else
    VBROADCASTI128 m4, [pw_255]
%endif
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        punpckh%2%3     m5, m0, m2
        punpckl%2%3     m0, m2
        punpckh%2%3     m6, m1, m2
        punpckl%2%3     m1, m2
        psub%3          m3, m4, m0
        psub%3          m7, m4, m5
        psub%3          m3, m1
        psub%3          m7, m6
        BLEND_ABS_PAIR  %4, m3, m7, m1, m6
        packus%3%2      m3, m7
        movu   [dstq + xq], m3
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; NEGATION name, src (b or w), inter (w or d) [, is_16bit = 0]
;
; dst = max - |max - top - bottom|, where max is 255 (8 bit) or 65535
; (16 bit), computed in the wider "inter" lanes and packed with unsigned
; saturation.
;------------------------------------------------------------------------------
%macro NEGATION 3-4 0
BLEND_INIT %1, 8, %4
    pxor           m2, m2
%if %4 ; 16 bit
    VBROADCASTI128 m4, [pd_65535]
%else
    VBROADCASTI128 m4, [pw_255]
%endif
BLEND_LOOP_BEGIN
        movu            m0, [topq + xq]
        movu            m1, [bottomq + xq]
        punpckh%2%3     m5, m0, m2
        punpckl%2%3     m0, m2
        punpckh%2%3     m6, m1, m2
        punpckl%2%3     m1, m2
        psub%3          m3, m4, m0
        psub%3          m7, m4, m5
        psub%3          m3, m1
        psub%3          m7, m6
        BLEND_ABS_PAIR  %4, m3, m7, m1, m6
        psub%3          m0, m4, m3
        psub%3          m1, m4, m7
        packus%3%2      m0, m1
        movu   [dstq + xq], m0
BLEND_LOOP_END
%endmacro

;------------------------------------------------------------------------------
; Kernel instantiations
;------------------------------------------------------------------------------

; SSE2: all 8-bit kernels
INIT_XMM sse2
BLEND_SIMPLE xor,      xor
BLEND_SIMPLE or,       or
BLEND_SIMPLE and,      and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken,   minub
BLEND_SIMPLE lighten,  maxub
GRAINEXTRACT grainextract, b, w
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE      average, b
GRAINMERGE   grainmerge, b, w
HARDMIX
PHOENIX      phoenix, b
DIFFERENCE   difference, b, w
DIVIDE
EXTREMITY    extremity, b, w
NEGATION     negation, b, w

; SSE2: 16-bit kernels, x86_64 only
%if ARCH_X86_64
BLEND_SIMPLE addition_16, addusw, 1
BLEND_SIMPLE and_16,      and,    1
BLEND_SIMPLE or_16,       or,     1
BLEND_SIMPLE subtract_16, subusw, 1
BLEND_SIMPLE xor_16,      xor,    1
AVERAGE      average_16,  w,      1
%endif

; SSSE3: 8-bit kernels that go through ABS2
INIT_XMM ssse3
DIFFERENCE   difference, b, w
EXTREMITY    extremity,  b, w
NEGATION     negation,   b, w

; SSE4: remaining 16-bit kernels, x86_64 only
INIT_XMM sse4
%if ARCH_X86_64
BLEND_SIMPLE darken_16,   minuw, 1
BLEND_SIMPLE lighten_16,  maxuw, 1
GRAINEXTRACT grainextract_16, w, d, 1
GRAINMERGE   grainmerge_16,   w, d, 1
PHOENIX      phoenix_16,      w,    1
DIFFERENCE   difference_16,   w, d, 1
EXTREMITY    extremity_16,    w, d, 1
NEGATION     negation_16,     w, d, 1
%endif

; AVX2: 8-bit kernels (DIVIDE is not instantiated here), then 16-bit kernels
; on x86_64 only
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BLEND_SIMPLE xor,      xor
BLEND_SIMPLE or,       or
BLEND_SIMPLE and,      and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken,   minub
BLEND_SIMPLE lighten,  maxub
GRAINEXTRACT grainextract, b, w
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE      average, b
GRAINMERGE   grainmerge, b, w
HARDMIX
PHOENIX      phoenix, b
DIFFERENCE   difference, b, w
EXTREMITY    extremity, b, w
NEGATION     negation, b, w

%if ARCH_X86_64
BLEND_SIMPLE addition_16, addusw, 1
BLEND_SIMPLE and_16,      and,    1
BLEND_SIMPLE darken_16,   minuw,  1
BLEND_SIMPLE lighten_16,  maxuw,  1
BLEND_SIMPLE or_16,       or,     1
BLEND_SIMPLE subtract_16, subusw, 1
BLEND_SIMPLE xor_16,      xor,    1
GRAINEXTRACT grainextract_16, w, d, 1
AVERAGE      average_16,      w,    1
GRAINMERGE   grainmerge_16,   w, d, 1
PHOENIX      phoenix_16,      w,    1
DIFFERENCE   difference_16,   w, d, 1
EXTREMITY    extremity_16,    w, d, 1
NEGATION     negation_16,     w, d, 1
%endif
%endif