;******************************************************************************
;* x86 optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_1991: times 4 dw 9,-1

cextern pw_1
cextern pw_2
cextern pw_8
cextern pw_16

SECTION .text

; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
%macro COMPOSE_53iL0 4
    paddw   %2, %3
    paddw   %2, %4
    psraw   %2, 2
    psubw   %1, %2
%endm

; m1 = %1 + (-m0 + 9*m1 + 9*%2 - %3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered  m3: pw_8  m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3
    paddw   m1, %2
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
%if %0 > 3
    movu    %1, %4
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2
    paddw   m1, %1
%endm

%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]
    paddw   m1, [b3q+2*widthq]
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova    m3, [pw_1]
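    ; inverse Haar lifting, per element (what the loop below computes):
    ;   b0[i] -= (b1[i] + 1) >> 1
    ;   b1[i] += b0[i]          ; uses the updated b0[i]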
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov    w2d, wd
    xor    xq, xq
    shr    w2d, 1
    lea    b_w2q, [bq+wq]
    mova   m3, [pw_1]
.lowpass_loop:
    movu   m1, [b_w2q + 2*xq]
    mova   m0, [bq + 2*xq]
    paddw  m1, m3
    psraw  m1, 1
    psubw  m0, m1
    mova   [tmpq + 2*xq], m0
    add    xq, mmsize/2
    cmp    xq, w2q
    jl     .lowpass_loop

    xor    xq, xq
    and    w2q, ~(mmsize/2 - 1)
    cmp    w2q, mmsize/2
    jl     .end

.highpass_loop:
    movu   m1, [b_w2q + 2*xq]
    mova   m0, [tmpq + 2*xq]
    paddw  m1, m0

    ; shift and interleave
%if %2 == 1
    paddw  m0, m3
    paddw  m1, m3
    psraw  m0, 1
    psraw  m1, 1
%endif
    mova   m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova   [bq+4*xq], m0
    mova   [bq+4*xq+mmsize], m2

    add    xq, mmsize/2
    cmp    xq, w2q
    jl     .highpass_loop
.end:
    RET
%endmacro

INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov    w2d, wd
    xor    xd, xd
    shr    w2d, 1
    lea    b_w2q, [bq+wq]
    movu   m4, [bq+wq]
    mova   m7, [pw_2]
    pslldq m4, 14
.lowpass_loop:
    movu   m1, [b_w2q + 2*xq]
    mova   m0, [bq + 2*xq]
    mova   m2, m1
    palignr m1, m4, 14
    mova   m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova   [tmpq + 2*xq], m0
    add    xd, mmsize/2
    cmp    xd, w2d
    jl     .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor    xd, xd
    and    w2d, ~(mmsize/2 - 1)
    cmp    w2d, mmsize/2
    jl     .end

    mova   m7, [tmpq-mmsize]
    mova   m0, [tmpq]
    mova   m5, [pw_1]
    mova   m3, [pw_8]
    mova   m4, [pw_1991]
.highpass_loop:
    mova   m6, m0
    palignr m0, m7, 14
    mova   m7, [tmpq + 2*xq + 16]
    mova   m1, m7
    mova   m2, m7
    palignr m1, m6, 2
    palignr m2, m6, 4
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova   m0, m7
    mova   m7, m6

    ; shift and interleave
    paddw  m6, m5
    paddw  m1, m5
    psraw  m6, 1
    psraw  m1, 1
    mova   m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova   [bq+4*xq], m6
    mova   [bq+4*xq+mmsize], m2

    add    xd, mmsize/2
    cmp    xd, w2d
    jl     .highpass_loop
.end:
    RET

INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
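
; Rough C equivalent of horizontal_compose_haar%2i above, derived from the asm
; as an illustrative sketch (not FFmpeg's C reference; assumes w is even and
; ignores the tail elements the SIMD loops leave for the C fallback; `shift`
; corresponds to the %2 macro argument):
;
;     for (x = 0; x < w/2; x++)                        /* lowpass into tmp   */
;         tmp[x] = b[x] - ((b[w/2 + x] + 1) >> 1);
;     for (x = 0; x < w/2; x++) {                      /* highpass + deinterleave */
;         IDWTELEM lo = tmp[x];
;         IDWTELEM hi = b[w/2 + x] + lo;
;         b[2*x]     = shift ? (lo + 1) >> 1 : lo;
;         b[2*x + 1] = shift ? (hi + 1) >> 1 : hi;
;     }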