diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 2e73421b28..61d68ed238 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -151,11 +151,11 @@ static void store_slice2_c(uint8_t *dst, int16_t *src, } } -static void mul_thrmat_c(FSPPContext *p, int q) +static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q) { int a; for (a = 0; a < 64; a++) - ((int16_t *)p->threshold_mtx)[a] = q * ((int16_t *)p->threshold_mtx_noq)[a];//ints faster in C + thr_adr[a] = q * thr_adr_noq[a]; } static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, @@ -220,7 +220,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, t = qp_store[qy + (t >> qpsh)]; t = norm_qscale(t, p->qscale_type); - if (t != p->prev_q) p->prev_q = t, p->mul_thrmat(p, t); + if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t); p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT } p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1)); @@ -378,7 +378,7 @@ static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int } } -static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt) +static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; @@ -440,7 +440,7 @@ static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_strid } } -static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt) +static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; @@ -582,7 +582,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) } if (fspp->qp) - fspp->prev_q = fspp->qp, fspp->mul_thrmat(fspp, fspp->qp); + fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp); /* if we are not in a constant user quantizer mode and we don't want to use * the quantizers from the B-frames (B-frames often have a higher QP), we diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fspp.h index db860c6413..237ffb1dcf 100644 --- a/libavfilter/vf_fspp.h +++ b/libavfilter/vf_fspp.h @@ -79,16 +79,16 @@ typedef struct FSPPContext { ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*mul_thrmat)(struct FSPPContext *fspp, int q); + void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void (*column_fidct)(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void (*row_idct)(int16_t *workspace, int16_t *output_adr, - int output_stride, int cnt); + ptrdiff_t output_stride, int cnt); void (*row_fdct)(int16_t *data, const uint8_t *pixels, - int line_size, int cnt); + ptrdiff_t line_size, int cnt); } FSPPContext; diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 4f9c83d159..d9265c94a6 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,4 +1,4 @@ -OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o +OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o @@ -10,6 +10,7 @@ OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm new file mode 100644 index 0000000000..5ad4275ed8 --- /dev/null +++ b/libavfilter/x86/vf_fspp.asm @@ -0,0 +1,727 @@ +;***************************************************************************** +;* x86-optimized functions for fspp filter +;* +;* Copyright (c) 2003 Michael Niedermayer +;* Copyright (C) 2005 Nikolaj Poroshin +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \ + 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \ + 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \ + 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21 +pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) +pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) +pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) +pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) +pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) +pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) +pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) +pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14) +pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14) +pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14) +pw_4: times 4 dw 4 +pw_2: times 4 dw 2 + +SECTION .text + +%define DCTSIZE 8 + +INIT_MMX mmx + +;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +%if ARCH_X86_64 +cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +%else +cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +%define dst_strideq r2m +%define src_strideq r3m + mov widthq, r4m + mov dither_heightq, r5m + mov ditherq, r6m ; log2_scale +%endif + add widthq, 7 + mov tmpq, src_strideq + and widthq, ~7 + sub dst_strideq, widthq + movd m5, ditherq ; log2_scale + xor ditherq, -1 ; log2_scale + mov tmp2q, tmpq + add ditherq, 7 ; log2_scale + neg tmpq + sub tmp2q, widthq + movd m2, ditherq ; log2_scale + add tmp2q, tmp2q + lea ditherq, [pb_dither] + mov src_strideq, tmp2q + shl tmpq, 4 + lea dither_heightq, [ditherq+dither_heightq*8] + +.loop_height: + movq m3, [ditherq] + movq m4, m3 + pxor m7, m7 + punpcklbw m3, m7 + punpckhbw m4, m7 + mov tmp2q, widthq + psraw m3, m5 + psraw m4, m5 + +.loop_width: + movq [srcq+tmpq], m7 + movq m0, [srcq] + movq m1, [srcq+8] + movq [srcq+tmpq+8], m7 + paddw m0, m3 + paddw m1, m4 + movq [srcq], m7 + psraw m0, m2 + psraw m1, m2 + movq [srcq+8], m7 + packuswb m0, m1 + add srcq, 16 + movq [dstq], m0 + add dstq, 8 + sub tmp2q, 8 + jg .loop_width + + add srcq, src_strideq + add ditherq, 8 + add dstq, dst_strideq + cmp ditherq, dither_heightq + jl .loop_height + RET + +;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +%if ARCH_X86_64 +cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +%else +cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +%define dst_strideq r2m +%define src_strideq r3m + mov dstq, dstm + mov srcq, srcm + mov widthq, r4m + mov dither_heightq, r5m + mov ditherq, r6m ; log2_scale +%endif + add widthq, 7 + mov tmpq, src_strideq + and widthq, ~7 + sub dst_strideq, widthq + movd m5, ditherq ; log2_scale + xor ditherq, -1 ; log2_scale + mov tmp2q, tmpq + add ditherq, 7 ; log2_scale + sub tmp2q, widthq + movd m2, ditherq ; log2_scale + add tmp2q, tmp2q + lea ditherq, [pb_dither] + mov src_strideq, tmp2q + shl tmpq, 5 + lea dither_heightq, [ditherq+dither_heightq*8] + +.loop_height: + movq m3, [ditherq] + movq m4, m3 + pxor m7, m7 + punpcklbw m3, m7 + punpckhbw m4, m7 + mov tmp2q,widthq + psraw m3, m5 + psraw m4, m5 + +.loop_width: + movq m0, [srcq] + movq m1, [srcq+8] + paddw m0, m3 + paddw m0, [srcq+tmpq] + paddw m1, m4 + movq m6, [srcq+tmpq+8] + movq [srcq+tmpq], m7 + psraw m0, m2 + paddw m1, m6 + movq [srcq+tmpq+8], m7 + psraw m1, m2 + packuswb m0, m1 + movq [dstq], m0 + add srcq, 16 + add dstq, 8 + sub tmp2q, 8 + jg .loop_width + + add srcq, src_strideq + add ditherq, 8 + add dstq, dst_strideq + cmp ditherq, dither_heightq + jl .loop_height + RET + +;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +cglobal mul_thrmat, 3, 3, 0, thrn, thr, q + movd m7, qd + movq m0, [thrnq] + punpcklwd m7, m7 + movq m1, [thrnq+8] + punpckldq m7, m7 + pmullw m0, m7 + movq m2, [thrnq+8*2] + pmullw m1, m7 + movq m3, [thrnq+8*3] + pmullw m2, m7 + movq [thrq], m0 + movq m4, [thrnq+8*4] + pmullw m3, m7 + movq [thrq+8], m1 + movq m5, [thrnq+8*5] + pmullw m4, m7 + movq [thrq+8*2], m2 + movq m6, [thrnq+8*6] + pmullw m5, m7 + movq [thrq+8*3], m3 + movq m0, [thrnq+8*7] + pmullw m6, m7 + movq [thrq+8*4], m4 + movq m1, [thrnq+8*7+8] + pmullw m0, m7 + movq [thrq+8*5], m5 + movq m2, [thrnq+8*7+8*2] + pmullw m1, m7 + movq [thrq+8*6], m6 + movq m3, [thrnq+8*7+8*3] + pmullw m2, m7 + movq [thrq+8*7], m0 + movq m4, [thrnq+8*7+8*4] + pmullw m3, m7 + movq [thrq+8*7+8], m1 + movq m5, [thrnq+8*7+8*5] + pmullw m4, m7 + movq [thrq+8*7+8*2], m2 + movq m6, [thrnq+8*7+8*6] + pmullw m5, m7 + movq [thrq+8*7+8*3], m3 + movq m0, [thrnq+14*8] + pmullw m6, m7 + movq [thrq+8*7+8*4], m4 + movq m1, [thrnq+14*8+8] + pmullw m0, m7 + movq [thrq+8*7+8*5], m5 + pmullw m1, m7 + movq [thrq+8*7+8*6], m6 + movq [thrq+14*8], m0 + movq [thrq+14*8+8], m1 + RET + +%macro COLUMN_FDCT 1-3 0, 0 + movq m1, [srcq+DCTSIZE*0*2] + movq m7, [srcq+DCTSIZE*3*2] + movq m0, m1 + paddw m1, [srcq+DCTSIZE*7*2] + movq m3, m7 + paddw m7, [srcq+DCTSIZE*4*2] + movq m5, m1 + movq m6, [srcq+DCTSIZE*1*2] + psubw m1, m7 + movq m2, [srcq+DCTSIZE*2*2] + movq m4, m6 + paddw m6, [srcq+DCTSIZE*6*2] + paddw m5, m7 + paddw m2, [srcq+DCTSIZE*5*2] + movq m7, m6 + paddw m6, m2 + psubw m7, m2 + movq m2, m5 + paddw m5, m6 + psubw m2, m6 + paddw m7, m1 + movq m6, [thrq+4*16+%2] + psllw m7, 2 + psubw m5, [thrq+%2] + psubw m2, m6 + paddusw m5, [thrq+%2] + paddusw m2, m6 + pmulhw m7, [pw_2D41] + paddw m5, [thrq+%2] + paddw m2, m6 + psubusw m5, [thrq+%2] + psubusw m2, m6 + paddw m5, [pw_2] + movq m6, m2 + paddw m2, m5 + psubw m5, m6 + movq m6, m1 + paddw m1, m7 + psubw m1, [thrq+2*16+%2] + psubw m6, m7 + movq m7, [thrq+6*16+%2] + psraw m5, 2 + paddusw m1, [thrq+2*16+%2] + psubw m6, m7 + paddw m1, [thrq+2*16+%2] + paddusw m6, m7 + psubusw m1, [thrq+2*16+%2] + paddw m6, m7 + psubw m3, [srcq+DCTSIZE*4*2] + psubusw m6, m7 + movq m7, m1 + psraw m2, 2 + psubw m4, [srcq+DCTSIZE*6*2] + psubw m1, m6 + psubw m0, [srcq+DCTSIZE*7*2] + paddw m6, m7 + psraw m6, 2 + movq m7, m2 + pmulhw m1, [pw_5A82] + paddw m2, m6 + movq [rsp], m2 + psubw m7, m6 + movq m2, [srcq+DCTSIZE*2*2] + psubw m1, m6 + psubw m2, [srcq+DCTSIZE*5*2] + movq m6, m5 + movq [rsp+8*3], m7 + paddw m3, m2 + paddw m2, m4 + paddw m4, m0 + movq m7, m3 + psubw m3, m4 + psllw m3, 2 + psllw m7, 2 + pmulhw m3, [pw_187E] + psllw m4, 2 + pmulhw m7, [pw_22A3] + psllw m2, 2 + pmulhw m4, [pw_539F] + paddw m5, m1 + pmulhw m2, [pw_2D41] + psubw m6, m1 + paddw m7, m3 + movq [rsp+8], m5 + paddw m4, m3 + movq m3, [thrq+3*16+%2] + movq m1, m0 + movq [rsp+8*2], m6 + psubw m1, m2 + paddw m0, m2 + movq m5, m1 + movq m2, [thrq+5*16+%2] + psubw m1, m7 + paddw m5, m7 + psubw m1, m3 + movq m7, [thrq+16+%2] + psubw m5, m2 + movq m6, m0 + paddw m0, m4 + paddusw m1, m3 + psubw m6, m4 + movq m4, [thrq+7*16+%2] + psubw m0, m7 + psubw m6, m4 + paddusw m5, m2 + paddusw m6, m4 + paddw m1, m3 + paddw m5, m2 + paddw m6, m4 + psubusw m1, m3 + psubusw m5, m2 + psubusw m6, m4 + movq m4, m1 + por m4, m5 + paddusw m0, m7 + por m4, m6 + paddw m0, m7 + packssdw m4, m4 + psubusw m0, m7 + movd tmpd, m4 + or tmpd, tmpd + jnz %1 + movq m4, [rsp] + movq m1, m0 + pmulhw m0, [pw_3642] + movq m2, m1 + movq m5, [outq+DCTSIZE*0*2] + movq m3, m2 + pmulhw m1, [pw_2441] + paddw m5, m4 + movq m6, [rsp+8] + psraw m3, 2 + pmulhw m2, [pw_0CBB] + psubw m4, m3 + movq m7, [outq+DCTSIZE*1*2] + paddw m5, m3 + movq [outq+DCTSIZE*7*2], m4 + paddw m7, m6 + movq m3, [rsp+8*2] + psubw m6, m0 + movq m4, [outq+DCTSIZE*2*2] + paddw m7, m0 + movq [outq], m5 + paddw m4, m3 + movq [outq+DCTSIZE*6*2], m6 + psubw m3, m1 + movq m5, [outq+DCTSIZE*5*2] + paddw m4, m1 + movq m6, [outq+DCTSIZE*3*2] + paddw m5, m3 + movq m0, [rsp+8*3] + add srcq, 8+%3 + movq [outq+DCTSIZE*1*2], m7 + paddw m6, m0 + movq [outq+DCTSIZE*2*2], m4 + psubw m0, m2 + movq m7, [outq+DCTSIZE*4*2] + paddw m6, m2 + movq [outq+DCTSIZE*5*2], m5 + paddw m7, m0 + movq [outq+DCTSIZE*3*2], m6 + movq [outq+DCTSIZE*4*2], m7 + add outq, 8+%3 +%endmacro + +%macro COLUMN_IDCT 0-1 0 + movq m3, m5 + psubw m5, m1 + psllw m5, 1 + paddw m3, m1 + movq m2, m0 + psubw m0, m6 + movq m1, m5 + psllw m0, 1 + pmulhw m1, [pw_AC62] + paddw m5, m0 + pmulhw m5, [pw_3B21] + paddw m2, m6 + pmulhw m0, [pw_22A3] + movq m7, m2 + movq m4, [rsp] + psubw m2, m3 + psllw m2, 1 + paddw m7, m3 + pmulhw m2, [pw_2D41] + movq m6, m4 + psraw m7, 2 + paddw m4, [outq] + psubw m6, m7 + movq m3, [rsp+8] + paddw m4, m7 + movq [outq+DCTSIZE*7*2], m6 + paddw m1, m5 + movq [outq], m4 + psubw m1, m7 + movq m7, [rsp+8*2] + psubw m0, m5 + movq m6, [rsp+8*3] + movq m5, m3 + paddw m3, [outq+DCTSIZE*1*2] + psubw m5, m1 + psubw m2, m1 + paddw m3, m1 + movq [outq+DCTSIZE*6*2], m5 + movq m4, m7 + paddw m7, [outq+DCTSIZE*2*2] + psubw m4, m2 + paddw m4, [outq+DCTSIZE*5*2] + paddw m7, m2 + movq [outq+DCTSIZE*1*2], m3 + paddw m0, m2 + movq [outq+DCTSIZE*2*2], m7 + movq m1, m6 + paddw m6, [outq+DCTSIZE*4*2] + psubw m1, m0 + paddw m1, [outq+DCTSIZE*3*2] + paddw m6, m0 + movq [outq+DCTSIZE*5*2], m4 + add srcq, 8+%1 + movq [outq+DCTSIZE*4*2], m6 + movq [outq+DCTSIZE*3*2], m1 + add outq, 8+%1 +%endmacro + +;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp +.fdct1: + COLUMN_FDCT .idct1 + jmp .fdct2 + +.idct1: + COLUMN_IDCT + +.fdct2: + COLUMN_FDCT .idct2, 8, 16 + sub cntd, 2 + jnz .fdct1 + RET + +.idct2: + COLUMN_IDCT 16 + sub cntd, 2 + jnz .fdct1 + RET + +;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 + add strideq, strideq + lea stride3q, [strideq+strideq*2] +.loop: + movq m0, [srcq+DCTSIZE*0*2] + movq m1, [srcq+DCTSIZE*1*2] + movq m4, m0 + movq m2, [srcq+DCTSIZE*2*2] + punpcklwd m0, m1 + movq m3, [srcq+DCTSIZE*3*2] + punpckhwd m4, m1 + movq m7, m2 + punpcklwd m2, m3 + movq m6, m0 + punpckldq m0, m2 + punpckhdq m6, m2 + movq m5, m0 + punpckhwd m7, m3 + psubw m0, m6 + pmulhw m0, [pw_5A82] + movq m2, m4 + punpckldq m4, m7 + paddw m5, m6 + punpckhdq m2, m7 + movq m1, m4 + psllw m0, 2 + paddw m4, m2 + movq m3, [srcq+DCTSIZE*0*2+8] + psubw m1, m2 + movq m2, [srcq+DCTSIZE*1*2+8] + psubw m0, m5 + movq m6, m4 + paddw m4, m5 + psubw m6, m5 + movq m7, m1 + movq m5, [srcq+DCTSIZE*2*2+8] + paddw m1, m0 + movq [rsp], m4 + movq m4, m3 + movq [rsp+8], m6 + punpcklwd m3, m2 + movq m6, [srcq+DCTSIZE*3*2+8] + punpckhwd m4, m2 + movq m2, m5 + punpcklwd m5, m6 + psubw m7, m0 + punpckhwd m2, m6 + movq m0, m3 + punpckldq m3, m5 + punpckhdq m0, m5 + movq m5, m4 + movq m6, m3 + punpckldq m4, m2 + psubw m3, m0 + punpckhdq m5, m2 + paddw m6, m0 + movq m2, m4 + movq m0, m3 + psubw m4, m5 + pmulhw m0, [pw_AC62] + paddw m3, m4 + pmulhw m3, [pw_3B21] + paddw m2, m5 + pmulhw m4, [pw_22A3] + movq m5, m2 + psubw m2, m6 + paddw m5, m6 + pmulhw m2, [pw_2D41] + paddw m0, m3 + psllw m0, 3 + psubw m4, m3 + movq m6, [rsp] + movq m3, m1 + psllw m4, 3 + psubw m0, m5 + psllw m2, 3 + paddw m1, m0 + psubw m2, m0 + psubw m3, m0 + paddw m4, m2 + movq m0, m7 + paddw m7, m2 + psubw m0, m2 + movq m2, [pw_4] + psubw m6, m5 + paddw m5, [rsp] + paddw m1, m2 + paddw m5, m2 + psraw m1, 3 + paddw m7, m2 + psraw m5, 3 + paddw m5, [dstq] + psraw m7, 3 + paddw m1, [dstq+strideq*1] + paddw m0, m2 + paddw m7, [dstq+strideq*2] + paddw m3, m2 + movq [dstq], m5 + paddw m6, m2 + movq [dstq+strideq*1], m1 + psraw m0, 3 + movq [dstq+strideq*2], m7 + add dstq, stride3q + movq m5, [rsp+8] + psraw m3, 3 + paddw m0, [dstq+strideq*2] + psubw m5, m4 + paddw m3, [dstq+stride3q*1] + psraw m6, 3 + paddw m4, [rsp+8] + paddw m5, m2 + paddw m6, [dstq+strideq*4] + paddw m4, m2 + movq [dstq+strideq*2], m0 + psraw m5, 3 + paddw m5, [dstq] + psraw m4, 3 + paddw m4, [dstq+strideq*1] + add srcq, DCTSIZE*2*4 + movq [dstq+stride3q*1], m3 + movq [dstq+strideq*4], m6 + movq [dstq], m5 + movq [dstq+strideq*1], m4 + sub dstq, stride3q + add dstq, 8 + dec r3d + jnz .loop + RET + +;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); +cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3 + lea stride3q, [strideq+strideq*2] +.loop: + movd m0, [pixq] + pxor m7, m7 + movd m1, [pixq+strideq*1] + punpcklbw m0, m7 + movd m2, [pixq+strideq*2] + punpcklbw m1, m7 + punpcklbw m2, m7 + add pixq,stride3q + movq m5, m0 + movd m3, [pixq+strideq*4] + movq m6, m1 + movd m4, [pixq+stride3q*1] + punpcklbw m3, m7 + psubw m5, m3 + punpcklbw m4, m7 + paddw m0, m3 + psubw m6, m4 + movd m3, [pixq+strideq*2] + paddw m1, m4 + movq [rsp], m5 + punpcklbw m3, m7 + movq [rsp+8], m6 + movq m4, m2 + movd m5, [pixq] + paddw m2, m3 + movd m6, [pixq+strideq*1] + punpcklbw m5, m7 + psubw m4, m3 + punpcklbw m6, m7 + movq m3, m5 + paddw m5, m6 + psubw m3, m6 + movq m6, m0 + movq m7, m1 + psubw m0, m5 + psubw m1, m2 + paddw m7, m2 + paddw m1, m0 + movq m2, m7 + psllw m1, 2 + paddw m6, m5 + pmulhw m1, [pw_2D41] + paddw m7, m6 + psubw m6, m2 + movq m5, m0 + movq m2, m7 + punpcklwd m7, m6 + paddw m0, m1 + punpckhwd m2, m6 + psubw m5, m1 + movq m6, m0 + movq m1, [rsp+8] + punpcklwd m0, m5 + punpckhwd m6, m5 + movq m5, m0 + punpckldq m0, m7 + paddw m3, m4 + punpckhdq m5, m7 + movq m7, m6 + movq [srcq+DCTSIZE*0*2], m0 + punpckldq m6, m2 + movq [srcq+DCTSIZE*1*2], m5 + punpckhdq m7, m2 + movq [srcq+DCTSIZE*2*2], m6 + paddw m4, m1 + movq [srcq+DCTSIZE*3*2], m7 + psllw m3, 2 + movq m2, [rsp] + psllw m4, 2 + pmulhw m4, [pw_2D41] + paddw m1, m2 + psllw m1, 2 + movq m0, m3 + pmulhw m0, [pw_22A3] + psubw m3, m1 + pmulhw m3, [pw_187E] + movq m5, m2 + pmulhw m1, [pw_539F] + psubw m2, m4 + paddw m5, m4 + movq m6, m2 + paddw m0, m3 + movq m7, m5 + paddw m2, m0 + psubw m6, m0 + movq m4, m2 + paddw m1, m3 + punpcklwd m2, m6 + paddw m5, m1 + punpckhwd m4, m6 + psubw m7, m1 + movq m6, m5 + punpcklwd m5, m7 + punpckhwd m6, m7 + movq m7, m2 + punpckldq m2, m5 + sub pixq, stride3q + punpckhdq m7, m5 + movq m5, m4 + movq [srcq+DCTSIZE*0*2+8], m2 + punpckldq m4, m6 + movq [srcq+DCTSIZE*1*2+8], m7 + punpckhdq m5, m6 + movq [srcq+DCTSIZE*2*2+8], m4 + add pixq, 4 + movq [srcq+DCTSIZE*3*2+8], m5 + add srcq, DCTSIZE*4*2 + dec cntd + jnz .loop + RET diff --git a/libavfilter/x86/vf_fspp.c b/libavfilter/x86/vf_fspp.c deleted file mode 100644 index ec24a1ea9f..0000000000 --- a/libavfilter/x86/vf_fspp.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2003 Michael Niedermayer - * Copyright (C) 2005 Nikolaj Poroshin - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with FFmpeg; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavfilter/vf_fspp.h" - -#if HAVE_MMX_INLINE -DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { - { 0, 48, 12, 60, 3, 51, 15, 63, }, - { 32, 16, 44, 28, 35, 19, 47, 31, }, - { 8, 56, 4, 52, 11, 59, 7, 55, }, - { 40, 24, 36, 20, 43, 27, 39, 23, }, - { 2, 50, 14, 62, 1, 49, 13, 61, }, - { 34, 18, 46, 30, 33, 17, 45, 29, }, - { 10, 58, 6, 54, 9, 57, 5, 53, }, - { 42, 26, 38, 22, 41, 25, 37, 21, }, -}; - -//This func reads from 1 slice, 1 and clears 0 & 1 -static void store_slice_mmx(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) -{ - const uint8_t *od = &dither[0][0]; - const uint8_t *end = &dither[height][0]; - width = (width + 7) & ~7; - dst_stride -= width; - - __asm__ volatile( - "mov %5 , %%"REG_d" \n\t" - "mov %6 , %%"REG_S" \n\t" - "mov %7 , %%"REG_D" \n\t" - "mov %1 , %%"REG_a" \n\t" - "movd %%"REG_d" , %%mm5 \n\t" - "xor $-1 , %%"REG_d" \n\t" - "mov %%"REG_a" , %%"REG_c" \n\t" - "add $7 , %%"REG_d" \n\t" - "neg %%"REG_a" \n\t" - "sub %0 , %%"REG_c" \n\t" - "add %%"REG_c" , %%"REG_c" \n\t" - "movd %%"REG_d" , %%mm2 \n\t" - "mov %%"REG_c" , %1 \n\t" - "mov %2 , %%"REG_d" \n\t" - "shl $4 , %%"REG_a" \n\t" - - "2: \n\t" - "movq (%%"REG_d") , %%mm3 \n\t" - "movq %%mm3 , %%mm4 \n\t" - "pxor %%mm7 , %%mm7 \n\t" - "punpcklbw %%mm7 , %%mm3 \n\t" - "punpckhbw %%mm7 , %%mm4 \n\t" - "mov %0 , %%"REG_c" \n\t" - "psraw %%mm5 , %%mm3 \n\t" - "psraw %%mm5 , %%mm4 \n\t" - "1: \n\t" - "movq %%mm7, (%%"REG_S",%%"REG_a") \n\t" - "movq (%%"REG_S") , %%mm0 \n\t" - "movq 8(%%"REG_S"), %%mm1 \n\t" - - "movq %%mm7, 8(%%"REG_S",%%"REG_a")\n\t" - "paddw %%mm3, %%mm0 \n\t" - "paddw %%mm4, %%mm1 \n\t" - - "movq %%mm7, (%%"REG_S") \n\t" - "psraw %%mm2, %%mm0 \n\t" - "psraw %%mm2, %%mm1 \n\t" - - "movq %%mm7, 8(%%"REG_S") \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "add $16, %%"REG_S" \n\t" - - "movq %%mm0, (%%"REG_D") \n\t" - "add $8, %%"REG_D" \n\t" - "sub $8, %%"REG_c" \n\t" - "jg 1b \n\t" - "add %1, %%"REG_S" \n\t" - "add $8, %%"REG_d" \n\t" - "add %3, %%"REG_D" \n\t" - "cmp %4, %%"REG_d" \n\t" - "jl 2b \n\t" - - : - : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end), - "m" (log2_scale), "m" (src), "m" (dst) //input - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D - ); -} - -//This func reads from 2 slices, 0 & 2 and clears 2-nd -static void store_slice2_mmx(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) -{ - const uint8_t *od = &dither[0][0]; - const uint8_t *end = &dither[height][0]; - width = (width + 7) & ~7; - dst_stride -= width; - - __asm__ volatile( - "mov %5, %%"REG_d" \n\t" - "mov %6, %%"REG_S" \n\t" - "mov %7, %%"REG_D" \n\t" - "mov %1, %%"REG_a" \n\t" - "movd %%"REG_d", %%mm5 \n\t" - "xor $-1, %%"REG_d" \n\t" - "mov %%"REG_a", %%"REG_c" \n\t" - "add $7, %%"REG_d" \n\t" - "sub %0, %%"REG_c" \n\t" - "add %%"REG_c", %%"REG_c" \n\t" - "movd %%"REG_d", %%mm2 \n\t" - "mov %%"REG_c", %1 \n\t" - "mov %2, %%"REG_d" \n\t" - "shl $5, %%"REG_a" \n\t" - - "2: \n\t" - "movq (%%"REG_d"), %%mm3 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "mov %0, %%"REG_c" \n\t" - "psraw %%mm5, %%mm3 \n\t" - "psraw %%mm5, %%mm4 \n\t" - "1: \n\t" - "movq (%%"REG_S"), %%mm0 \n\t" - "movq 8(%%"REG_S"), %%mm1 \n\t" - "paddw %%mm3, %%mm0 \n\t" - - "paddw (%%"REG_S",%%"REG_a"),%%mm0\n\t" - "paddw %%mm4, %%mm1 \n\t" - "movq 8(%%"REG_S",%%"REG_a"),%%mm6\n\t" - - "movq %%mm7, (%%"REG_S",%%"REG_a")\n\t" - "psraw %%mm2, %%mm0 \n\t" - "paddw %%mm6, %%mm1 \n\t" - - "movq %%mm7,8(%%"REG_S",%%"REG_a")\n\t" - "psraw %%mm2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - - "movq %%mm0, (%%"REG_D") \n\t" - "add $16, %%"REG_S" \n\t" - "add $8, %%"REG_D" \n\t" - "sub $8, %%"REG_c" \n\t" - "jg 1b \n\t" - "add %1, %%"REG_S" \n\t" - "add $8, %%"REG_d" \n\t" - "add %3, %%"REG_D" \n\t" - "cmp %4, %%"REG_d" \n\t" - "jl 2b \n\t" - - : - : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end), - "m" (log2_scale), "m" (src), "m" (dst) //input - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S - ); -} - -static void mul_thrmat_mmx(FSPPContext *p, int q) -{ - uint64_t *adr = &p->threshold_mtx_noq[0]; - - __asm__ volatile( - "movd %0, %%mm7 \n\t" - "add $8*8*2, %%"REG_D" \n\t" - "movq 0*8(%%"REG_S"), %%mm0 \n\t" - "punpcklwd %%mm7, %%mm7 \n\t" - "movq 1*8(%%"REG_S"), %%mm1 \n\t" - "punpckldq %%mm7, %%mm7 \n\t" - "pmullw %%mm7, %%mm0 \n\t" - - "movq 2*8(%%"REG_S"), %%mm2 \n\t" - "pmullw %%mm7, %%mm1 \n\t" - - "movq 3*8(%%"REG_S"), %%mm3 \n\t" - "pmullw %%mm7, %%mm2 \n\t" - - "movq %%mm0, 0*8(%%"REG_D") \n\t" - "movq 4*8(%%"REG_S"), %%mm4 \n\t" - "pmullw %%mm7, %%mm3 \n\t" - - "movq %%mm1, 1*8(%%"REG_D") \n\t" - "movq 5*8(%%"REG_S"), %%mm5 \n\t" - "pmullw %%mm7, %%mm4 \n\t" - - "movq %%mm2, 2*8(%%"REG_D") \n\t" - "movq 6*8(%%"REG_S"), %%mm6 \n\t" - "pmullw %%mm7, %%mm5 \n\t" - - "movq %%mm3, 3*8(%%"REG_D") \n\t" - "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t" - "pmullw %%mm7, %%mm6 \n\t" - - "movq %%mm4, 4*8(%%"REG_D") \n\t" - "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t" - "pmullw %%mm7, %%mm0 \n\t" - - "movq %%mm5, 5*8(%%"REG_D") \n\t" - "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t" - "pmullw %%mm7, %%mm1 \n\t" - - "movq %%mm6, 6*8(%%"REG_D") \n\t" - "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t" - "pmullw %%mm7, %%mm2 \n\t" - - "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t" - "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t" - "pmullw %%mm7, %%mm3 \n\t" - - "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t" - "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t" - "pmullw %%mm7, %%mm4 \n\t" - - "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t" - "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t" - "pmullw %%mm7, %%mm5 \n\t" - - "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t" - "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t" - "pmullw %%mm7, %%mm6 \n\t" - - "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t" - "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t" - "pmullw %%mm7, %%mm0 \n\t" - - "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t" - "pmullw %%mm7, %%mm1 \n\t" - - "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t" - "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t" - "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t" - - : "+g" (q), "+S" (adr), "+D" (adr) - : - ); -} - -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433) = FIX64(0.382683433, 14); -DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14); -DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965) = FIX64(1.306562965, 14); - -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14); - -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065) = FIX64(1.847759065, 13); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930) = FIX64(-2.613125930, 13); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562) = FIX64(1.414213562, 13); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200) = FIX64(1.082392200, 13); -//for t3,t5,t7 == 0 shortcut -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065) = FIX64(0.847759065, 14); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497) = FIX64(0.566454497, 14); -DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367) = FIX64(0.198912367, 14); - -DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND) = C64(4); -DECLARE_ASM_CONST(8, uint64_t, MM_2) = C64(2); - -static void column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt) -{ - DECLARE_ALIGNED(8, uint64_t, temps)[4]; - - __asm__ volatile( - - "1: \n\t" - "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" - // - "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" - "movq %%mm1, %%mm0 \n\t" - - "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0 - "movq %%mm7, %%mm3 \n\t" - - "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3 - "movq %%mm1, %%mm5 \n\t" - - "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" - "psubw %%mm7, %%mm1 \n\t" //t13 - - "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" - "movq %%mm6, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1 - "paddw %%mm7, %%mm5 \n\t" //t10 - - "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2 - "movq %%mm6, %%mm7 \n\t" - - "paddw %%mm2, %%mm6 \n\t" //t11 - "psubw %%mm2, %%mm7 \n\t" //t12 - - "movq %%mm5, %%mm2 \n\t" - "paddw %%mm6, %%mm5 \n\t" //d0 - // i0 t13 t12 i3 i1 d0 - d4 - "psubw %%mm6, %%mm2 \n\t" //d4 - "paddw %%mm1, %%mm7 \n\t" - - "movq 4*16(%%"REG_d"), %%mm6 \n\t" - "psllw $2, %%mm7 \n\t" - - "psubw 0*16(%%"REG_d"), %%mm5 \n\t" - "psubw %%mm6, %%mm2 \n\t" - - "paddusw 0*16(%%"REG_d"), %%mm5 \n\t" - "paddusw %%mm6, %%mm2 \n\t" - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t" - // - "paddw 0*16(%%"REG_d"), %%mm5 \n\t" - "paddw %%mm6, %%mm2 \n\t" - - "psubusw 0*16(%%"REG_d"), %%mm5 \n\t" - "psubusw %%mm6, %%mm2 \n\t" - -//This func is totally compute-bound, operates at huge speed. So, DC shortcut -// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). -//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. - "paddw "MANGLE(MM_2)", %%mm5 \n\t" - "movq %%mm2, %%mm6 \n\t" - - "paddw %%mm5, %%mm2 \n\t" - "psubw %%mm6, %%mm5 \n\t" - - "movq %%mm1, %%mm6 \n\t" - "paddw %%mm7, %%mm1 \n\t" //d2 - - "psubw 2*16(%%"REG_d"), %%mm1 \n\t" - "psubw %%mm7, %%mm6 \n\t" //d6 - - "movq 6*16(%%"REG_d"), %%mm7 \n\t" - "psraw $2, %%mm5 \n\t" - - "paddusw 2*16(%%"REG_d"), %%mm1 \n\t" - "psubw %%mm7, %%mm6 \n\t" - // t7 d2 /t11 t4 t6 - d6 /t10 - - "paddw 2*16(%%"REG_d"), %%mm1 \n\t" - "paddusw %%mm7, %%mm6 \n\t" - - "psubusw 2*16(%%"REG_d"), %%mm1 \n\t" - "paddw %%mm7, %%mm6 \n\t" - - "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" - "psubusw %%mm7, %%mm6 \n\t" - - //movq [edi+"DCTSIZE_S"*2*2], mm1 - //movq [edi+"DCTSIZE_S"*6*2], mm6 - "movq %%mm1, %%mm7 \n\t" - "psraw $2, %%mm2 \n\t" - - "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" - "psubw %%mm6, %%mm1 \n\t" - - "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" - "paddw %%mm7, %%mm6 \n\t" //'t13 - - "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- - "movq %%mm2, %%mm7 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" - "paddw %%mm6, %%mm2 \n\t" //'t0 - - "movq %%mm2, 0*8+%3 \n\t" //! - "psubw %%mm6, %%mm7 \n\t" //'t3 - - "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" - "psubw %%mm6, %%mm1 \n\t" //'t12 - - "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5 - "movq %%mm5, %%mm6 \n\t" - - "movq %%mm7, 3*8+%3 \n\t" - "paddw %%mm2, %%mm3 \n\t" //t10 - - "paddw %%mm4, %%mm2 \n\t" //t11 - "paddw %%mm0, %%mm4 \n\t" //t12 - - "movq %%mm3, %%mm7 \n\t" - "psubw %%mm4, %%mm3 \n\t" - - "psllw $2, %%mm3 \n\t" - "psllw $2, %%mm7 \n\t" //opt for P6 - - "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" - "psllw $2, %%mm4 \n\t" - - "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t" - "psllw $2, %%mm2 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" - "paddw %%mm1, %%mm5 \n\t" //'t1 - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t" - "psubw %%mm1, %%mm6 \n\t" //'t2 - // t7 't12 't11 t4 t6 - 't13 't10 --- - - "paddw %%mm3, %%mm7 \n\t" //z2 - - "movq %%mm5, 1*8+%3 \n\t" - "paddw %%mm3, %%mm4 \n\t" //z4 - - "movq 3*16(%%"REG_d"), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - - "movq %%mm6, 2*8+%3 \n\t" - "psubw %%mm2, %%mm1 \n\t" //z13 - -//=== - "paddw %%mm2, %%mm0 \n\t" //z11 - "movq %%mm1, %%mm5 \n\t" - - "movq 5*16(%%"REG_d"), %%mm2 \n\t" - "psubw %%mm7, %%mm1 \n\t" //d3 - - "paddw %%mm7, %%mm5 \n\t" //d5 - "psubw %%mm3, %%mm1 \n\t" - - "movq 1*16(%%"REG_d"), %%mm7 \n\t" - "psubw %%mm2, %%mm5 \n\t" - - "movq %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm0 \n\t" //d1 - - "paddusw %%mm3, %%mm1 \n\t" - "psubw %%mm4, %%mm6 \n\t" //d7 - - // d1 d3 - - - d5 d7 - - "movq 7*16(%%"REG_d"), %%mm4 \n\t" - "psubw %%mm7, %%mm0 \n\t" - - "psubw %%mm4, %%mm6 \n\t" - "paddusw %%mm2, %%mm5 \n\t" - - "paddusw %%mm4, %%mm6 \n\t" - "paddw %%mm3, %%mm1 \n\t" - - "paddw %%mm2, %%mm5 \n\t" - "paddw %%mm4, %%mm6 \n\t" - - "psubusw %%mm3, %%mm1 \n\t" - "psubusw %%mm2, %%mm5 \n\t" - - "psubusw %%mm4, %%mm6 \n\t" - "movq %%mm1, %%mm4 \n\t" - - "por %%mm5, %%mm4 \n\t" - "paddusw %%mm7, %%mm0 \n\t" - - "por %%mm6, %%mm4 \n\t" - "paddw %%mm7, %%mm0 \n\t" - - "packssdw %%mm4, %%mm4 \n\t" - "psubusw %%mm7, %%mm0 \n\t" - - "movd %%mm4, %%"REG_a" \n\t" - "or %%"REG_a", %%"REG_a" \n\t" - "jnz 2f \n\t" - //movq [edi+"DCTSIZE_S"*3*2], mm1 - //movq [edi+"DCTSIZE_S"*5*2], mm5 - //movq [edi+"DCTSIZE_S"*1*2], mm0 - //movq [edi+"DCTSIZE_S"*7*2], mm6 - // t4 t5 - - - t6 t7 - - //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 -//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile - "movq 0*8+%3, %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - - "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 - "movq %%mm1, %%mm2 \n\t" - - "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t" - "movq %%mm2, %%mm3 \n\t" - - "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 - "paddw %%mm4, %%mm5 \n\t" - - "movq 1*8+%3, %%mm6 \n\t" - //paddw mm3, MM_2 - "psraw $2, %%mm3 \n\t" //tmp7 - - "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 - "psubw %%mm3, %%mm4 \n\t" - - "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t" - "paddw %%mm3, %%mm5 \n\t" - - "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" - "paddw %%mm6, %%mm7 \n\t" - - "movq 2*8+%3, %%mm3 \n\t" - "psubw %%mm0, %%mm6 \n\t" - - "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t" - "paddw %%mm0, %%mm7 \n\t" - - "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" - "paddw %%mm3, %%mm4 \n\t" - - "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" - "psubw %%mm1, %%mm3 \n\t" - - "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t" - "paddw %%mm1, %%mm4 \n\t" - - "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t" - "paddw %%mm3, %%mm5 \n\t" - - "movq 3*8+%3, %%mm0 \n\t" - "add $8, %%"REG_S" \n\t" - - "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" - "paddw %%mm0, %%mm6 \n\t" - - "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" - "psubw %%mm2, %%mm0 \n\t" - - "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" - "paddw %%mm0, %%mm7 \n\t" - - "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" - - "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" - "add $8, %%"REG_D" \n\t" - "jmp 4f \n\t" - - "2: \n\t" - //--- non DC2 - //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) - //psraw mm5, 2 - //psraw mm0, 2 - //psraw mm6, 2 - "movq %%mm5, %%mm3 \n\t" - "psubw %%mm1, %%mm5 \n\t" - - "psllw $1, %%mm5 \n\t" //'z10 - "paddw %%mm1, %%mm3 \n\t" //'z13 - - "movq %%mm0, %%mm2 \n\t" - "psubw %%mm6, %%mm0 \n\t" - - "movq %%mm5, %%mm1 \n\t" - "psllw $1, %%mm0 \n\t" //'z12 - - "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- - "paddw %%mm0, %%mm5 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 - "paddw %%mm6, %%mm2 \n\t" //'z11 - - "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" - "movq %%mm2, %%mm7 \n\t" - - //--- - "movq 0*8+%3, %%mm4 \n\t" - "psubw %%mm3, %%mm2 \n\t" - - "psllw $1, %%mm2 \n\t" - "paddw %%mm3, %%mm7 \n\t" //'t7 - - "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 - "movq %%mm4, %%mm6 \n\t" - //paddw mm7, MM_2 - "psraw $2, %%mm7 \n\t" - - "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4\n\t" - "psubw %%mm7, %%mm6 \n\t" - - "movq 1*8+%3, %%mm3 \n\t" - "paddw %%mm7, %%mm4 \n\t" - - "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" - "paddw %%mm5, %%mm1 \n\t" //'t12 - - "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" - "psubw %%mm7, %%mm1 \n\t" //'t6 - - "movq 2*8+%3, %%mm7 \n\t" - "psubw %%mm5, %%mm0 \n\t" //'t10 - - "movq 3*8+%3, %%mm6 \n\t" - "movq %%mm3, %%mm5 \n\t" - - "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3\n\t" - "psubw %%mm1, %%mm5 \n\t" - - "psubw %%mm1, %%mm2 \n\t" //'t5 - "paddw %%mm1, %%mm3 \n\t" - - "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" - "movq %%mm7, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7\n\t" - "psubw %%mm2, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4\n\t" - "paddw %%mm2, %%mm7 \n\t" - - "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" - "paddw %%mm2, %%mm0 \n\t" //'t4 - - // 't4 't6 't5 - - - - 't7 - "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" - "movq %%mm6, %%mm1 \n\t" - - "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6\n\t" - "psubw %%mm0, %%mm1 \n\t" - - "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1\n\t" - "paddw %%mm0, %%mm6 \n\t" - - "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" - "add $8, %%"REG_S" \n\t" - - "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" - - "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" - "add $8, %%"REG_D" \n\t" - - "4: \n\t" - "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" - // - "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" - "movq %%mm1, %%mm0 \n\t" - - "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1\n\t" //t0 - "movq %%mm7, %%mm3 \n\t" - - "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7\n\t" //t3 - "movq %%mm1, %%mm5 \n\t" - - "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" - "psubw %%mm7, %%mm1 \n\t" //t13 - - "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" - "movq %%mm6, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6\n\t" //t1 - "paddw %%mm7, %%mm5 \n\t" //t10 - - "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t2 - "movq %%mm6, %%mm7 \n\t" - - "paddw %%mm2, %%mm6 \n\t" //t11 - "psubw %%mm2, %%mm7 \n\t" //t12 - - "movq %%mm5, %%mm2 \n\t" - "paddw %%mm6, %%mm5 \n\t" //d0 - // i0 t13 t12 i3 i1 d0 - d4 - "psubw %%mm6, %%mm2 \n\t" //d4 - "paddw %%mm1, %%mm7 \n\t" - - "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t" - "psllw $2, %%mm7 \n\t" - - "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" - "psubw %%mm6, %%mm2 \n\t" - - "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" - "paddusw %%mm6, %%mm2 \n\t" - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t" - // - "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" - "paddw %%mm6, %%mm2 \n\t" - - "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" - "psubusw %%mm6, %%mm2 \n\t" - -//This func is totally compute-bound, operates at huge speed. So, DC shortcut -// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). -//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. - "paddw "MANGLE(MM_2)", %%mm5 \n\t" - "movq %%mm2, %%mm6 \n\t" - - "paddw %%mm5, %%mm2 \n\t" - "psubw %%mm6, %%mm5 \n\t" - - "movq %%mm1, %%mm6 \n\t" - "paddw %%mm7, %%mm1 \n\t" //d2 - - "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" - "psubw %%mm7, %%mm6 \n\t" //d6 - - "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t" - "psraw $2, %%mm5 \n\t" - - "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" - "psubw %%mm7, %%mm6 \n\t" - // t7 d2 /t11 t4 t6 - d6 /t10 - - "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" - "paddusw %%mm7, %%mm6 \n\t" - - "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" - "paddw %%mm7, %%mm6 \n\t" - - "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3\n\t" - "psubusw %%mm7, %%mm6 \n\t" - - //movq [edi+"DCTSIZE_S"*2*2], mm1 - //movq [edi+"DCTSIZE_S"*6*2], mm6 - "movq %%mm1, %%mm7 \n\t" - "psraw $2, %%mm2 \n\t" - - "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4\n\t" - "psubw %%mm6, %%mm1 \n\t" - - "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0\n\t" - "paddw %%mm7, %%mm6 \n\t" //'t13 - - "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- - "movq %%mm2, %%mm7 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" - "paddw %%mm6, %%mm2 \n\t" //'t0 - - "movq %%mm2, 0*8+%3 \n\t" //! - "psubw %%mm6, %%mm7 \n\t" //'t3 - - "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" - "psubw %%mm6, %%mm1 \n\t" //'t12 - - "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t5 - "movq %%mm5, %%mm6 \n\t" - - "movq %%mm7, 3*8+%3 \n\t" - "paddw %%mm2, %%mm3 \n\t" //t10 - - "paddw %%mm4, %%mm2 \n\t" //t11 - "paddw %%mm0, %%mm4 \n\t" //t12 - - "movq %%mm3, %%mm7 \n\t" - "psubw %%mm4, %%mm3 \n\t" - - "psllw $2, %%mm3 \n\t" - "psllw $2, %%mm7 \n\t" //opt for P6 - - "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" - "psllw $2, %%mm4 \n\t" - - "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t" - "psllw $2, %%mm2 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" - "paddw %%mm1, %%mm5 \n\t" //'t1 - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t" - "psubw %%mm1, %%mm6 \n\t" //'t2 - // t7 't12 't11 t4 t6 - 't13 't10 --- - - "paddw %%mm3, %%mm7 \n\t" //z2 - - "movq %%mm5, 1*8+%3 \n\t" - "paddw %%mm3, %%mm4 \n\t" //z4 - - "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - - "movq %%mm6, 2*8+%3 \n\t" - "psubw %%mm2, %%mm1 \n\t" //z13 - -//=== - "paddw %%mm2, %%mm0 \n\t" //z11 - "movq %%mm1, %%mm5 \n\t" - - "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t" - "psubw %%mm7, %%mm1 \n\t" //d3 - - "paddw %%mm7, %%mm5 \n\t" //d5 - "psubw %%mm3, %%mm1 \n\t" - - "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t" - "psubw %%mm2, %%mm5 \n\t" - - "movq %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm0 \n\t" //d1 - - "paddusw %%mm3, %%mm1 \n\t" - "psubw %%mm4, %%mm6 \n\t" //d7 - - // d1 d3 - - - d5 d7 - - "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t" - "psubw %%mm7, %%mm0 \n\t" - - "psubw %%mm4, %%mm6 \n\t" - "paddusw %%mm2, %%mm5 \n\t" - - "paddusw %%mm4, %%mm6 \n\t" - "paddw %%mm3, %%mm1 \n\t" - - "paddw %%mm2, %%mm5 \n\t" - "paddw %%mm4, %%mm6 \n\t" - - "psubusw %%mm3, %%mm1 \n\t" - "psubusw %%mm2, %%mm5 \n\t" - - "psubusw %%mm4, %%mm6 \n\t" - "movq %%mm1, %%mm4 \n\t" - - "por %%mm5, %%mm4 \n\t" - "paddusw %%mm7, %%mm0 \n\t" - - "por %%mm6, %%mm4 \n\t" - "paddw %%mm7, %%mm0 \n\t" - - "packssdw %%mm4, %%mm4 \n\t" - "psubusw %%mm7, %%mm0 \n\t" - - "movd %%mm4, %%"REG_a" \n\t" - "or %%"REG_a", %%"REG_a" \n\t" - "jnz 3f \n\t" - //movq [edi+"DCTSIZE_S"*3*2], mm1 - //movq [edi+"DCTSIZE_S"*5*2], mm5 - //movq [edi+"DCTSIZE_S"*1*2], mm0 - //movq [edi+"DCTSIZE_S"*7*2], mm6 - // t4 t5 - - - t6 t7 - - //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 -//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile - "movq 0*8+%3, %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - - "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 - "movq %%mm1, %%mm2 \n\t" - - "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5\n\t" - "movq %%mm2, %%mm3 \n\t" - - "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 - "paddw %%mm4, %%mm5 \n\t" - - "movq 1*8+%3, %%mm6 \n\t" - //paddw mm3, MM_2 - "psraw $2, %%mm3 \n\t" //tmp7 - - "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 - "psubw %%mm3, %%mm4 \n\t" - - "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7\n\t" - "paddw %%mm3, %%mm5 \n\t" - - "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D")\n\t" - "paddw %%mm6, %%mm7 \n\t" - - "movq 2*8+%3, %%mm3 \n\t" - "psubw %%mm0, %%mm6 \n\t" - - "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4\n\t" - "paddw %%mm0, %%mm7 \n\t" - - "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D")\n\t" - "paddw %%mm3, %%mm4 \n\t" - - "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D")\n\t" - "psubw %%mm1, %%mm3 \n\t" - - "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5\n\t" - "paddw %%mm1, %%mm4 \n\t" - - "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6\n\t" - "paddw %%mm3, %%mm5 \n\t" - - "movq 3*8+%3, %%mm0 \n\t" - "add $24, %%"REG_S" \n\t" - - "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D")\n\t" - "paddw %%mm0, %%mm6 \n\t" - - "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D")\n\t" - "psubw %%mm2, %%mm0 \n\t" - - "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7\n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D")\n\t" - "paddw %%mm0, %%mm7 \n\t" - - "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D")\n\t" - - "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D")\n\t" - "add $24, %%"REG_D" \n\t" - "sub $2, %%"REG_c" \n\t" - "jnz 1b \n\t" - "jmp 5f \n\t" - - "3: \n\t" - //--- non DC2 - //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) - //psraw mm5, 2 - //psraw mm0, 2 - //psraw mm6, 2 - "movq %%mm5, %%mm3 \n\t" - "psubw %%mm1, %%mm5 \n\t" - - "psllw $1, %%mm5 \n\t" //'z10 - "paddw %%mm1, %%mm3 \n\t" //'z13 - - "movq %%mm0, %%mm2 \n\t" - "psubw %%mm6, %%mm0 \n\t" - - "movq %%mm5, %%mm1 \n\t" - "psllw $1, %%mm0 \n\t" //'z12 - - "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- - "paddw %%mm0, %%mm5 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 - "paddw %%mm6, %%mm2 \n\t" //'z11 - - "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" - "movq %%mm2, %%mm7 \n\t" - - //--- - "movq 0*8+%3, %%mm4 \n\t" - "psubw %%mm3, %%mm2 \n\t" - - "psllw $1, %%mm2 \n\t" - "paddw %%mm3, %%mm7 \n\t" //'t7 - - "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 - "movq %%mm4, %%mm6 \n\t" - //paddw mm7, MM_2 - "psraw $2, %%mm7 \n\t" - - "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t" - "psubw %%mm7, %%mm6 \n\t" - - "movq 1*8+%3, %%mm3 \n\t" - "paddw %%mm7, %%mm4 \n\t" - - "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" - "paddw %%mm5, %%mm1 \n\t" //'t12 - - "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" - "psubw %%mm7, %%mm1 \n\t" //'t6 - - "movq 2*8+%3, %%mm7 \n\t" - "psubw %%mm5, %%mm0 \n\t" //'t10 - - "movq 3*8+%3, %%mm6 \n\t" - "movq %%mm3, %%mm5 \n\t" - - "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t" - "psubw %%mm1, %%mm5 \n\t" - - "psubw %%mm1, %%mm2 \n\t" //'t5 - "paddw %%mm1, %%mm3 \n\t" - - "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" - "movq %%mm7, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t" - "psubw %%mm2, %%mm4 \n\t" - - "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t" - "paddw %%mm2, %%mm7 \n\t" - - "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" - "paddw %%mm2, %%mm0 \n\t" //'t4 - - // 't4 't6 't5 - - - - 't7 - "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" - "movq %%mm6, %%mm1 \n\t" - - "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t" - "psubw %%mm0, %%mm1 \n\t" - - "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - - "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" - "add $24, %%"REG_S" \n\t" - - "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" - - "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" - "add $24, %%"REG_D" \n\t" - "sub $2, %%"REG_c" \n\t" - "jnz 1b \n\t" - "5: \n\t" - - : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps) - : "d"(thr_adr) - NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, MM_2,MM_FIX_1_414213562_A, MM_FIX_1_414213562, MM_FIX_0_382683433, - ff_MM_FIX_0_541196100, MM_FIX_1_306562965, MM_FIX_0_847759065) - NAMED_CONSTRAINTS_ADD(MM_FIX_0_566454497, MM_FIX_0_198912367, MM_FIX_2_613125930, MM_FIX_1_847759065, - MM_FIX_1_082392200) - : "%"REG_a - ); -} - -static void row_idct_mmx (int16_t *workspace, int16_t *output_adr, int output_stride, int cnt) -{ - DECLARE_ALIGNED(8, uint64_t, temps)[4]; - - __asm__ volatile( - "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" - "1: \n\t" - "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t" - // - - "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t" - "movq %%mm0, %%mm4 \n\t" - - "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" - "punpcklwd %%mm1, %%mm0 \n\t" - - "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t" - "punpckhwd %%mm1, %%mm4 \n\t" - - //transpose 4x4 - "movq %%mm2, %%mm7 \n\t" - "punpcklwd %%mm3, %%mm2 \n\t" - - "movq %%mm0, %%mm6 \n\t" - "punpckldq %%mm2, %%mm0 \n\t" //0 - - "punpckhdq %%mm2, %%mm6 \n\t" //1 - "movq %%mm0, %%mm5 \n\t" - - "punpckhwd %%mm3, %%mm7 \n\t" - "psubw %%mm6, %%mm0 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t" - "movq %%mm4, %%mm2 \n\t" - - "punpckldq %%mm7, %%mm4 \n\t" //2 - "paddw %%mm6, %%mm5 \n\t" - - "punpckhdq %%mm7, %%mm2 \n\t" //3 - "movq %%mm4, %%mm1 \n\t" - - "psllw $2, %%mm0 \n\t" - "paddw %%mm2, %%mm4 \n\t" //t10 - - "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t" - "psubw %%mm2, %%mm1 \n\t" //t11 - - "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t" - "psubw %%mm5, %%mm0 \n\t" - - "movq %%mm4, %%mm6 \n\t" - "paddw %%mm5, %%mm4 \n\t" //t0 - - "psubw %%mm5, %%mm6 \n\t" //t3 - "movq %%mm1, %%mm7 \n\t" - - "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t" - "paddw %%mm0, %%mm1 \n\t" //t1 - - "movq %%mm4, 0*8+%3 \n\t" //t0 - "movq %%mm3, %%mm4 \n\t" - - "movq %%mm6, 1*8+%3 \n\t" //t3 - "punpcklwd %%mm2, %%mm3 \n\t" - - //transpose 4x4 - "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t" - "punpckhwd %%mm2, %%mm4 \n\t" - - "movq %%mm5, %%mm2 \n\t" - "punpcklwd %%mm6, %%mm5 \n\t" - - "psubw %%mm0, %%mm7 \n\t" //t2 - "punpckhwd %%mm6, %%mm2 \n\t" - - "movq %%mm3, %%mm0 \n\t" - "punpckldq %%mm5, %%mm3 \n\t" //4 - - "punpckhdq %%mm5, %%mm0 \n\t" //5 - "movq %%mm4, %%mm5 \n\t" - - // - "movq %%mm3, %%mm6 \n\t" - "punpckldq %%mm2, %%mm4 \n\t" //6 - - "psubw %%mm0, %%mm3 \n\t" //z10 - "punpckhdq %%mm2, %%mm5 \n\t" //7 - - "paddw %%mm0, %%mm6 \n\t" //z13 - "movq %%mm4, %%mm2 \n\t" - - "movq %%mm3, %%mm0 \n\t" - "psubw %%mm5, %%mm4 \n\t" //z12 - - "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0\n\t" //- - "paddw %%mm4, %%mm3 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3\n\t" //z5 - "paddw %%mm5, %%mm2 \n\t" //z11 > - - "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4\n\t" - "movq %%mm2, %%mm5 \n\t" - - "psubw %%mm6, %%mm2 \n\t" - "paddw %%mm6, %%mm5 \n\t" //t7 - - "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2\n\t" //t11 - "paddw %%mm3, %%mm0 \n\t" //t12 - - "psllw $3, %%mm0 \n\t" - "psubw %%mm3, %%mm4 \n\t" //t10 - - "movq 0*8+%3, %%mm6 \n\t" - "movq %%mm1, %%mm3 \n\t" - - "psllw $3, %%mm4 \n\t" - "psubw %%mm5, %%mm0 \n\t" //t6 - - "psllw $3, %%mm2 \n\t" - "paddw %%mm0, %%mm1 \n\t" //d1 - - "psubw %%mm0, %%mm2 \n\t" //t5 - "psubw %%mm0, %%mm3 \n\t" //d6 - - "paddw %%mm2, %%mm4 \n\t" //t4 - "movq %%mm7, %%mm0 \n\t" - - "paddw %%mm2, %%mm7 \n\t" //d2 - "psubw %%mm2, %%mm0 \n\t" //d5 - - "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4 - "psubw %%mm5, %%mm6 \n\t" //d7 - - "paddw 0*8+%3, %%mm5 \n\t" //d0 - "paddw %%mm2, %%mm1 \n\t" - - "paddw %%mm2, %%mm5 \n\t" - "psraw $3, %%mm1 \n\t" - - "paddw %%mm2, %%mm7 \n\t" - "psraw $3, %%mm5 \n\t" - - "paddw (%%"REG_D"), %%mm5 \n\t" - "psraw $3, %%mm7 \n\t" - - "paddw (%%"REG_D",%%"REG_a"), %%mm1 \n\t" - "paddw %%mm2, %%mm0 \n\t" - - "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t" - "paddw %%mm2, %%mm3 \n\t" - - "movq %%mm5, (%%"REG_D") \n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq %%mm1, (%%"REG_D",%%"REG_a") \n\t" - "psraw $3, %%mm0 \n\t" - - "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t" - "add %%"REG_d", %%"REG_D" \n\t" //3*ls - - "movq 1*8+%3, %%mm5 \n\t" //t3 - "psraw $3, %%mm3 \n\t" - - "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t" - "psubw %%mm4, %%mm5 \n\t" //d3 - - "paddw (%%"REG_D",%%"REG_d"), %%mm3 \n\t" - "psraw $3, %%mm6 \n\t" - - "paddw 1*8+%3, %%mm4 \n\t" //d4 - "paddw %%mm2, %%mm5 \n\t" - - "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t" - "paddw %%mm2, %%mm4 \n\t" - - "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t" - "psraw $3, %%mm5 \n\t" - - "paddw (%%"REG_D"), %%mm5 \n\t" - "psraw $3, %%mm4 \n\t" - - "paddw (%%"REG_D",%%"REG_a"), %%mm4 \n\t" - "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows - - "movq %%mm3, (%%"REG_D",%%"REG_d") \n\t" - "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t" - "movq %%mm5, (%%"REG_D") \n\t" - "movq %%mm4, (%%"REG_D",%%"REG_a") \n\t" - - "sub %%"REG_d", %%"REG_D" \n\t" - "add $8, %%"REG_D" \n\t" - "dec %%"REG_c" \n\t" - "jnz 1b \n\t" - - : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps) - : "a"(output_stride * sizeof(short)) - NAMED_CONSTRAINTS_ADD(MM_FIX_1_414213562_A, MM_FIX_2_613125930, MM_FIX_1_847759065, MM_FIX_1_082392200, - MM_FIX_1_414213562,MM_DESCALE_RND) - : "%"REG_d - ); -} - -static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt) -{ - DECLARE_ALIGNED(8, uint64_t, temps)[4]; - - __asm__ volatile( - "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" - "6: \n\t" - "movd (%%"REG_S"), %%mm0 \n\t" - "pxor %%mm7, %%mm7 \n\t" - - "movd (%%"REG_S",%%"REG_a"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - - "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - - "punpcklbw %%mm7, %%mm2 \n\t" - "add %%"REG_d", %%"REG_S" \n\t" - - "movq %%mm0, %%mm5 \n\t" - // - - "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch! - "movq %%mm1, %%mm6 \n\t" - - "movd (%%"REG_S",%%"REG_d"), %%mm4 \n\t" //6 - "punpcklbw %%mm7, %%mm3 \n\t" - - "psubw %%mm3, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - - "paddw %%mm3, %%mm0 \n\t" - "psubw %%mm4, %%mm6 \n\t" - - "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5 - "paddw %%mm4, %%mm1 \n\t" - - "movq %%mm5, %3 \n\t" //t7 - "punpcklbw %%mm7, %%mm3 \n\t" - - "movq %%mm6, %4 \n\t" //t6 - "movq %%mm2, %%mm4 \n\t" - - "movd (%%"REG_S"), %%mm5 \n\t" //3 - "paddw %%mm3, %%mm2 \n\t" - - "movd (%%"REG_S",%%"REG_a"), %%mm6 \n\t" //4 - "punpcklbw %%mm7, %%mm5 \n\t" - - "psubw %%mm3, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - - "movq %%mm5, %%mm3 \n\t" - "paddw %%mm6, %%mm5 \n\t" //t3 - - "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - - - "movq %%mm0, %%mm6 \n\t" - - "movq %%mm1, %%mm7 \n\t" - "psubw %%mm5, %%mm0 \n\t" //t13 - - "psubw %%mm2, %%mm1 \n\t" - "paddw %%mm2, %%mm7 \n\t" //t11 - - "paddw %%mm0, %%mm1 \n\t" - "movq %%mm7, %%mm2 \n\t" - - "psllw $2, %%mm1 \n\t" - "paddw %%mm5, %%mm6 \n\t" //t10 - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t" - "paddw %%mm6, %%mm7 \n\t" //d2 - - "psubw %%mm2, %%mm6 \n\t" //d3 - "movq %%mm0, %%mm5 \n\t" - - //transpose 4x4 - "movq %%mm7, %%mm2 \n\t" - "punpcklwd %%mm6, %%mm7 \n\t" - - "paddw %%mm1, %%mm0 \n\t" //d0 - "punpckhwd %%mm6, %%mm2 \n\t" - - "psubw %%mm1, %%mm5 \n\t" //d1 - "movq %%mm0, %%mm6 \n\t" - - "movq %4, %%mm1 \n\t" - "punpcklwd %%mm5, %%mm0 \n\t" - - "punpckhwd %%mm5, %%mm6 \n\t" - "movq %%mm0, %%mm5 \n\t" - - "punpckldq %%mm7, %%mm0 \n\t" //0 - "paddw %%mm4, %%mm3 \n\t" - - "punpckhdq %%mm7, %%mm5 \n\t" //1 - "movq %%mm6, %%mm7 \n\t" - - "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" - "punpckldq %%mm2, %%mm6 \n\t" //2 - - "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" - "punpckhdq %%mm2, %%mm7 \n\t" //3 - - "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" - "paddw %%mm1, %%mm4 \n\t" - - "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" - "psllw $2, %%mm3 \n\t" //t10 - - "movq %3, %%mm2 \n\t" - "psllw $2, %%mm4 \n\t" //t11 - - "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3 - "paddw %%mm2, %%mm1 \n\t" - - "psllw $2, %%mm1 \n\t" //t12 - "movq %%mm3, %%mm0 \n\t" - - "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t" - "psubw %%mm1, %%mm3 \n\t" - - "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5 - "movq %%mm2, %%mm5 \n\t" - - "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t" - "psubw %%mm4, %%mm2 \n\t" //z13 - - "paddw %%mm4, %%mm5 \n\t" //z11 - "movq %%mm2, %%mm6 \n\t" - - "paddw %%mm3, %%mm0 \n\t" //z2 - "movq %%mm5, %%mm7 \n\t" - - "paddw %%mm0, %%mm2 \n\t" //d4 - "psubw %%mm0, %%mm6 \n\t" //d5 - - "movq %%mm2, %%mm4 \n\t" - "paddw %%mm3, %%mm1 \n\t" //z4 - - //transpose 4x4 - "punpcklwd %%mm6, %%mm2 \n\t" - "paddw %%mm1, %%mm5 \n\t" //d6 - - "punpckhwd %%mm6, %%mm4 \n\t" - "psubw %%mm1, %%mm7 \n\t" //d7 - - "movq %%mm5, %%mm6 \n\t" - "punpcklwd %%mm7, %%mm5 \n\t" - - "punpckhwd %%mm7, %%mm6 \n\t" - "movq %%mm2, %%mm7 \n\t" - - "punpckldq %%mm5, %%mm2 \n\t" //4 - "sub %%"REG_d", %%"REG_S" \n\t" - - "punpckhdq %%mm5, %%mm7 \n\t" //5 - "movq %%mm4, %%mm5 \n\t" - - "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t" - "punpckldq %%mm6, %%mm4 \n\t" //6 - - "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t" - "punpckhdq %%mm6, %%mm5 \n\t" //7 - - "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t" - "add $4, %%"REG_S" \n\t" - - "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t" - "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows - "dec %%"REG_c" \n\t" - "jnz 6b \n\t" - - : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps), "=o"(temps[1]) - : "a"(line_size) - NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, ff_MM_FIX_0_541196100, MM_FIX_0_382683433, MM_FIX_1_306562965) - : "%"REG_d); -} -#endif - -av_cold void ff_fspp_init_x86(FSPPContext *s) -{ -#if HAVE_MMX_INLINE - int cpu_flags = av_get_cpu_flags(); - - if (HAVE_MMX_INLINE && cpu_flags & AV_CPU_FLAG_MMX) { - s->store_slice = store_slice_mmx; - s->store_slice2 = store_slice2_mmx; - s->mul_thrmat = mul_thrmat_mmx; - s->column_fidct = column_fidct_mmx; - s->row_idct = row_idct_mmx; - s->row_fdct = row_fdct_mmx; - } -#endif -} diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c new file mode 100644 index 0000000000..8e00317cb7 --- /dev/null +++ b/libavfilter/x86/vf_fspp_init.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2003 Michael Niedermayer + * Copyright (C) 2005 Nikolaj Poroshin + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_fspp.h" + +void ff_store_slice_mmx(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); + +av_cold void ff_fspp_init_x86(FSPPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + s->store_slice = ff_store_slice_mmx; + s->store_slice2 = ff_store_slice2_mmx; + s->mul_thrmat = ff_mul_thrmat_mmx; + s->column_fidct = ff_column_fidct_mmx; + s->row_idct = ff_row_idct_mmx; + s->row_fdct = ff_row_fdct_mmx; + } +}