From 5024a82e9548d186224b3be4de4041dbd1c2a482 Mon Sep 17 00:00:00 2001 From: Thomas Mundt Date: Sun, 13 Mar 2016 10:06:21 +0100 Subject: [PATCH] avfilter/vf_bwdif: add x86 SIMD Signed-off-by: Thomas Mundt --- libavfilter/bwdif.h | 72 +++++++++ libavfilter/vf_bwdif.c | 69 ++------- libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_bwdif.asm | 266 ++++++++++++++++++++++++++++++++ libavfilter/x86/vf_bwdif_init.c | 78 ++++++++++ 5 files changed, 432 insertions(+), 55 deletions(-) create mode 100644 libavfilter/bwdif.h create mode 100644 libavfilter/x86/vf_bwdif.asm create mode 100644 libavfilter/x86/vf_bwdif_init.c diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h new file mode 100644 index 0000000000..8b42c760a0 --- /dev/null +++ b/libavfilter/bwdif.h @@ -0,0 +1,72 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_BWDIF_H +#define AVFILTER_BWDIF_H + +#include "libavutil/pixdesc.h" +#include "avfilter.h" + +enum BWDIFMode { + BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame + BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field +}; + +enum BWDIFParity { + BWDIF_PARITY_TFF = 0, ///< top field first + BWDIF_PARITY_BFF = 1, ///< bottom field first + BWDIF_PARITY_AUTO = -1, ///< auto detection +}; + +enum BWDIFDeint { + BWDIF_DEINT_ALL = 0, ///< deinterlace all frames + BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced +}; +/** Filter state of bwdif. It lives in this header because both vf_bwdif.c and x86/vf_bwdif_init.c include it. */ +typedef struct BWDIFContext { + const AVClass *class; + + int mode; ///< BWDIFMode + int parity; ///< BWDIFParity + int deint; ///< BWDIFDeint + + int frame_pending; + + AVFrame *cur; + AVFrame *next; + AVFrame *prev; + AVFrame *out; + /* Kernel hooks. config_props() in vf_bwdif.c assigns the 8-bit C versions or, when comp[0].depth is above 8, the _16bit C versions; ff_bwdif_init_x86() may afterwards replace filter_line only. */ + void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + void (*filter_line)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + void (*filter_edge)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); + /* csp is assigned from av_pix_fmt_desc_get(link->format) in config_props(); ff_bwdif_init_x86() reads csp->comp[0].depth and treats a NULL csp as 8-bit. */ + const AVPixFmtDescriptor *csp; + int inter_field; + int eof; +} BWDIFContext; +/** Install x86 SIMD filter_line implementations where the CPU supports them; called at the end of config_props() when ARCH_X86 is set. */ +void ff_bwdif_init_x86(BWDIFContext *bwdif); + +#endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 79850545a0..d402aa4f8b 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -37,6 +37,7 @@ #include "formats.h" #include "internal.h" #include "video.h" +#include "bwdif.h" /* * Filter coefficients coef_lf and coef_hf 
taken from BBC PH-2071 (Weston 3 Field Deinterlacer). @@ -48,51 +49,6 @@ static const uint16_t coef_lf[2] = { 4309, 213 }; static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; static const uint16_t coef_sp[2] = { 5077, 981 }; -enum BWDIFMode { - BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame - BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field -}; - -enum BWDIFParity { - BWDIF_PARITY_TFF = 0, ///< top field first - BWDIF_PARITY_BFF = 1, ///< bottom field first - BWDIF_PARITY_AUTO = -1, ///< auto detection -}; - -enum BWDIFDeint { - BWDIF_DEINT_ALL = 0, ///< deinterlace all frames - BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced -}; - -typedef struct BWDIFContext { - const AVClass *class; - - int mode; ///< BWDIFMode - int parity; ///< BWDIFParity - int deint; ///< BWDIFDeint - - int frame_pending; - - AVFrame *cur; - AVFrame *next; - AVFrame *prev; - AVFrame *out; - - void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max); - void (*filter_line)(void *dst, void *prev, void *cur, void *next, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max); - void (*filter_edge)(void *dst, void *prev, void *cur, void *next, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat); - - const AVPixFmtDescriptor *csp; - int inter_field; - int eof; -} BWDIFContext; - typedef struct ThreadData { AVFrame *frame; int plane; @@ -177,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, FILTER_INTRA() } -static void filter_line(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) +static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int 
mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) { uint8_t *dst = dst1; uint8_t *prev = prev1; @@ -222,10 +178,10 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre FILTER_INTRA() } -static void filter_line_16bit(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) +static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) { uint16_t *dst = dst1; uint16_t *prev = prev1; @@ -557,14 +513,17 @@ static int config_props(AVFilterLink *link) s->csp = av_pix_fmt_desc_get(link->format); if (s->csp->comp[0].depth > 8) { s->filter_intra = filter_intra_16bit; - s->filter_line = filter_line_16bit; + s->filter_line = filter_line_c_16bit; s->filter_edge = filter_edge_16bit; } else { s->filter_intra = filter_intra; - s->filter_line = filter_line; + s->filter_line = filter_line_c; s->filter_edge = filter_edge; } + if (ARCH_X86) + ff_bwdif_init_x86(s); + return 0; } diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 33de380bc0..ed294e0f92 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,4 +1,5 @@ OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o +OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o @@ -21,6 +22,7 @@ OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o +YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o 
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm new file mode 100644 index 0000000000..11aa0252b2 --- /dev/null +++ b/libavfilter/x86/vf_bwdif.asm @@ -0,0 +1,266 @@ +;***************************************************************************** +;* x86-optimized functions for bwdif filter +;* +;* Copyright (C) 2016 Thomas Mundt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_coefhf: times 4 dw 1016, 5570 +pw_coefhf1: times 8 dw -3801 +pw_coefsp: times 4 dw 5077, -981 +pw_splfdif: times 4 dw -768, 768 + +SECTION .text + +%macro LOAD8 2 + movh %1, %2 + punpcklbw %1, m7 +%endmacro + +%macro LOAD12 2 + movu %1, %2 +%endmacro + +%macro DISP8 0 + packuswb m2, m2 + movh [dstq], m2 +%endmacro + +%macro DISP12 0 + CLIPW m2, m7, m12 + movu [dstq], m2 +%endmacro + +%macro FILTER 5 + pxor m7, m7 +.loop%1: + LOAD%4 m0, [curq+t0*%5] + LOAD%4 m1, [curq+t1*%5] + LOAD%4 m2, [%2] + LOAD%4 m3, [%3] + mova m4, m3 + paddw m3, m2 + psubw m2, m4 + ABS1 m2, m4 + mova m8, m3 + mova m9, m2 + LOAD%4 m3, [prevq+t0*%5] + LOAD%4 m4, [prevq+t1*%5] + psubw m3, m0 + 
psubw m4, m1 + ABS2 m3, m4, m5, m6 + paddw m3, m4 + psrlw m2, 1 + psrlw m3, 1 + pmaxsw m2, m3 + LOAD%4 m3, [nextq+t0*%5] + LOAD%4 m4, [nextq+t1*%5] + psubw m3, m0 + psubw m4, m1 + ABS2 m3, m4, m5, m6 + paddw m3, m4 + psrlw m3, 1 + pmaxsw m2, m3 + + LOAD%4 m3, [%2+t0*2*%5] + LOAD%4 m4, [%3+t0*2*%5] + LOAD%4 m5, [%2+t1*2*%5] + LOAD%4 m6, [%3+t1*2*%5] + paddw m3, m4 + paddw m5, m6 + mova m6, m3 + paddw m6, m5 + mova m10, m6 + psrlw m3, 1 + psrlw m5, 1 + psubw m3, m0 + psubw m5, m1 + mova m6, m3 + pminsw m3, m5 + pmaxsw m5, m6 + mova m4, m8 + psraw m4, 1 + mova m6, m4 + psubw m6, m0 + psubw m4, m1 + pmaxsw m3, m6 + pminsw m5, m6 + pmaxsw m3, m4 + pminsw m5, m4 + mova m6, m7 + psubw m6, m3 + pmaxsw m6, m5 + mova m3, m2 + pcmpgtw m3, m7 + pand m6, m3 + pmaxsw m2, m6 + mova m11, m2 + + LOAD%4 m2, [%2+t0*4*%5] + LOAD%4 m3, [%3+t0*4*%5] + LOAD%4 m4, [%2+t1*4*%5] + LOAD%4 m5, [%3+t1*4*%5] + paddw m2, m3 + paddw m4, m5 + paddw m2, m4 + mova m3, m2 + punpcklwd m2, m8 + punpckhwd m3, m8 + pmaddwd m2, [pw_coefhf] + pmaddwd m3, [pw_coefhf] + mova m4, m10 + mova m6, m4 + pmullw m4, [pw_coefhf1] + pmulhw m6, [pw_coefhf1] + mova m5, m4 + punpcklwd m4, m6 + punpckhwd m5, m6 + paddd m2, m4 + paddd m3, m5 + psrad m2, 2 + psrad m3, 2 + + mova m4, m0 + paddw m0, m1 +%if ARCH_X86_64 + LOAD%4 m5, [curq+t2*%5] + LOAD%4 m6, [curq+t3*%5] +%else + mov r4, prefs3mp + mov r5, mrefs3mp + LOAD%4 m5, [curq+t0*%5] + LOAD%4 m6, [curq+t1*%5] + mov r4, prefsmp + mov r5, mrefsmp +%endif + paddw m6, m5 + psubw m1, m4 + ABS1 m1, m4 + pcmpgtw m1, m9 + mova m4, m1 + punpcklwd m1, m4 + punpckhwd m4, m4 + pand m2, m1 + pand m3, m4 + mova m5, [pw_splfdif] + mova m7, m5 + pand m5, m1 + pand m7, m4 + paddw m5, [pw_coefsp] + paddw m7, [pw_coefsp] + mova m4, m0 + punpcklwd m0, m6 + punpckhwd m4, m6 + pmaddwd m0, m5 + pmaddwd m4, m7 + paddd m2, m0 + paddd m3, m4 + psrad m2, 13 + psrad m3, 13 + packssdw m2, m3 + + mova m4, m8 + psraw m4, 1 + mova m0, m11 + mova m3, m4 + psubw m4, m0 + paddw m3, m0 + CLIPW m2, m4, m3 
+ pxor m7, m7 + DISP%4 + + add dstq, STEP + add prevq, STEP + add curq, STEP + add nextq, STEP + sub DWORD wm, mmsize/2 + jg .loop%1 +%endmacro + +%macro PROC 2 +%if ARCH_X86_64 + movsxd r5, DWORD prefsm + movsxd r6, DWORD mrefsm + movsxd r7, DWORD prefs3m + movsxd r8, DWORD mrefs3m + DECLARE_REG_TMP 5, 6, 7, 8 +%else + %define m8 [rsp+ 0] + %define m9 [rsp+16] + %define m10 [rsp+32] + %define m11 [rsp+48] + mov r4, prefsmp + mov r5, mrefsmp + DECLARE_REG_TMP 4, 5 +%endif + cmp DWORD paritym, 0 + je .parity0 + FILTER 1, prevq, curq, %1, %2 + jmp .ret +.parity0: + FILTER 0, curq, nextq, %1, %2 +.ret: + RET +%endmacro + +%macro BWDIF 0 +%if ARCH_X86_64 +cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \ + mrefs, prefs2, mrefs2, prefs3, mrefs3, \ + prefs4, mrefs4, parity, clip_max +%else +cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \ + mrefs, prefs2, mrefs2, prefs3, mrefs3, \ + prefs4, mrefs4, parity, clip_max +%endif + %define STEP mmsize/2 + PROC 8, 1 + +%if ARCH_X86_64 +cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \ + prefs, mrefs, prefs2, mrefs2, \ + prefs3, mrefs3, prefs4, \ + mrefs4, parity, clip_max + movd m12, DWORD clip_maxm + SPLATW m12, m12, 0 +%else +cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ + prefs, mrefs, prefs2, mrefs2, \ + prefs3, mrefs3, prefs4, \ + mrefs4, parity, clip_max + %define m12 [rsp+64] + movd m0, DWORD clip_maxm + SPLATW m0, m0, 0 + mova m12, m0 +%endif + %define STEP mmsize + PROC 12, 2 +%endmacro + +INIT_XMM ssse3 +BWDIF +INIT_XMM sse2 +BWDIF +%if ARCH_X86_32 +INIT_MMX mmxext +BWDIF +%endif diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c new file mode 100644 index 0000000000..1cb8438e5f --- /dev/null +++ b/libavfilter/x86/vf_bwdif_init.c @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2016 Thomas Mundt + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/bwdif.h" +/* filter_line kernels used by the bit_depth <= 8 branch below. They are only declared here; presumably they are the cglobal bwdif_filter_line entry points of x86/vf_bwdif.asm (TODO confirm the ff_ prefixing). */ +void ff_bwdif_filter_line_mmxext(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_sse2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); +/* filter_line kernels used by the 8 < bit_depth <= 12 branch below; same parameter list as the 8-bit ones. */ +void ff_bwdif_filter_line_12bit_mmxext(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, 
int clip_max); +void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); +/** Replace bwdif->filter_line with an x86 SIMD kernel chosen from the runtime CPU flags and the sample depth. The checks run in the order MMXEXT (32-bit x86 builds only), SSE2, SSSE3, and every match overwrites the previous assignment, so the last supported set wins. Depths above 12 leave filter_line as the caller set it; filter_intra and filter_edge are never modified. */ +av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) +{ + int cpu_flags = av_get_cpu_flags(); + int bit_depth = (!bwdif->csp) ? 8 : bwdif->csp->comp[0].depth; /* a NULL csp is treated as 8-bit */ + + if (bit_depth <= 8) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_ssse3; + } else if (bit_depth <= 12) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_12bit_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; + } +}