ffmpeg/libavfilter/x86/vf_yadif_init.c

/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/x86/dsputil_mmx.h"
#include "libavfilter/yadif.h"

/*
 * Line filters for the default (non-high-bit-depth) path.
 *
 * Only declared here; no definition is visible in this file. They are
 * installed into YADIFContext.filter_line by ff_yadif_init_x86() when the
 * bit depth falls in neither the 16-bit nor the 9..14-bit branch.
 * The mmxext variant is only selected on ARCH_X86_32 builds.
 * NOTE(review): presumably implemented in external assembly, since every
 * use is guarded by HAVE_YASM and EXTERNAL_*() checks -- confirm.
 */
void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
                               void *next, int w, int prefs,
                               int mrefs, int parity, int mode);
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                void *next, int w, int prefs,
                                int mrefs, int parity, int mode);

/*
 * Line filters for the "16bit" path.
 *
 * Only declared here; no definition is visible in this file. They are
 * installed by ff_yadif_init_x86() when the computed bit depth is 15 or
 * more. This is the only family with an SSE4 variant; the mmxext variant
 * is only selected on ARCH_X86_32 builds.
 */
void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
                                       void *next, int w, int prefs,
                                       int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
                                      int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);

/*
 * Line filters for the "10bit" path.
 *
 * Only declared here; no definition is visible in this file. Despite the
 * name, ff_yadif_init_x86() installs them for every bit depth from 9
 * through 14. The mmxext variant is only selected on ARCH_X86_32 builds.
 */
void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
                                       void *next, int w, int prefs,
                                       int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
                                      int mrefs, int parity, int mode);

/*
 * Install an x86-specific line filter into a yadif context.
 *
 * The bit depth is read from the first component of yadif->csp, or taken
 * as 8 when csp is NULL, and selects one of three filter families:
 * 15 and up -> "16bit", 9..14 -> "10bit", anything else -> default.
 * Within a family the CPU checks run from the oldest instruction set to
 * the newest and each match overwrites the previous assignment, so the
 * last matching check wins. yadif->filter_line is left untouched when no
 * check matches or when HAVE_YASM is 0.
 */
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
    int cpu_flags = av_get_cpu_flags();
    int bit_depth = 8;

    if (yadif->csp)
        bit_depth = yadif->csp->comp[0].depth_minus1 + 1;

#if HAVE_YASM
    if (bit_depth > 14) {
        /* 15 bits and up: filters working on 16-bit samples */
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
        }
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
        }
    } else if (bit_depth > 8) {
        /* 9..14 bits (the upper bound is implied by the branch above) */
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
        }
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
        }
    } else {
        /* everything else: the default filters */
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_mmxext;
        }
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            yadif->filter_line = ff_yadif_filter_line_ssse3;
        }
    }
#endif /* HAVE_YASM */
}
yadif filter, based on stefanos port of my yadif from mplayer. Compared to stefanos, 2 frame output works with ffplay. Originally committed as revision 25196 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-25 16:43:42 +00:00			`/*`
			`* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License along`
			`* with FFmpeg; if not, write to the Free Software Foundation, Inc.,`
			`* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`
			`*/`

vf_yadif: move x86 init code to x86/yadif.c Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-07-01 12:08:17 +00:00			`#include "libavutil/attributes.h"`
yadif filter, based on stefanos port of my yadif from mplayer. Compared to stefanos, 2 frame output works with ffplay. Originally committed as revision 25196 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-25 16:43:42 +00:00			`#include "libavutil/cpu.h"`
Don't include common.h from avutil.h Signed-off-by: Martin Storsjö <martin@martin.st> 2012-08-06 13:49:32 +00:00			`#include "libavutil/mem.h"`
x86: rename libavutil/x86_cpu.h to libavutil/x86/asm.h This puts x86-specific things in the x86/ subdirectory where they belong. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-08-08 12:51:52 +00:00			`#include "libavutil/x86/asm.h"`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`#include "libavutil/x86/cpu.h"`
In yadif filter, declare asm constants directly to avoid dependency on libavcodec Originally committed as revision 25895 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-12-06 00:14:15 +00:00			`#include "libavcodec/x86/dsputil_mmx.h"`
yadif filter, based on stefanos port of my yadif from mplayer. Compared to stefanos, 2 frame output works with ffplay. Originally committed as revision 25196 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-25 16:43:42 +00:00			`#include "libavfilter/yadif.h"`

vf_yadif: silence a warning. clang says: libavfilter/vf_yadif.c:192:28: warning: incompatible pointer types assigning to 'void (*)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int, int)' from 'void (uint16_t *, uint16_t *, uint16_t *, uint16_t *, int, int, int, int, int)' 2013-01-26 19:49:16 +00:00			`void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`int mrefs, int parity, int mode);`
vf_yadif: silence a warning. clang says: libavfilter/vf_yadif.c:192:28: warning: incompatible pointer types assigning to 'void (*)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int, int)' from 'void (uint16_t *, uint16_t *, uint16_t *, uint16_t *, int, int, int, int, int)' 2013-01-26 19:49:16 +00:00			`void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`int mrefs, int parity, int mode);`
vf_yadif: silence a warning. clang says: libavfilter/vf_yadif.c:192:28: warning: incompatible pointer types assigning to 'void (*)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int, int)' from 'void (uint16_t *, uint16_t *, uint16_t *, uint16_t *, int, int, int, int, int)' 2013-01-26 19:49:16 +00:00			`void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`int mrefs, int parity, int mode);`
lavfi: place x86 inline assembly under HAVE_INLINE_ASM. This allows compiling this code using compilers that do not understand gcc-style inline assembly. 2012-07-22 00:03:12 +00:00
yadif: x86 assembly for 16-bit samples This is a fairly dumb copy of the assembly for 8-bit samples but it works and produces identical output to the C version. The options have been tested on an Athlon64 and a Core2Quad. Athlon64: 1810385 decicycles in C, 32726 runs, 42 skips 1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster 818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster Core2Quad: 924025 decicycles in C, 32750 runs, 18 skips 623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster 406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster 387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster 307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:23 +00:00			`void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`
			`void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`
			`void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`
			`void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`

yadif: x86 assembly for 9 to 14-bit samples These smaller samples do not need to be unpacked to double words allowing the code to process more pixels every iteration (still 2 in MMX but 6 in SSE2). It also avoids emulating the missing double word instructions on older instruction sets. Like with the previous code for 16-bit samples this has been tested on an Athlon64 and a Core2Quad. Athlon64: 1809275 decicycles in C, 32718 runs, 50 skips 911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster 495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster Core2Quad: 921363 decicycles in C, 32756 runs, 12 skips 486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster 293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster 284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:24 +00:00			`void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`
			`void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`
			`void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,`
			`void *next, int w, int prefs,`
			`int mrefs, int parity, int mode);`

vf_yadif: move x86 init code to x86/yadif.c Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-07-01 12:08:17 +00:00			`av_cold void ff_yadif_init_x86(YADIFContext *yadif)`
			`{`
			`int cpu_flags = av_get_cpu_flags();`
yadif: x86 assembly for 16-bit samples This is a fairly dumb copy of the assembly for 8-bit samples but it works and produces identical output to the C version. The options have been tested on an Athlon64 and a Core2Quad. Athlon64: 1810385 decicycles in C, 32726 runs, 42 skips 1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster 818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster Core2Quad: 924025 decicycles in C, 32750 runs, 18 skips 623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster 406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster 387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster 307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:23 +00:00			`int bit_depth = (!yadif->csp) ? 8`
			`: yadif->csp->comp[0].depth_minus1 + 1;`
vf_yadif: move x86 init code to x86/yadif.c Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-07-01 12:08:17 +00:00
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`#if HAVE_YASM`
yadif: x86 assembly for 9 to 14-bit samples These smaller samples do not need to be unpacked to double words allowing the code to process more pixels every iteration (still 2 in MMX but 6 in SSE2). It also avoids emulating the missing double word instructions on older instruction sets. Like with the previous code for 16-bit samples this has been tested on an Athlon64 and a Core2Quad. Athlon64: 1809275 decicycles in C, 32718 runs, 50 skips 911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster 495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster Core2Quad: 921363 decicycles in C, 32756 runs, 12 skips 486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster 293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster 284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:24 +00:00			`if (bit_depth >= 15) {`
yadif: x86 assembly for 16-bit samples This is a fairly dumb copy of the assembly for 8-bit samples but it works and produces identical output to the C version. The options have been tested on an Athlon64 and a Core2Quad. Athlon64: 1810385 decicycles in C, 32726 runs, 42 skips 1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster 818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster Core2Quad: 924025 decicycles in C, 32750 runs, 18 skips 623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster 406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster 387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster 307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:23 +00:00			`#if ARCH_X86_32`
			`if (EXTERNAL_MMXEXT(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;`
			`#endif /* ARCH_X86_32 */`
			`if (EXTERNAL_SSE2(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_16bit_sse2;`
			`if (EXTERNAL_SSSE3(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;`
			`if (EXTERNAL_SSE4(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_16bit_sse4;`
yadif: x86 assembly for 9 to 14-bit samples These smaller samples do not need to be unpacked to double words allowing the code to process more pixels every iteration (still 2 in MMX but 6 in SSE2). It also avoids emulating the missing double word instructions on older instruction sets. Like with the previous code for 16-bit samples this has been tested on an Athlon64 and a Core2Quad. Athlon64: 1809275 decicycles in C, 32718 runs, 50 skips 911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster 495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster Core2Quad: 921363 decicycles in C, 32756 runs, 12 skips 486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster 293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster 284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:24 +00:00			`} else if ( bit_depth >= 9 && bit_depth <= 14) {`
			`#if ARCH_X86_32`
			`if (EXTERNAL_MMXEXT(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;`
			`#endif /* ARCH_X86_32 */`
			`if (EXTERNAL_SSE2(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_10bit_sse2;`
			`if (EXTERNAL_SSSE3(cpu_flags))`
			`yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;`
yadif: x86 assembly for 16-bit samples This is a fairly dumb copy of the assembly for 8-bit samples but it works and produces identical output to the C version. The options have been tested on an Athlon64 and a Core2Quad. Athlon64: 1810385 decicycles in C, 32726 runs, 42 skips 1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster 818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster Core2Quad: 924025 decicycles in C, 32750 runs, 18 skips 623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster 406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster 387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster 307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:23 +00:00			`} else {`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`#if ARCH_X86_32`
yadif: restore speed of the C filtering code Always use the special filter for the first and last 3 columns (only). Changes made in 64ed397 slowed the filter to just under 3/4 of what it was. This commit restores the speed while maintaining identical output. For reference, on my Athlon64: 1733222 decicycles in old 2358563 decicycles in new 1727558 decicycles in this Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-10 14:08:50 +00:00			`if (EXTERNAL_MMXEXT(cpu_flags))`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`yadif->filter_line = ff_yadif_filter_line_mmxext;`
			`#endif /* ARCH_X86_32 */`
yadif: restore speed of the C filtering code Always use the special filter for the first and last 3 columns (only). Changes made in 64ed397 slowed the filter to just under 3/4 of what it was. This commit restores the speed while maintaining identical output. For reference, on my Athlon64: 1733222 decicycles in old 2358563 decicycles in new 1727558 decicycles in this Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-10 14:08:50 +00:00			`if (EXTERNAL_SSE2(cpu_flags))`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`yadif->filter_line = ff_yadif_filter_line_sse2;`
yadif: restore speed of the C filtering code Always use the special filter for the first and last 3 columns (only). Changes made in 64ed397 slowed the filter to just under 3/4 of what it was. This commit restores the speed while maintaining identical output. For reference, on my Athlon64: 1733222 decicycles in old 2358563 decicycles in new 1727558 decicycles in this Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-10 14:08:50 +00:00			`if (EXTERNAL_SSSE3(cpu_flags))`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`yadif->filter_line = ff_yadif_filter_line_ssse3;`
yadif: x86 assembly for 16-bit samples This is a fairly dumb copy of the assembly for 8-bit samples but it works and produces identical output to the C version. The options have been tested on an Athlon64 and a Core2Quad. Athlon64: 1810385 decicycles in C, 32726 runs, 42 skips 1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster 818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster Core2Quad: 924025 decicycles in C, 32750 runs, 18 skips 623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster 406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster 387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster 307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-03-16 20:42:23 +00:00			`}`
yadif: Port inline assembly to yasm Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 2013-01-09 15:34:46 +00:00			`#endif /* HAVE_YASM */`
vf_yadif: move x86 init code to x86/yadif.c Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-07-01 12:08:17 +00:00			`}`