2010-09-25 16:43:42 +00:00
|
|
|
/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
|
|
|
|
|
2012-07-01 12:08:17 +00:00
|
|
|
#include "libavutil/attributes.h"
|
2010-09-25 16:43:42 +00:00
|
|
|
#include "libavutil/cpu.h"
|
2012-08-06 13:49:32 +00:00
|
|
|
#include "libavutil/mem.h"
|
2012-08-08 12:51:52 +00:00
|
|
|
#include "libavutil/x86/asm.h"
|
2013-01-09 15:34:46 +00:00
|
|
|
#include "libavutil/x86/cpu.h"
|
2010-12-06 00:14:15 +00:00
|
|
|
#include "libavcodec/x86/dsputil_mmx.h"
|
2010-09-25 16:43:42 +00:00
|
|
|
#include "libavfilter/yadif.h"
|
|
|
|
|
vf_yadif: silence a warning.
clang says:
libavfilter/vf_yadif.c:192:28: warning: incompatible pointer types assigning to
'void (*)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int, int)'
from 'void (uint16_t *, uint16_t *, uint16_t *, uint16_t *, int, int, int, int, int)'
2013-01-26 19:49:16 +00:00
|
|
|
/*
 * Line filters used by ff_yadif_init_x86() when the sample depth is below
 * 9 bits, one per instruction-set level. They are only declared here; the
 * definitions live outside this file (presumably external x86 assembly,
 * given the HAVE_YASM guard around their use).
 *
 * The sample pointers are typed void * rather than a fixed sample type so
 * that these functions and the wider-sample variants can all be assigned
 * to the same YADIFContext.filter_line pointer without an
 * incompatible-pointer-type warning.
 *
 * The MMXEXT variant is only referenced on ARCH_X86_32 builds.
 */
void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
                               void *next, int w, int prefs,
                               int mrefs, int parity, int mode);
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                void *next, int w, int prefs,
                                int mrefs, int parity, int mode);
|
2012-07-22 00:03:12 +00:00
|
|
|
|
yadif: x86 assembly for 16-bit samples
This is a fairly dumb copy of the assembly for 8-bit samples but it
works and produces identical output to the C version. The options have
been tested on an Athlon64 and a Core2Quad.
Athlon64:
1810385 decicycles in C, 32726 runs, 42 skips
1080744 decicycles in mmx, 32744 runs, 24 skips, 1.7x faster
818315 decicycles in sse2, 32735 runs, 33 skips, 2.2x faster
Core2Quad:
924025 decicycles in C, 32750 runs, 18 skips
623995 decicycles in mmx, 32767 runs, 1 skips, 1.5x faster
406223 decicycles in sse2, 32764 runs, 4 skips, 2.3x faster
387842 decicycles in ssse3, 32767 runs, 1 skips, 2.4x faster
307726 decicycles in sse4, 32763 runs, 5 skips, 3.0x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2013-03-16 20:42:23 +00:00
|
|
|
/*
 * Line filters used by ff_yadif_init_x86() when the sample depth is 15 bits
 * or more, one per instruction-set level. They are only declared here; the
 * definitions live outside this file (presumably external x86 assembly,
 * given the HAVE_YASM guard around their use).
 *
 * The signature matches the other yadif line filters in this file so that
 * all of them can be stored in YADIFContext.filter_line.
 *
 * The MMXEXT variant is only referenced on ARCH_X86_32 builds.
 */
void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
                                       void *next, int w, int prefs,
                                       int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
                                      int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
|
|
|
|
|
yadif: x86 assembly for 9 to 14-bit samples
These smaller samples do not need to be unpacked to double words
allowing the code to process more pixels every iteration (still 2 in MMX
but 6 in SSE2). It also avoids emulating the missing double word
instructions on older instruction sets.
Like with the previous code for 16-bit samples this has been tested on
an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2013-03-16 20:42:24 +00:00
|
|
|
/*
 * Line filters used by ff_yadif_init_x86() when the sample depth is between
 * 9 and 14 bits inclusive, one per instruction-set level. They are only
 * declared here; the definitions live outside this file (presumably
 * external x86 assembly, given the HAVE_YASM guard around their use).
 *
 * The signature matches the other yadif line filters in this file so that
 * all of them can be stored in YADIFContext.filter_line.
 *
 * The MMXEXT variant is only referenced on ARCH_X86_32 builds.
 */
void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
                                       void *next, int w, int prefs,
                                       int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
                                      int mrefs, int parity, int mode);
|
|
|
|
|
2012-07-01 12:08:17 +00:00
|
|
|
/**
 * Install the x86-optimised line filter that matches the sample depth and
 * the instruction sets reported by av_get_cpu_flags().
 *
 * The bit depth is taken from the first component of yadif->csp; when
 * yadif->csp is NULL, 8 bits are assumed. Three filter families exist:
 *   - depth >= 15      : ff_yadif_filter_line_16bit_*
 *   - 9 <= depth <= 14 : ff_yadif_filter_line_10bit_*
 *   - anything else    : ff_yadif_filter_line_*
 *
 * Within a family the checks run from the oldest to the newest instruction
 * set and each match overwrites the previous assignment, so the last
 * matching check wins. If no check matches, or HAVE_YASM is 0,
 * yadif->filter_line is left untouched.
 *
 * @param yadif filter context whose filter_line pointer may be replaced
 */
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
    int cpu_flags = av_get_cpu_flags();
    int bit_depth = (!yadif->csp) ? 8
                                  : yadif->csp->comp[0].depth_minus1 + 1;

#if HAVE_YASM
    if (bit_depth >= 15) {
        /* MMXEXT variants are only considered on 32-bit x86 builds. */
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
        if (EXTERNAL_SSSE3(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
        if (EXTERNAL_SSE4(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
    } else if (bit_depth >= 9 && bit_depth <= 14) {
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
        if (EXTERNAL_SSSE3(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
    } else {
#if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_mmxext;
#endif /* ARCH_X86_32 */
        if (EXTERNAL_SSE2(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_sse2;
        if (EXTERNAL_SSSE3(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_ssse3;
    }
#endif /* HAVE_YASM */
}
|