avfilter/x86/vf_hflip: add AVX2 versions of hflip_byte and hflip_short

This commit is contained in:
Martin Vignali 2017-12-19 21:06:01 +01:00
parent a4a4179e83
commit f181648176
2 changed files with 27 additions and 5 deletions

View File

@ -32,7 +32,7 @@ SECTION .text
;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short) ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
%macro HFLIP 3 %macro HFLIP 3
cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
mova m0, [pb_flip_%1] VBROADCASTI128 m0, [pb_flip_%1]
xor xq, xq xor xq, xq
%if %3 == 1 %if %3 == 1
movsxdifnidn wq, wd movsxdifnidn wq, wd
@ -47,8 +47,13 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
.loop0: .loop0:
neg xq neg xq
%if mmsize == 32
; 256-bit path: the pshufb applied after this block only reorders bytes
; inside each 128-bit lane, so the two lanes are exchanged here, while
; loading, to complete the full-width reversal.
vpermq m1, [srcq + xq - mmsize + %3], 0x4e; 0x4e picks qwords 2,3,0,1: swap the two 128-bit lanes at load
vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; same lane swap for the second vector
%else
; 128-bit path: single lane, a plain unaligned load is enough.
movu m1, [srcq + xq - mmsize + %3] movu m1, [srcq + xq - mmsize + %3]
movu m2, [srcq + xq - 2 * mmsize + %3] movu m2, [srcq + xq - 2 * mmsize + %3]
%endif
pshufb m1, m0 pshufb m1, m0
pshufb m2, m0 pshufb m2, m0
neg xq neg xq
@ -78,3 +83,8 @@ INIT_XMM ssse3
HFLIP byte, b, 1 HFLIP byte, b, 1
HFLIP short, w, 2 HFLIP short, w, 2
; Emit the avx2 variants of hflip_byte and hflip_short (declared on the C
; side as ff_hflip_byte_avx2 / ff_hflip_short_avx2) only when
; HAVE_AVX2_EXTERNAL is set for this build.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HFLIP byte, b, 1
HFLIP short, w, 2
%endif

View File

@ -24,7 +24,9 @@
#include "libavfilter/hflip.h" #include "libavfilter/hflip.h"
void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
{ {
@ -32,10 +34,20 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
int i; int i;
for (i = 0; i < nb_planes; i++) { for (i = 0; i < nb_planes; i++) {
if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) { if (step[i] == 1) {
s->flip_line[i] = ff_hflip_byte_ssse3; if (EXTERNAL_SSSE3(cpu_flags)) {
} else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) { s->flip_line[i] = ff_hflip_byte_ssse3;
s->flip_line[i] = ff_hflip_short_ssse3; }
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->flip_line[i] = ff_hflip_byte_avx2;
}
} else if (step[i] == 2) {
if (EXTERNAL_SSSE3(cpu_flags)) {
s->flip_line[i] = ff_hflip_short_ssse3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->flip_line[i] = ff_hflip_short_avx2;
}
} }
} }
} }