mirror of https://git.ffmpeg.org/ffmpeg.git
avfilter/x86/vf_gblur: add postscale SIMD
This commit is contained in:
parent
058db59e16
commit
44cf3a2b16
|
@ -171,13 +171,14 @@ static int filter_postscale(AVFilterContext *ctx, void *arg, int jobnr, int nb_j
|
|||
const float min = s->flt ? -FLT_MAX : 0.f;
|
||||
const int height = td->height;
|
||||
const int width = td->width;
|
||||
const int64_t numpixels = width * (int64_t)height;
|
||||
const int slice_start = (numpixels * jobnr ) / nb_jobs;
|
||||
const int slice_end = (numpixels * (jobnr+1)) / nb_jobs;
|
||||
const int awidth = FFALIGN(width, 64);
|
||||
const int slice_start = (height * jobnr ) / nb_jobs;
|
||||
const int slice_end = (height * (jobnr+1)) / nb_jobs;
|
||||
const float postscale = s->postscale * s->postscaleV;
|
||||
float *buffer = s->buffer + slice_start;
|
||||
const int slice_size = slice_end - slice_start;
|
||||
|
||||
s->postscale_slice(buffer, slice_end - slice_start, postscale, min, max);
|
||||
s->postscale_slice(s->buffer + slice_start * awidth,
|
||||
slice_size * awidth, postscale, min, max);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -251,7 +252,7 @@ static int config_input(AVFilterLink *inlink)
|
|||
|
||||
s->nb_planes = av_pix_fmt_count_planes(inlink->format);
|
||||
|
||||
s->buffer = av_malloc_array(FFALIGN(inlink->w, 16), FFALIGN(inlink->h, 16) * sizeof(*s->buffer));
|
||||
s->buffer = av_malloc_array(FFALIGN(inlink->w, 64), FFALIGN(inlink->h, 64) * sizeof(*s->buffer));
|
||||
if (!s->buffer)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
|
|
|
@ -183,3 +183,52 @@ HORIZ_SLICE
|
|||
INIT_XMM avx2
|
||||
HORIZ_SLICE
|
||||
%endif
|
||||
|
||||
; POSTSCALE_SLICE: emits postscale_slice(ptr, length, postscale, min, max)
; for the instruction set selected by the preceding INIT_XMM/INIT_YMM.
; In place, for every packed float in the buffer:
;     v = v * m0;  v = max(v, m1);  v = min(v, m2)
; NOTE(review): m0/m1/m2 are presumably postscale/min/max in prototype
; order, arriving in vector registers per the platform ABI - confirm.
; There is no scalar tail: the loop always processes whole vectors of mmsize
; bytes, so the caller must supply a length (in floats) that is a multiple of
; the vector width, or a buffer padded past the end - TODO confirm at callers.
%macro POSTSCALE_SLICE 0
|
||||
; UNIX64 declares only the two integer arguments; the other targets declare
; all five. NOTE(review): presumably because the float arguments are already
; in xmm0..xmm2 on UNIX64 - verify against x86inc and the ABI.
%if UNIX64
|
||||
cglobal postscale_slice, 2, 2, 4, ptr, length
|
||||
%else
|
||||
cglobal postscale_slice, 5, 5, 4, ptr, length, postscale, min, max
|
||||
%endif
|
||||
; length *= 4: turn the float count into a byte count.
shl lengthd, 2
|
||||
; Point ptr one past the end of the range and ...
add ptrq, lengthq
|
||||
; ... index it with a negative byte offset that counts up towards zero, so
; the add at the bottom of the loop also sets the flags for the exit test.
neg lengthq
|
||||
; WIN64 only: rename vector registers so the rest of the macro can use
; m0/m1/m2. NOTE(review): presumably the float arguments arrive in
; xmm2/xmm3/xmm4 there (positional slots) - confirm.
%if WIN64
|
||||
SWAP 0, 2
|
||||
SWAP 1, 3
|
||||
SWAP 2, 4
|
||||
%endif
|
||||
; Replicate the low float of each of m0/m1/m2 across all lanes.
%if cpuflag(avx2)
|
||||
vbroadcastss m0, xm0
|
||||
vbroadcastss m1, xm1
|
||||
vbroadcastss m2, xm2
|
||||
%else
|
||||
; shufps with immediate 0 copies lane 0 into all four lanes.
shufps xm0, xm0, 0
|
||||
shufps xm1, xm1, 0
|
||||
shufps xm2, xm2, 0
|
||||
%endif
|
||||

||||
; Main loop: one vector (mmsize bytes) per iteration.
.loop:
|
||||
%if cpuflag(avx2)
|
||||
; Three-operand form: load from memory and multiply by m0 in one instruction.
mulps m3, m0, [ptrq + lengthq]
|
||||
%else
|
||||
; Two-operand form: load first (movu, no alignment requirement), then scale.
movu m3, [ptrq + lengthq]
|
||||
mulps m3, m0
|
||||
%endif
|
||||
; Clamp: raise to at least m1, then limit to at most m2.
maxps m3, m1
|
||||
minps m3, m2
|
||||
; Store the result back over the input.
movu [ptrq+lengthq], m3
|
||||

||||
; Advance the negative offset by one vector; keep looping while it is < 0.
add lengthq, mmsize
|
||||
jl .loop
|
||||

||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
POSTSCALE_SLICE
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
POSTSCALE_SLICE
|
||||
%endif
|
||||
|
|
|
@ -27,14 +27,25 @@
|
|||
void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
|
||||
void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
|
||||
|
||||
void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
|
||||
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
|
||||
|
||||
av_cold void ff_gblur_init_x86(GBlurContext *s)
|
||||
{
|
||||
#if ARCH_X86_64
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_SSE4(cpu_flags))
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
s->postscale_slice = ff_postscale_slice_sse;
|
||||
}
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
s->postscale_slice = ff_postscale_slice_avx2;
|
||||
}
|
||||
#if ARCH_X86_64
|
||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||
s->horiz_slice = ff_horiz_slice_sse4;
|
||||
if (EXTERNAL_AVX2(cpu_flags))
|
||||
}
|
||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||
s->horiz_slice = ff_horiz_slice_avx2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue