diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c
index 70e2a668b4..109a7a95f9 100644
--- a/libavfilter/vf_gblur.c
+++ b/libavfilter/vf_gblur.c
@@ -171,13 +171,20 @@ static int filter_postscale(AVFilterContext *ctx, void *arg, int jobnr, int nb_j
     const float min = s->flt ? -FLT_MAX : 0.f;
     const int height = td->height;
     const int width = td->width;
-    const int64_t numpixels = width * (int64_t)height;
-    const int slice_start = (numpixels *  jobnr   ) / nb_jobs;
-    const int slice_end   = (numpixels * (jobnr+1)) / nb_jobs;
+    /* Slice by whole rows of the width padded to 64 floats, so that every
+     * slice length is a multiple of the SIMD vector size. */
+    const int awidth = FFALIGN(width, 64);
+    const int slice_start = (height *  jobnr   ) / nb_jobs;
+    const int slice_end   = (height * (jobnr+1)) / nb_jobs;
     const float postscale = s->postscale * s->postscaleV;
-    float *buffer = s->buffer + slice_start;
+    const int slice_size = slice_end - slice_start;
 
-    s->postscale_slice(buffer, slice_end - slice_start, postscale, min, max);
+    /* With more jobs than rows a slice can be empty. The SIMD loop always
+     * runs at least one iteration, so it must not be called with length 0. */
+    if (slice_size > 0) {
+        s->postscale_slice(s->buffer + slice_start * awidth,
+                           slice_size * awidth, postscale, min, max);
+    }
 
     return 0;
 }
@@ -251,7 +258,7 @@ static int config_input(AVFilterLink *inlink)
 
     s->nb_planes = av_pix_fmt_count_planes(inlink->format);
 
-    s->buffer = av_malloc_array(FFALIGN(inlink->w, 16), FFALIGN(inlink->h, 16) * sizeof(*s->buffer));
+    s->buffer = av_malloc_array(FFALIGN(inlink->w, 64), FFALIGN(inlink->h, 64) * sizeof(*s->buffer));
     if (!s->buffer)
         return AVERROR(ENOMEM);
 
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..c29ecba889 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,60 @@ HORIZ_SLICE
 INIT_XMM avx2
 HORIZ_SLICE
 %endif
+
+; void ff_postscale_slice_{sse,avx2}(float *ptr, int length, float postscale,
+;                                    float min, float max)
+; Scale length floats at ptr by postscale and clamp each to [min, max].
+; A whole vector is processed per iteration and the loop body runs at least
+; once, so length must be a non-zero multiple of the vector width.
+%macro POSTSCALE_SLICE 0
+cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
+    shl lengthd, 2
+    add ptrq, lengthq
+    neg lengthq
+%if ARCH_X86_32
+    ; every float argument is passed on the stack
+    movss xm0, postscalem
+    movss xm1, minm
+    movss xm2, maxm
+%elif WIN64
+    ; postscale and min arrive in xmm2/xmm3; max is the fifth argument and
+    ; is passed on the stack
+    SWAP 0, 2
+    SWAP 1, 3
+    movss xm2, maxm
+%endif
+%if cpuflag(avx2)
+    vbroadcastss m0, xm0
+    vbroadcastss m1, xm1
+    vbroadcastss m2, xm2
+%else
+    shufps xm0, xm0, 0
+    shufps xm1, xm1, 0
+    shufps xm2, xm2, 0
+%endif
+
+    .loop:
+%if cpuflag(avx2)
+    mulps m3, m0, [ptrq + lengthq]
+%else
+    movu m3, [ptrq + lengthq]
+    mulps m3, m0
+%endif
+    maxps m3, m1
+    minps m3, m2
+    movu [ptrq+lengthq], m3
+
+    add lengthq, mmsize
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM sse
+POSTSCALE_SLICE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..d80fb46fe4 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,29 @@
 void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
 void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
 
+void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
+/**
+ * Install the x86 SIMD implementations that the running CPU supports.
+ * postscale_slice is set on every x86 target; horiz_slice only on x86_64.
+ */
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
-#if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE4(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->postscale_slice = ff_postscale_slice_sse;
+    }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        s->postscale_slice = ff_postscale_slice_avx2;
+    }
+#if ARCH_X86_64
+    if (EXTERNAL_SSE4(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_sse4;
-    if (EXTERNAL_AVX2(cpu_flags))
+    }
+    if (EXTERNAL_AVX2(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_avx2;
+    }
 #endif
 }