From c59e49f9b2f30184e6d74255f3c1b2a1a89c6520 Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Mon, 25 Dec 2017 11:53:54 +0100 Subject: [PATCH] avfilter/vf_convolve: implement slice threading Signed-off-by: Paul B Mahol --- libavfilter/vf_convolve.c | 142 +++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 42 deletions(-) diff --git a/libavfilter/vf_convolve.c b/libavfilter/vf_convolve.c index e3d0c5fa2a..de58cdff8f 100644 --- a/libavfilter/vf_convolve.c +++ b/libavfilter/vf_convolve.c @@ -29,12 +29,14 @@ #include "internal.h" #include "video.h" +#define MAX_THREADS 16 + typedef struct ConvolveContext { const AVClass *class; FFFrameSync fs; - FFTContext *fft[4]; - FFTContext *ifft[4]; + FFTContext *fft[4][MAX_THREADS]; + FFTContext *ifft[4][MAX_THREADS]; int fft_bits[4]; int fft_len[4]; @@ -152,15 +154,28 @@ static int config_input_impulse(AVFilterLink *inlink) return 0; } -static void fft_horizontal(ConvolveContext *s, FFTComplex *fft_hdata, - int n, int plane) +typedef struct ThreadData { + FFTComplex *hdata, *vdata; + int plane, n; +} ThreadData; + +static int fft_horizontal(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { + ConvolveContext *s = ctx->priv; + ThreadData *td = arg; + FFTComplex *hdata = td->hdata; + const int plane = td->plane; + const int n = td->n; + int start = (n * jobnr ) / nb_jobs; + int end = (n * (jobnr+1)) / nb_jobs; int y; - for (y = 0; y < n; y++) { - av_fft_permute(s->fft[plane], fft_hdata + y * n); - av_fft_calc(s->fft[plane], fft_hdata + y * n); + for (y = start; y < end; y++) { + av_fft_permute(s->fft[plane][jobnr], hdata + y * n); + av_fft_calc(s->fft[plane][jobnr], hdata + y * n); } + + return 0; } static void get_input(ConvolveContext *s, FFTComplex *fft_hdata, @@ -238,46 +253,73 @@ static void get_input(ConvolveContext *s, FFTComplex *fft_hdata, } } -static void fft_vertical(ConvolveContext *s, FFTComplex *fft_hdata, FFTComplex *fft_vdata, - int n, int plane) +static int fft_vertical(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { + ConvolveContext *s = ctx->priv; + ThreadData *td = arg; + FFTComplex *hdata = td->hdata; + FFTComplex *vdata = td->vdata; + const int plane = td->plane; + const int n = td->n; + int start = (n * jobnr ) / nb_jobs; + int end = (n * (jobnr+1)) / nb_jobs; int y, x; - for (y = 0; y < n; y++) { + for (y = start; y < end; y++) { for (x = 0; x < n; x++) { - fft_vdata[y * n + x].re = fft_hdata[x * n + y].re; - fft_vdata[y * n + x].im = fft_hdata[x * n + y].im; + vdata[y * n + x].re = hdata[x * n + y].re; + vdata[y * n + x].im = hdata[x * n + y].im; } - av_fft_permute(s->fft[plane], fft_vdata + y * n); - av_fft_calc(s->fft[plane], fft_vdata + y * n); + av_fft_permute(s->fft[plane][jobnr], vdata + y * n); + av_fft_calc(s->fft[plane][jobnr], vdata + y * n); } + + return 0; } -static void ifft_vertical(ConvolveContext *s, int n, int plane) +static int ifft_vertical(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { + ConvolveContext *s = ctx->priv; + ThreadData *td = arg; + FFTComplex *hdata = td->hdata; + FFTComplex *vdata = td->vdata; + const int plane = td->plane; + const int n = td->n; + int start = (n * jobnr ) / nb_jobs; + int end = (n * (jobnr+1)) / nb_jobs; int y, x; - for (y = 0; y < n; y++) { - av_fft_permute(s->ifft[plane], s->fft_vdata[plane] + y * n); - av_fft_calc(s->ifft[plane], s->fft_vdata[plane] + y * n); + for (y = start; y < end; y++) { + av_fft_permute(s->ifft[plane][jobnr], vdata + y * n); + av_fft_calc(s->ifft[plane][jobnr], vdata + y * n); for (x = 0; x < n; x++) { - s->fft_hdata[plane][x * n + y].re = s->fft_vdata[plane][y * n + x].re; - s->fft_hdata[plane][x * n + y].im = s->fft_vdata[plane][y * n + x].im; + hdata[x * n + y].re = vdata[y * n + x].re; + hdata[x * n + y].im = vdata[y * n + x].im; } } + + return 0; } -static void ifft_horizontal(ConvolveContext *s, int n, int plane) +static int ifft_horizontal(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { - FFTComplex *input = s->fft_hdata[plane]; + ConvolveContext *s = ctx->priv; + ThreadData *td = arg; + FFTComplex *hdata = td->hdata; + const int plane = td->plane; + const int n = td->n; + int start = (n * jobnr ) / nb_jobs; + int end = (n * (jobnr+1)) / nb_jobs; int y; - for (y = 0; y < n; y++) { - av_fft_permute(s->ifft[plane], input + y * n); - av_fft_calc(s->ifft[plane], input + y * n); + for (y = start; y < end; y++) { + av_fft_permute(s->ifft[plane][jobnr], hdata + y * n); + av_fft_calc(s->ifft[plane][jobnr], hdata + y * n); } + + return 0; } static void get_output(ConvolveContext *s, AVFrame *out, @@ -356,15 +398,20 @@ static int do_convolve(FFFrameSync *fs) const int w = s->planewidth[plane]; const int h = s->planeheight[plane]; float total = 0; + ThreadData td; if (!(s->planes & (1 << plane))) { continue; } + td.plane = plane, td.n = n; get_input(s, s->fft_hdata[plane], mainpic, w, h, n, plane, 1.f); - fft_horizontal(s, s->fft_hdata[plane], n, plane); - fft_vertical(s, s->fft_hdata[plane], s->fft_vdata[plane], - n, plane); + + td.hdata = s->fft_hdata[plane]; + td.vdata = s->fft_vdata[plane]; + + ctx->internal->execute(ctx, fft_horizontal, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); + ctx->internal->execute(ctx, fft_vertical, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); if ((!s->impulse && !s->got_impulse[plane]) || s->impulse) { if (s->depth == 8) { @@ -385,9 +432,12 @@ static int do_convolve(FFFrameSync *fs) total = FFMAX(1, total); get_input(s, s->fft_hdata_impulse[plane], impulsepic, w, h, n, plane, 1 / total); - fft_horizontal(s, s->fft_hdata_impulse[plane], n, plane); - fft_vertical(s, s->fft_hdata_impulse[plane], s->fft_vdata_impulse[plane], - n, plane); + + td.hdata = s->fft_hdata_impulse[plane]; + td.vdata = s->fft_vdata_impulse[plane]; + + ctx->internal->execute(ctx, fft_horizontal, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); + ctx->internal->execute(ctx, fft_vertical, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); s->got_impulse[plane] = 1; } @@ -408,8 +458,11 @@ static int do_convolve(FFFrameSync *fs) } } - ifft_vertical(s, n, plane); - ifft_horizontal(s, n, plane); + td.hdata = s->fft_hdata[plane]; + td.vdata = s->fft_vdata[plane]; + + ctx->internal->execute(ctx, ifft_vertical, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); + ctx->internal->execute(ctx, ifft_horizontal, &td, NULL, FFMIN3(MAX_THREADS, n, ff_filter_get_nb_threads(ctx))); get_output(s, mainpic, w, h, n, plane); } @@ -421,7 +474,7 @@ static int config_output(AVFilterLink *outlink) AVFilterContext *ctx = outlink->src; ConvolveContext *s = ctx->priv; AVFilterLink *mainlink = ctx->inputs[0]; - int ret, i; + int ret, i, j; s->fs.on_event = do_convolve; ret = ff_framesync_init_dualinput(&s->fs, ctx); @@ -437,10 +490,12 @@ static int config_output(AVFilterLink *outlink) return ret; for (i = 0; i < s->nb_planes; i++) { - s->fft[i] = av_fft_init(s->fft_bits[i], 0); - s->ifft[i] = av_fft_init(s->fft_bits[i], 1); - if (!s->fft[i] || !s->ifft[i]) - return AVERROR(ENOMEM); + for (j = 0; j < MAX_THREADS; j++) { + s->fft[i][j] = av_fft_init(s->fft_bits[i], 0); + s->ifft[i][j] = av_fft_init(s->fft_bits[i], 1); + if (!s->fft[i][j] || !s->ifft[i][j]) + return AVERROR(ENOMEM); + } } return 0; @@ -455,15 +510,18 @@ static int activate(AVFilterContext *ctx) static av_cold void uninit(AVFilterContext *ctx) { ConvolveContext *s = ctx->priv; - int i; + int i, j; for (i = 0; i < 4; i++) { av_freep(&s->fft_hdata[i]); av_freep(&s->fft_vdata[i]); av_freep(&s->fft_hdata_impulse[i]); av_freep(&s->fft_vdata_impulse[i]); - av_fft_end(s->fft[i]); - av_fft_end(s->ifft[i]); + + for (j = 0; j < MAX_THREADS; j++) { + av_fft_end(s->fft[i][j]); + av_fft_end(s->ifft[i][j]); + } } ff_framesync_uninit(&s->fs); @@ -502,5 +560,5 @@ AVFilter ff_vf_convolve = { .priv_class = &convolve_class, .inputs = convolve_inputs, .outputs = convolve_outputs, - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL, + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS, };