lavfi/lut: Add slice threading support

Used the command for 1080p h264 clip as follow:

a). ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null
b). ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null

after enabled the slice threading, the fps change from:

a). 144fps to 258fps (lutyuv)
b). 94fps  to 153fps (lutrgb)

in Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz

Reviewed-by: Paul B Mahol <onemda@gmail.com>
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
This commit is contained in:
Jun Zhao 2019-05-21 23:39:09 +08:00
parent 360bee8ca4
commit bbad0bc5ff
1 changed files with 197 additions and 113 deletions

View File

@ -337,13 +337,194 @@ static int config_props(AVFilterLink *inlink)
return 0;
}
struct thread_data {
AVFrame *in;
AVFrame *out;
int w;
int h;
};
#define LOAD_PACKED_COMMON\
LutContext *s = ctx->priv;\
const struct thread_data *td = arg;\
\
int i, j;\
const int w = td->w;\
const int h = td->h;\
AVFrame *in = td->in;\
AVFrame *out = td->out;\
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;\
const int step = s->step;\
\
const int slice_start = (h * jobnr ) / nb_jobs;\
const int slice_end = (h * (jobnr+1)) / nb_jobs;\
/* packed, 16-bit */
static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PACKED_COMMON
uint16_t *inrow, *outrow, *inrow0, *outrow0;
const int in_linesize = in->linesize[0] / 2;
const int out_linesize = out->linesize[0] / 2;
inrow0 = (uint16_t *)in ->data[0];
outrow0 = (uint16_t *)out->data[0];
for (i = slice_start; i < slice_end; i++) {
inrow = inrow0 + i * in_linesize;
outrow = outrow0 + i * out_linesize;
for (j = 0; j < w; j++) {
switch (step) {
#if HAVE_BIGENDIAN
case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
#else
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
default: outrow[0] = tab[0][inrow[0]];
#endif
}
outrow += step;
inrow += step;
}
}
return 0;
}
/* packed, 8-bit */
static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PACKED_COMMON
uint8_t *inrow, *outrow, *inrow0, *outrow0;
const int in_linesize = in->linesize[0];
const int out_linesize = out->linesize[0];
inrow0 = in ->data[0];
outrow0 = out->data[0];
for (i = slice_start; i < slice_end; i++) {
inrow = inrow0 + i * in_linesize;
outrow = outrow0 + i * out_linesize;
for (j = 0; j < w; j++) {
switch (step) {
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
default: outrow[0] = tab[0][inrow[0]];
}
outrow += step;
inrow += step;
}
}
return 0;
}
#define LOAD_PLANAR_COMMON\
LutContext *s = ctx->priv;\
const struct thread_data *td = arg;\
int i, j, plane;\
AVFrame *in = td->in;\
AVFrame *out = td->out;\
#define PLANAR_COMMON\
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;\
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;\
int h = AV_CEIL_RSHIFT(td->h, vsub);\
int w = AV_CEIL_RSHIFT(td->w, hsub);\
const uint16_t *tab = s->lut[plane];\
\
const int slice_start = (h * jobnr ) / nb_jobs;\
const int slice_end = (h * (jobnr+1)) / nb_jobs;\
/* planar >8 bit depth */
static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PLANAR_COMMON
uint16_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
PLANAR_COMMON
const int in_linesize = in->linesize[plane] / 2;
const int out_linesize = out->linesize[plane] / 2;
inrow = (uint16_t *)(in ->data[plane] + slice_start * in_linesize);
outrow = (uint16_t *)(out->data[plane] + slice_start * out_linesize);
for (i = slice_start; i < slice_end; i++) {
for (j = 0; j < w; j++) {
#if HAVE_BIGENDIAN
outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
#else
outrow[j] = tab[inrow[j]];
#endif
}
inrow += in_linesize;
outrow += out_linesize;
}
}
return 0;
}
/* planar 8bit depth */
static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PLANAR_COMMON
uint8_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
PLANAR_COMMON
const int in_linesize = in->linesize[plane];
const int out_linesize = out->linesize[plane];
inrow = in ->data[plane] + slice_start * in_linesize;
outrow = out->data[plane] + slice_start * out_linesize;
for (i = slice_start; i < slice_end; i++) {
for (j = 0; j < w; j++)
outrow[j] = tab[inrow[j]];
inrow += in_linesize;
outrow += out_linesize;
}
}
return 0;
}
#define PACKED_THREAD_DATA\
struct thread_data td = {\
.in = in,\
.out = out,\
.w = inlink->w,\
.h = in->height,\
};\
#define PLANAR_THREAD_DATA\
struct thread_data td = {\
.in = in,\
.out = out,\
.w = inlink->w,\
.h = inlink->h,\
};\
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
AVFilterContext *ctx = inlink->dst;
LutContext *s = ctx->priv;
AVFilterLink *outlink = ctx->outputs[0];
AVFrame *out;
int i, j, plane, direct = 0;
int direct = 0;
if (av_frame_is_writable(in)) {
direct = 1;
@ -359,121 +540,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
if (s->is_rgb && s->is_16bit && !s->is_planar) {
/* packed, 16-bit */
uint16_t *inrow, *outrow, *inrow0, *outrow0;
const int w = inlink->w;
const int h = in->height;
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
const int in_linesize = in->linesize[0] / 2;
const int out_linesize = out->linesize[0] / 2;
const int step = s->step;
inrow0 = (uint16_t*) in ->data[0];
outrow0 = (uint16_t*) out->data[0];
for (i = 0; i < h; i ++) {
inrow = inrow0;
outrow = outrow0;
for (j = 0; j < w; j++) {
switch (step) {
#if HAVE_BIGENDIAN
case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
#else
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
default: outrow[0] = tab[0][inrow[0]];
#endif
}
outrow += step;
inrow += step;
}
inrow0 += in_linesize;
outrow0 += out_linesize;
}
PACKED_THREAD_DATA
ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_rgb && !s->is_planar) {
/* packed */
uint8_t *inrow, *outrow, *inrow0, *outrow0;
const int w = inlink->w;
const int h = in->height;
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
const int in_linesize = in->linesize[0];
const int out_linesize = out->linesize[0];
const int step = s->step;
inrow0 = in ->data[0];
outrow0 = out->data[0];
for (i = 0; i < h; i ++) {
inrow = inrow0;
outrow = outrow0;
for (j = 0; j < w; j++) {
switch (step) {
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
default: outrow[0] = tab[0][inrow[0]];
}
outrow += step;
inrow += step;
}
inrow0 += in_linesize;
outrow0 += out_linesize;
}
/* packed 8 bits */
PACKED_THREAD_DATA
ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_16bit) {
// planar >8 bit depth
uint16_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
const uint16_t *tab = s->lut[plane];
const int in_linesize = in->linesize[plane] / 2;
const int out_linesize = out->linesize[plane] / 2;
inrow = (uint16_t *)in ->data[plane];
outrow = (uint16_t *)out->data[plane];
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
#if HAVE_BIGENDIAN
outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
#else
outrow[j] = tab[inrow[j]];
#endif
}
inrow += in_linesize;
outrow += out_linesize;
}
}
/* planar >8 bit depth */
PLANAR_THREAD_DATA
ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else {
/* planar 8bit depth */
uint8_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
const uint16_t *tab = s->lut[plane];
const int in_linesize = in->linesize[plane];
const int out_linesize = out->linesize[plane];
inrow = in ->data[plane];
outrow = out->data[plane];
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++)
outrow[j] = tab[inrow[j]];
inrow += in_linesize;
outrow += out_linesize;
}
}
PLANAR_THREAD_DATA
ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
}
if (!direct)
@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
.query_formats = query_formats, \
.inputs = inputs, \
.outputs = outputs, \
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, \
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, \
}
#if CONFIG_LUT_FILTER