mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-31 20:02:07 +00:00
lavfi/lut: Add slice threading support
Tested on a 1080p H.264 clip with the following commands: a). ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null b). ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null After enabling slice threading, the fps changed from: a). 144fps to 258fps (lutyuv) b). 94fps to 153fps (lutrgb) on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz Reviewed-by: Paul B Mahol <onemda@gmail.com> Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
This commit is contained in:
parent
360bee8ca4
commit
bbad0bc5ff
@ -337,13 +337,194 @@ static int config_props(AVFilterLink *inlink)
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct thread_data {
|
||||
AVFrame *in;
|
||||
AVFrame *out;
|
||||
|
||||
int w;
|
||||
int h;
|
||||
};
|
||||
|
||||
/*
 * Declarations shared by the packed slice workers.
 *
 * Must be expanded at the top of a function whose parameters are named
 * ctx, arg, jobnr and nb_jobs.  It introduces s, td, i, j, w, h, in, out,
 * tab, step, slice_start and slice_end into that scope.
 *
 * Rows [slice_start, slice_end) are the share of the h rows assigned to
 * job number jobnr out of nb_jobs.
 */
#define LOAD_PACKED_COMMON                                                     \
    LutContext *s = ctx->priv;                                                 \
    const struct thread_data *td = arg;                                        \
    int i, j;                                                                  \
    const int w = td->w;                                                       \
    const int h = td->h;                                                       \
    AVFrame *in  = td->in;                                                     \
    AVFrame *out = td->out;                                                    \
    /* view s->lut as an array of per-component tables of 256*256 entries */   \
    const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;      \
    const int step = s->step;                                                  \
    const int slice_start = (h *  jobnr     ) / nb_jobs;                       \
    const int slice_end   = (h * (jobnr + 1)) / nb_jobs;
|
||||
|
||||
/* packed, 16-bit */
|
||||
static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
||||
{
|
||||
LOAD_PACKED_COMMON
|
||||
|
||||
uint16_t *inrow, *outrow, *inrow0, *outrow0;
|
||||
const int in_linesize = in->linesize[0] / 2;
|
||||
const int out_linesize = out->linesize[0] / 2;
|
||||
inrow0 = (uint16_t *)in ->data[0];
|
||||
outrow0 = (uint16_t *)out->data[0];
|
||||
|
||||
for (i = slice_start; i < slice_end; i++) {
|
||||
inrow = inrow0 + i * in_linesize;
|
||||
outrow = outrow0 + i * out_linesize;
|
||||
for (j = 0; j < w; j++) {
|
||||
|
||||
switch (step) {
|
||||
#if HAVE_BIGENDIAN
|
||||
case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
|
||||
case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
|
||||
case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
|
||||
default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
|
||||
#else
|
||||
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
|
||||
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
|
||||
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
|
||||
default: outrow[0] = tab[0][inrow[0]];
|
||||
#endif
|
||||
}
|
||||
outrow += step;
|
||||
inrow += step;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* packed, 8-bit */
|
||||
static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
||||
{
|
||||
LOAD_PACKED_COMMON
|
||||
|
||||
uint8_t *inrow, *outrow, *inrow0, *outrow0;
|
||||
const int in_linesize = in->linesize[0];
|
||||
const int out_linesize = out->linesize[0];
|
||||
inrow0 = in ->data[0];
|
||||
outrow0 = out->data[0];
|
||||
|
||||
for (i = slice_start; i < slice_end; i++) {
|
||||
inrow = inrow0 + i * in_linesize;
|
||||
outrow = outrow0 + i * out_linesize;
|
||||
for (j = 0; j < w; j++) {
|
||||
switch (step) {
|
||||
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
|
||||
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
|
||||
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
|
||||
default: outrow[0] = tab[0][inrow[0]];
|
||||
}
|
||||
outrow += step;
|
||||
inrow += step;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Function-level declarations shared by the planar slice workers.
 *
 * Must be expanded at the top of a function whose parameters are named
 * ctx and arg.  It introduces s, td, i, j, plane, in and out into that scope.
 */
#define LOAD_PLANAR_COMMON                                                     \
    LutContext *s = ctx->priv;                                                 \
    const struct thread_data *td = arg;                                        \
    int i, j, plane;                                                           \
    AVFrame *in  = td->in;                                                     \
    AVFrame *out = td->out;
|
||||
|
||||
/*
 * Per-plane declarations shared by the planar slice workers.
 *
 * Must be expanded at the top of the per-plane loop body, where s, td,
 * plane, jobnr and nb_jobs are in scope.  Planes 1 and 2 use the context's
 * hsub/vsub shifts, every other plane uses 0.  It introduces vsub, hsub,
 * h, w, tab, slice_start and slice_end; rows [slice_start, slice_end) are
 * this job's share of the plane's h rows.
 */
#define PLANAR_COMMON                                                          \
    int vsub = (plane == 1 || plane == 2) ? s->vsub : 0;                       \
    int hsub = (plane == 1 || plane == 2) ? s->hsub : 0;                       \
    int h = AV_CEIL_RSHIFT(td->h, vsub);                                       \
    int w = AV_CEIL_RSHIFT(td->w, hsub);                                       \
    const uint16_t *tab = s->lut[plane];                                       \
    const int slice_start = (h *  jobnr     ) / nb_jobs;                       \
    const int slice_end   = (h * (jobnr + 1)) / nb_jobs;
|
||||
|
||||
/* planar >8 bit depth */
|
||||
static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
||||
{
|
||||
LOAD_PLANAR_COMMON
|
||||
|
||||
uint16_t *inrow, *outrow;
|
||||
|
||||
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
||||
PLANAR_COMMON
|
||||
|
||||
const int in_linesize = in->linesize[plane] / 2;
|
||||
const int out_linesize = out->linesize[plane] / 2;
|
||||
|
||||
inrow = (uint16_t *)(in ->data[plane] + slice_start * in_linesize);
|
||||
outrow = (uint16_t *)(out->data[plane] + slice_start * out_linesize);
|
||||
|
||||
for (i = slice_start; i < slice_end; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
#if HAVE_BIGENDIAN
|
||||
outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
|
||||
#else
|
||||
outrow[j] = tab[inrow[j]];
|
||||
#endif
|
||||
}
|
||||
inrow += in_linesize;
|
||||
outrow += out_linesize;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* planar 8bit depth */
|
||||
static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
|
||||
{
|
||||
LOAD_PLANAR_COMMON
|
||||
|
||||
uint8_t *inrow, *outrow;
|
||||
|
||||
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
||||
PLANAR_COMMON
|
||||
|
||||
const int in_linesize = in->linesize[plane];
|
||||
const int out_linesize = out->linesize[plane];
|
||||
|
||||
inrow = in ->data[plane] + slice_start * in_linesize;
|
||||
outrow = out->data[plane] + slice_start * out_linesize;
|
||||
|
||||
for (i = slice_start; i < slice_end; i++) {
|
||||
for (j = 0; j < w; j++)
|
||||
outrow[j] = tab[inrow[j]];
|
||||
inrow += in_linesize;
|
||||
outrow += out_linesize;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Declares and fills `td` for the packed workers.  Expects in, out and
 * inlink to be in scope at the expansion site; the height is taken from
 * the input frame itself.
 */
#define PACKED_THREAD_DATA                                                     \
    struct thread_data td = {                                                  \
        .in  = in,                                                             \
        .out = out,                                                            \
        .w   = inlink->w,                                                      \
        .h   = in->height,                                                     \
    };
|
||||
|
||||
/*
 * Declares and fills `td` for the planar workers.  Expects in, out and
 * inlink to be in scope at the expansion site; both dimensions are taken
 * from the input link.
 */
#define PLANAR_THREAD_DATA                                                     \
    struct thread_data td = {                                                  \
        .in  = in,                                                             \
        .out = out,                                                            \
        .w   = inlink->w,                                                      \
        .h   = inlink->h,                                                      \
    };
|
||||
|
||||
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
|
||||
{
|
||||
AVFilterContext *ctx = inlink->dst;
|
||||
LutContext *s = ctx->priv;
|
||||
AVFilterLink *outlink = ctx->outputs[0];
|
||||
AVFrame *out;
|
||||
int i, j, plane, direct = 0;
|
||||
int direct = 0;
|
||||
|
||||
if (av_frame_is_writable(in)) {
|
||||
direct = 1;
|
||||
@ -359,121 +540,24 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
|
||||
|
||||
if (s->is_rgb && s->is_16bit && !s->is_planar) {
|
||||
/* packed, 16-bit */
|
||||
uint16_t *inrow, *outrow, *inrow0, *outrow0;
|
||||
const int w = inlink->w;
|
||||
const int h = in->height;
|
||||
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
|
||||
const int in_linesize = in->linesize[0] / 2;
|
||||
const int out_linesize = out->linesize[0] / 2;
|
||||
const int step = s->step;
|
||||
|
||||
inrow0 = (uint16_t*) in ->data[0];
|
||||
outrow0 = (uint16_t*) out->data[0];
|
||||
|
||||
for (i = 0; i < h; i ++) {
|
||||
inrow = inrow0;
|
||||
outrow = outrow0;
|
||||
for (j = 0; j < w; j++) {
|
||||
|
||||
switch (step) {
|
||||
#if HAVE_BIGENDIAN
|
||||
case 4: outrow[3] = av_bswap16(tab[3][av_bswap16(inrow[3])]); // Fall-through
|
||||
case 3: outrow[2] = av_bswap16(tab[2][av_bswap16(inrow[2])]); // Fall-through
|
||||
case 2: outrow[1] = av_bswap16(tab[1][av_bswap16(inrow[1])]); // Fall-through
|
||||
default: outrow[0] = av_bswap16(tab[0][av_bswap16(inrow[0])]);
|
||||
#else
|
||||
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
|
||||
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
|
||||
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
|
||||
default: outrow[0] = tab[0][inrow[0]];
|
||||
#endif
|
||||
}
|
||||
outrow += step;
|
||||
inrow += step;
|
||||
}
|
||||
inrow0 += in_linesize;
|
||||
outrow0 += out_linesize;
|
||||
}
|
||||
PACKED_THREAD_DATA
|
||||
ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
|
||||
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
|
||||
} else if (s->is_rgb && !s->is_planar) {
|
||||
/* packed */
|
||||
uint8_t *inrow, *outrow, *inrow0, *outrow0;
|
||||
const int w = inlink->w;
|
||||
const int h = in->height;
|
||||
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
|
||||
const int in_linesize = in->linesize[0];
|
||||
const int out_linesize = out->linesize[0];
|
||||
const int step = s->step;
|
||||
|
||||
inrow0 = in ->data[0];
|
||||
outrow0 = out->data[0];
|
||||
|
||||
for (i = 0; i < h; i ++) {
|
||||
inrow = inrow0;
|
||||
outrow = outrow0;
|
||||
for (j = 0; j < w; j++) {
|
||||
switch (step) {
|
||||
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
|
||||
case 3: outrow[2] = tab[2][inrow[2]]; // Fall-through
|
||||
case 2: outrow[1] = tab[1][inrow[1]]; // Fall-through
|
||||
default: outrow[0] = tab[0][inrow[0]];
|
||||
}
|
||||
outrow += step;
|
||||
inrow += step;
|
||||
}
|
||||
inrow0 += in_linesize;
|
||||
outrow0 += out_linesize;
|
||||
}
|
||||
/* packed 8 bits */
|
||||
PACKED_THREAD_DATA
|
||||
ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
|
||||
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
|
||||
} else if (s->is_16bit) {
|
||||
// planar >8 bit depth
|
||||
uint16_t *inrow, *outrow;
|
||||
|
||||
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
||||
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
|
||||
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
|
||||
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
|
||||
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
|
||||
const uint16_t *tab = s->lut[plane];
|
||||
const int in_linesize = in->linesize[plane] / 2;
|
||||
const int out_linesize = out->linesize[plane] / 2;
|
||||
|
||||
inrow = (uint16_t *)in ->data[plane];
|
||||
outrow = (uint16_t *)out->data[plane];
|
||||
|
||||
for (i = 0; i < h; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
#if HAVE_BIGENDIAN
|
||||
outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
|
||||
#else
|
||||
outrow[j] = tab[inrow[j]];
|
||||
#endif
|
||||
}
|
||||
inrow += in_linesize;
|
||||
outrow += out_linesize;
|
||||
}
|
||||
}
|
||||
/* planar >8 bit depth */
|
||||
PLANAR_THREAD_DATA
|
||||
ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
|
||||
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
|
||||
} else {
|
||||
/* planar 8bit depth */
|
||||
uint8_t *inrow, *outrow;
|
||||
|
||||
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
||||
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
|
||||
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
|
||||
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
|
||||
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
|
||||
const uint16_t *tab = s->lut[plane];
|
||||
const int in_linesize = in->linesize[plane];
|
||||
const int out_linesize = out->linesize[plane];
|
||||
|
||||
inrow = in ->data[plane];
|
||||
outrow = out->data[plane];
|
||||
|
||||
for (i = 0; i < h; i++) {
|
||||
for (j = 0; j < w; j++)
|
||||
outrow[j] = tab[inrow[j]];
|
||||
inrow += in_linesize;
|
||||
outrow += out_linesize;
|
||||
}
|
||||
}
|
||||
PLANAR_THREAD_DATA
|
||||
ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
|
||||
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
|
||||
}
|
||||
|
||||
if (!direct)
|
||||
@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
|
||||
.query_formats = query_formats, \
|
||||
.inputs = inputs, \
|
||||
.outputs = outputs, \
|
||||
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, \
|
||||
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, \
|
||||
}
|
||||
|
||||
#if CONFIG_LUT_FILTER
|
||||
|
Loading…
Reference in New Issue
Block a user