avfilter/vf_nnedi: simplify code even more

This commit is contained in:
Paul B Mahol 2021-01-19 17:06:05 +01:00
parent 553eb07737
commit c737f6edce

View File

@ -37,8 +37,8 @@ static const uint8_t NNEDI_XDIM[] = { 8, 16, 32, 48, 8, 16, 32 };
static const uint8_t NNEDI_YDIM[] = { 6, 6, 6, 6, 4, 4, 4 }; static const uint8_t NNEDI_YDIM[] = { 6, 6, 6, 6, 4, 4, 4 };
static const uint16_t NNEDI_NNS[] = { 16, 32, 64, 128, 256 }; static const uint16_t NNEDI_NNS[] = { 16, 32, 64, 128, 256 };
typedef struct PrescreenerOldCoefficients { typedef struct PrescreenerCoefficients {
DECLARE_ALIGNED(32, float, kernel_l0)[4][14 * 4]; DECLARE_ALIGNED(32, float, kernel_l0)[4][16 * 4];
DECLARE_ALIGNED(32, float, bias_l0)[4]; DECLARE_ALIGNED(32, float, bias_l0)[4];
DECLARE_ALIGNED(32, float, kernel_l1)[4][4]; DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
@ -46,15 +46,7 @@ typedef struct PrescreenerOldCoefficients {
DECLARE_ALIGNED(32, float, kernel_l2)[4][8]; DECLARE_ALIGNED(32, float, kernel_l2)[4][8];
DECLARE_ALIGNED(32, float, bias_l2)[4]; DECLARE_ALIGNED(32, float, bias_l2)[4];
} PrescreenerOldCoefficients; } PrescreenerCoefficients;
typedef struct PrescreenerNewCoefficients {
DECLARE_ALIGNED(32, float, kernel_l0)[4][16 * 4];
DECLARE_ALIGNED(32, float, bias_l0)[4];
DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
DECLARE_ALIGNED(32, float, bias_l1)[4];
} PrescreenerNewCoefficients;
typedef struct PredictorCoefficients { typedef struct PredictorCoefficients {
int xdim, ydim, nns, nsize; int xdim, ydim, nns, nsize;
@ -89,8 +81,7 @@ typedef struct NNEDIContext {
int planeheight[4]; int planeheight[4];
int field_n; int field_n;
PrescreenerOldCoefficients prescreener_old; PrescreenerCoefficients prescreener[4];
PrescreenerNewCoefficients prescreener_new[3];
PredictorCoefficients coeffs[2][5][7]; PredictorCoefficients coeffs[2][5][7];
float half; float half;
@ -108,9 +99,9 @@ typedef struct NNEDIContext {
int pscrn; int pscrn;
int input_size; int input_size;
uint8_t *prescreen_buf; uint8_t **prescreen_buf;
float *input_buf; float **input_buf;
float *output_buf; float **output_buf;
void (*read)(const uint8_t *src, float *dst, void (*read)(const uint8_t *src, float *dst,
int src_stride, int dst_stride, int src_stride, int dst_stride,
@ -120,7 +111,8 @@ typedef struct NNEDIContext {
int width, int height, int depth, float scale); int width, int height, int depth, float scale);
void (*prescreen[2])(AVFilterContext *ctx, void (*prescreen[2])(AVFilterContext *ctx,
const void *src, ptrdiff_t src_stride, const void *src, ptrdiff_t src_stride,
uint8_t *prescreen, int N, void *data); uint8_t *prescreen, int N,
const PrescreenerCoefficients *const coeffs);
} NNEDIContext; } NNEDIContext;
#define OFFSET(x) offsetof(NNEDIContext, x) #define OFFSET(x) offsetof(NNEDIContext, x)
@ -222,7 +214,7 @@ static int query_formats(AVFilterContext *ctx)
return ff_set_common_formats(ctx, fmts_list); return ff_set_common_formats(ctx, fmts_list);
} }
static float dot_dsp(NNEDIContext *s, const float *kernel, const float *input, static float dot_dsp(const NNEDIContext *const s, const float *kernel, const float *input,
int n, float scale, float bias) int n, float scale, float bias)
{ {
float sum; float sum;
@ -246,10 +238,9 @@ static void transform_elliott(float *input, int size)
static void process_old(AVFilterContext *ctx, static void process_old(AVFilterContext *ctx,
const void *src, ptrdiff_t src_stride, const void *src, ptrdiff_t src_stride,
uint8_t *prescreen, int N, uint8_t *prescreen, int N,
void *data) const PrescreenerCoefficients *const m_data)
{ {
NNEDIContext *s = ctx->priv; NNEDIContext *s = ctx->priv;
const PrescreenerOldCoefficients *const m_data = data;
const float *src_p = src; const float *src_p = src;
// Adjust source pointer to point to top-left of filter window. // Adjust source pointer to point to top-left of filter window.
@ -283,10 +274,9 @@ static void process_old(AVFilterContext *ctx,
static void process_new(AVFilterContext *ctx, static void process_new(AVFilterContext *ctx,
const void *src, ptrdiff_t src_stride, const void *src, ptrdiff_t src_stride,
uint8_t *prescreen, int N, uint8_t *prescreen, int N,
void *data) const PrescreenerCoefficients *const m_data)
{ {
NNEDIContext *s = ctx->priv; NNEDIContext *s = ctx->priv;
const PrescreenerNewCoefficients *const m_data = data;
const float *src_p = src; const float *src_p = src;
// Adjust source pointer to point to top-left of filter window. // Adjust source pointer to point to top-left of filter window.
@ -344,8 +334,8 @@ static void gather_input(const float *src, ptrdiff_t src_stride,
float *buf, float mstd[4], float *buf, float mstd[4],
const PredictorCoefficients *const model) const PredictorCoefficients *const model)
{ {
float sum = 0; float sum = 0.f;
float sum_sq = 0; float sum_sq = 0.f;
float tmp; float tmp;
for (int i = 0; i < model->ydim; i++) { for (int i = 0; i < model->ydim; i++) {
@ -405,17 +395,16 @@ static void wae5(const float *softmax, const float *el,
static void predictor(AVFilterContext *ctx, static void predictor(AVFilterContext *ctx,
const void *src, ptrdiff_t src_stride, void *dst, const void *src, ptrdiff_t src_stride, void *dst,
const uint8_t *prescreen, int N, const uint8_t *prescreen, int N,
void *data, int use_q2) const PredictorCoefficients *const model, int use_q2)
{ {
NNEDIContext *s = ctx->priv; const NNEDIContext *const s = ctx->priv;
const PredictorCoefficients *const model = data;
const float *src_p = src; const float *src_p = src;
float *dst_p = dst; float *dst_p = dst;
// Adjust source pointer to point to top-left of filter window. // Adjust source pointer to point to top-left of filter window.
const float *window = src_p - (model->ydim / 2) * src_stride - (model->xdim / 2 - 1); const float *window = src_p - (model->ydim / 2) * src_stride - (model->xdim / 2 - 1);
int filter_size = model->nsize; const int filter_size = model->nsize;
int nns = model->nns; const int nns = model->nns;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
LOCAL_ALIGNED_32(float, input, [48 * 6]); LOCAL_ALIGNED_32(float, input, [48 * 6]);
@ -433,7 +422,7 @@ static void predictor(AVFilterContext *ctx,
activation[nn] = dot_dsp(s, softmax_q1_filter(nn, model), input, filter_size, scale, model->softmax_bias_q1[nn]); activation[nn] = dot_dsp(s, softmax_q1_filter(nn, model), input, filter_size, scale, model->softmax_bias_q1[nn]);
for (int nn = 0; nn < nns; nn++) for (int nn = 0; nn < nns; nn++)
activation[model->nns + nn] = dot_dsp(s, elliott_q1_filter(nn, model), input, filter_size, scale, model->elliott_bias_q1[nn]); activation[nns + nn] = dot_dsp(s, elliott_q1_filter(nn, model), input, filter_size, scale, model->elliott_bias_q1[nn]);
transform_softmax_exp(activation, nns); transform_softmax_exp(activation, nns);
wae5(activation, activation + nns, nns, mstd); wae5(activation, activation + nns, nns, mstd);
@ -449,7 +438,7 @@ static void predictor(AVFilterContext *ctx,
wae5(activation, activation + nns, nns, mstd); wae5(activation, activation + nns, nns, mstd);
} }
dst_p[i] = mstd[3] / (use_q2 ? 2 : 1); dst_p[i] = mstd[3] * (use_q2 ? 0.5f : 1.f);
} }
} }
@ -551,7 +540,7 @@ static void interpolation(const void *src, ptrdiff_t src_stride,
static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{ {
NNEDIContext *s = ctx->priv; const NNEDIContext *const s = ctx->priv;
AVFrame *out = s->dst; AVFrame *out = s->dst;
AVFrame *in = s->src; AVFrame *in = s->src;
const float in_scale = s->in_scale; const float in_scale = s->in_scale;
@ -572,10 +561,10 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
uint8_t *dst = out->data[p] + slice_start * out->linesize[p]; uint8_t *dst = out->data[p] + slice_start * out->linesize[p];
const int src_linesize = in->linesize[p]; const int src_linesize = in->linesize[p];
const int dst_linesize = out->linesize[p]; const int dst_linesize = out->linesize[p];
uint8_t *prescreen_buf = s->prescreen_buf + s->planewidth[0] * jobnr; uint8_t *prescreen_buf = s->prescreen_buf[jobnr];
float *srcbuf = s->input_buf + s->input_size * jobnr; float *srcbuf = s->input_buf[jobnr];
const int srcbuf_stride = width + 64; const int srcbuf_stride = width + 64;
float *dstbuf = s->output_buf + s->input_size * jobnr; float *dstbuf = s->output_buf[jobnr];
const int dstbuf_stride = width; const int dstbuf_stride = width;
const int slice_height = (slice_end - slice_start) / 2; const int slice_height = (slice_end - slice_start) / 2;
const int last_slice = slice_end == height; const int last_slice = slice_end == height;
@ -646,15 +635,10 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
width, 1, in_scale); width, 1, in_scale);
for (int y = 0; y < slice_end - slice_start; y += 2) { for (int y = 0; y < slice_end - slice_start; y += 2) {
if (s->pscrn > 1) { if (s->prescreen > 0)
s->prescreen[1](ctx, srcbuf + (y / 2) * srcbuf_stride + 32, s->prescreen[s->pscrn > 1](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
srcbuf_stride, prescreen_buf, width, srcbuf_stride, prescreen_buf, width,
&s->prescreener_new[s->pscrn - 2]); &s->prescreener[s->pscrn - 1]);
} else if (s->pscrn == 1) {
s->prescreen[0](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
srcbuf_stride, prescreen_buf, width,
&s->prescreener_old);
}
predictor(ctx, predictor(ctx,
srcbuf + (y / 2) * srcbuf_stride + 32, srcbuf + (y / 2) * srcbuf_stride + 32,
@ -831,7 +815,7 @@ static int allocate_model(PredictorCoefficients *coeffs, int xdim, int ydim, int
int bias_size = nns; int bias_size = nns;
float *data; float *data;
data = av_malloc_array(filter_size + bias_size, 4 * sizeof(float)); data = av_calloc(filter_size + bias_size, 4 * sizeof(float));
if (!data) if (!data)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
@ -859,17 +843,17 @@ static int read_weights(AVFilterContext *ctx, const float *bdata)
NNEDIContext *s = ctx->priv; NNEDIContext *s = ctx->priv;
int ret; int ret;
copy_weights(&s->prescreener_old.kernel_l0[0][0], 4 * 48, &bdata); copy_weights(&s->prescreener[0].kernel_l0[0][0], 4 * 48, &bdata);
copy_weights(s->prescreener_old.bias_l0, 4, &bdata); copy_weights(s->prescreener[0].bias_l0, 4, &bdata);
copy_weights(&s->prescreener_old.kernel_l1[0][0], 4 * 4, &bdata); copy_weights(&s->prescreener[0].kernel_l1[0][0], 4 * 4, &bdata);
copy_weights(s->prescreener_old.bias_l1, 4, &bdata); copy_weights(s->prescreener[0].bias_l1, 4, &bdata);
copy_weights(&s->prescreener_old.kernel_l2[0][0], 4 * 8, &bdata); copy_weights(&s->prescreener[0].kernel_l2[0][0], 4 * 8, &bdata);
copy_weights(s->prescreener_old.bias_l2, 4, &bdata); copy_weights(s->prescreener[0].bias_l2, 4, &bdata);
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
PrescreenerNewCoefficients *data = &s->prescreener_new[i]; PrescreenerCoefficients *data = &s->prescreener[i + 1];
float kernel_l0_shuffled[4 * 64]; float kernel_l0_shuffled[4 * 64];
float kernel_l1_shuffled[4 * 4]; float kernel_l1_shuffled[4 * 4];
@ -890,14 +874,14 @@ static int read_weights(AVFilterContext *ctx, const float *bdata)
for (int m = 0; m < 2; m++) { for (int m = 0; m < 2; m++) {
// Grouping by neuron count. // Grouping by neuron count.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
int nns = NNEDI_NNS[i]; const int nns = NNEDI_NNS[i];
// Grouping by window size. // Grouping by window size.
for (int j = 0; j < 7; j++) { for (int j = 0; j < 7; j++) {
PredictorCoefficients *model = &s->coeffs[m][i][j]; PredictorCoefficients *model = &s->coeffs[m][i][j];
int xdim = NNEDI_XDIM[j]; const int xdim = NNEDI_XDIM[j];
int ydim = NNEDI_YDIM[j]; const int ydim = NNEDI_YDIM[j];
int filter_size = xdim * ydim; const int filter_size = xdim * ydim;
ret = allocate_model(model, xdim, ydim, nns); ret = allocate_model(model, xdim, ydim, nns);
if (ret < 0) if (ret < 0)
@ -927,7 +911,7 @@ static int read_weights(AVFilterContext *ctx, const float *bdata)
static float mean(const float *input, int size) static float mean(const float *input, int size)
{ {
float sum = 0.; float sum = 0.f;
for (int i = 0; i < size; i++) for (int i = 0; i < size; i++)
sum += input[i]; sum += input[i];
@ -941,7 +925,7 @@ static void transform(float *input, int size, float mean, float half)
input[i] = (input[i] - mean) / half; input[i] = (input[i] - mean) / half;
} }
static void subtract_mean_old(PrescreenerOldCoefficients *coeffs, float half) static void subtract_mean_old(PrescreenerCoefficients *coeffs, float half)
{ {
for (int n = 0; n < 4; n++) { for (int n = 0; n < 4; n++) {
float m = mean(coeffs->kernel_l0[n], 48); float m = mean(coeffs->kernel_l0[n], 48);
@ -950,7 +934,7 @@ static void subtract_mean_old(PrescreenerOldCoefficients *coeffs, float half)
} }
} }
static void subtract_mean_new(PrescreenerNewCoefficients *coeffs, float half) static void subtract_mean_new(PrescreenerCoefficients *coeffs, float half)
{ {
for (int n = 0; n < 4; n++) { for (int n = 0; n < 4; n++) {
float m = mean(coeffs->kernel_l0[n], 64); float m = mean(coeffs->kernel_l0[n], 64);
@ -1121,10 +1105,10 @@ static int config_input(AVFilterLink *inlink)
break; break;
} }
subtract_mean_old(&s->prescreener_old, s->half); subtract_mean_old(&s->prescreener[0], s->half);
subtract_mean_new(&s->prescreener_new[0], s->half); subtract_mean_new(&s->prescreener[1], s->half);
subtract_mean_new(&s->prescreener_new[1], s->half); subtract_mean_new(&s->prescreener[2], s->half);
subtract_mean_new(&s->prescreener_new[2], s->half); subtract_mean_new(&s->prescreener[3], s->half);
s->prescreen[0] = process_old; s->prescreen[0] = process_old;
s->prescreen[1] = process_new; s->prescreen[1] = process_new;
@ -1136,19 +1120,37 @@ static int config_input(AVFilterLink *inlink)
} }
} }
s->prescreen_buf = av_calloc(s->nb_threads * s->planewidth[0], sizeof(*s->prescreen_buf));
if (!s->prescreen_buf)
return AVERROR(ENOMEM);
s->input_size = (s->planewidth[0] + 64) * (s->planeheight[0] + 6); s->input_size = (s->planewidth[0] + 64) * (s->planeheight[0] + 6);
s->input_buf = av_calloc(s->nb_threads * s->input_size, sizeof(*s->input_buf)); s->input_buf = av_calloc(s->nb_threads, sizeof(*s->input_buf));
if (!s->input_buf) if (!s->input_buf)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
s->output_buf = av_calloc(s->nb_threads * s->input_size, sizeof(*s->output_buf)); for (int i = 0; i < s->nb_threads; i++) {
s->input_buf[i] = av_calloc(s->input_size, sizeof(**s->input_buf));
if (!s->input_buf[i])
return AVERROR(ENOMEM);
}
s->output_buf = av_calloc(s->nb_threads, sizeof(*s->output_buf));
if (!s->output_buf) if (!s->output_buf)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
for (int i = 0; i < s->nb_threads; i++) {
s->output_buf[i] = av_calloc(s->input_size, sizeof(**s->output_buf));
if (!s->output_buf[i])
return AVERROR(ENOMEM);
}
s->prescreen_buf = av_calloc(s->nb_threads, sizeof(*s->prescreen_buf));
if (!s->prescreen_buf)
return AVERROR(ENOMEM);
for (int i = 0; i < s->nb_threads; i++) {
s->prescreen_buf[i] = av_calloc(s->planewidth[0], sizeof(**s->prescreen_buf));
if (!s->prescreen_buf[i])
return AVERROR(ENOMEM);
}
return 0; return 0;
} }
@ -1156,8 +1158,19 @@ static av_cold void uninit(AVFilterContext *ctx)
{ {
NNEDIContext *s = ctx->priv; NNEDIContext *s = ctx->priv;
for (int i = 0; i < s->nb_threads && s->prescreen_buf; i++)
av_freep(&s->prescreen_buf[i]);
av_freep(&s->prescreen_buf); av_freep(&s->prescreen_buf);
for (int i = 0; i < s->nb_threads && s->input_buf; i++)
av_freep(&s->input_buf[i]);
av_freep(&s->input_buf); av_freep(&s->input_buf);
for (int i = 0; i < s->nb_threads && s->output_buf; i++)
av_freep(&s->output_buf[i]);
av_freep(&s->output_buf); av_freep(&s->output_buf);
av_freep(&s->fdsp); av_freep(&s->fdsp);