diracdec: rewrite golomb reader

This version is able to output multiple coefficients at a time and
removes actual Golomb code parsing altogether.
It's also able to partially recover the last coefficient in case
the packet is incomplete.

Total decoder performance gain for 8bit 420 1080p lossless: 40%.
Total decoder performance gain for 10bit 420 1080p lossless: 40%.

clang was able to vectorize the loop much better than
my handwritten assembly, but gcc was very naive and didn't.

Lookup table is a rewritten version of vc2hqdecode.
This commit is contained in:
Lynne 2020-03-01 11:23:53 +00:00
parent d778be6e4a
commit 675bb1f4f9
3 changed files with 1102 additions and 249 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,4 @@
/*
* Copyright (C) 2016 Open Broadcast Systems Ltd.
* Author 2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
@ -24,28 +21,9 @@
#include "libavutil/avutil.h"
/* Can be 32 bits wide for some performance gain on some machines, but it will
* incorrectly decode very long coefficients (usually only 1 or 2 per frame) */
typedef uint64_t residual;
#define LUT_BITS 8
/*
 * Lookup-table record used by the Golomb reader functions declared below
 * (they receive it as lut_ctx).
 *
 * Exactly 64 bytes: with residual being uint64_t and LUT_BITS being 8, the
 * fields add up to 2 * 8 + 8 * 4 + 3 * 4 + 2 * 1 = 62 bytes, and tail padding
 * rounds that up to 64 on common ABIs.
 *
 * NOTE(review): the code that fills and consumes these fields is not visible
 * here; the per-field notes below are inferred from names only - confirm
 * against the reader implementation.
 */
typedef struct DiracGolombLUT {
/* presumably partial-code bits at the start / end of the indexed chunk */
residual preamble, leftover;
/* presumably fully decoded coefficients; capacity is LUT_BITS entries */
int32_t ready[LUT_BITS];
/* presumably bit counts for preamble / leftover and the used length of ready[] */
int32_t preamble_bits, leftover_bits, ready_num;
/* presumably sign-handling state for a coefficient split across chunks */
int8_t need_s, sign;
} DiracGolombLUT;
/*
 * Golomb reader API.
 *
 * Two variants of the read functions are declared in this span: one taking a
 * DiracGolombLUT context created by ff_dirac_golomb_reader_init() and released
 * by ff_dirac_golomb_reader_end(), and one taking no context at all.
 *
 * For the read functions:
 *   buf, bytes  - input buffer and a byte count (presumably its length)
 *   dst / _dst  - output buffer, passed as uint8_t * for both the 16-bit and
 *                 32-bit variants
 *   coeffs      - presumably the number of coefficients requested
 *   return      - the caller in decode_hq_slice() compares it against the
 *                 requested count and handles the case where it is smaller,
 *                 so presumably the number of coefficients actually read
 *
 * NOTE(review): the return-value convention of ff_dirac_golomb_reader_init()
 * is not visible here - confirm before relying on it.
 */
av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx);
int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
int bytes, uint8_t *dst, int coeffs);
int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
int bytes, uint8_t *_dst, int coeffs);
av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx);
/* Context-free variants: same buffer/count parameters, no lut_ctx argument. */
int ff_dirac_golomb_read_16bit(const uint8_t *buf, int bytes,
uint8_t *_dst, int coeffs);
int ff_dirac_golomb_read_32bit(const uint8_t *buf, int bytes,
uint8_t *_dst, int coeffs);
#endif /* AVCODEC_DIRAC_VLC_H */

View File

@ -136,7 +136,6 @@ typedef struct DiracContext {
MpegvideoEncDSPContext mpvencdsp;
VideoDSPContext vdsp;
DiracDSPContext diracdsp;
DiracGolombLUT *reader_ctx;
DiracVersionInfo version;
GetBitContext gb;
AVDiracSeqHeader seq;
@ -395,7 +394,6 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
s->threads_num_buf = -1;
s->thread_buf_size = -1;
ff_dirac_golomb_reader_init(&s->reader_ctx);
ff_diracdsp_init(&s->diracdsp);
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
ff_videodsp_init(&s->vdsp, 8);
@ -428,8 +426,6 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
DiracContext *s = avctx->priv_data;
int i;
ff_dirac_golomb_reader_end(&s->reader_ctx);
dirac_decode_flush(avctx);
for (i = 0; i < MAX_FRAMES; i++)
av_frame_free(&s->all_frames[i].avframe);
@ -881,11 +877,11 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
if (s->pshift)
coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr,
length, tmp_buf, coef_num);
coef_par = ff_dirac_golomb_read_32bit(addr, length,
tmp_buf, coef_num);
else
coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr,
length, tmp_buf, coef_num);
coef_par = ff_dirac_golomb_read_16bit(addr, length,
tmp_buf, coef_num);
if (coef_num > coef_par) {
const int start_b = coef_par * (1 << (s->pshift + 1));