diracdec: rewrite golomb reader

This version is able to output multiple coefficients at a time and is able to altogether remove actual Golomb code parsing. It's also able to partially recover the last coefficient in case the packet is incomplete. Total decoder performance gain for 8-bit 4:2:0 1080p lossless: 40%. Total decoder performance gain for 10-bit 4:2:0 1080p lossless: 40%. Clang was able to vectorize the loop much better than my handwritten assembly, but GCC was very naive and didn't. The lookup table is a rewritten version of vc2hqdecode.
2020-03-01 11:23:53 +00:00 · 2020-03-01 11:23:53 +00:00 · 675bb1f4f9
parent d778be6e4a
commit 675bb1f4f9
3 changed files with 1102 additions and 249 deletions
--- a/libavcodec/dirac_vlc.c
+++ b/libavcodec/dirac_vlc.c
--- a/libavcodec/dirac_vlc.h
+++ b/libavcodec/dirac_vlc.h
@ -1,7 +1,4 @@
 /*
- * Copyright (C) 2016 Open Broadcast Systems Ltd.
- * Author        2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
- *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
@ -24,28 +21,9 @@

 #include "libavutil/avutil.h"

-/* Can be 32 bits wide for some performance gain on some machines, but it will
- * incorrectly decode very long coefficients (usually only 1 or 2 per frame) */
-typedef uint64_t residual;
-
-#define LUT_BITS 8
-
-/* Exactly 64 bytes */
-typedef struct DiracGolombLUT {
-    residual preamble, leftover;
-    int32_t  ready[LUT_BITS];
-    int32_t  preamble_bits, leftover_bits, ready_num;
-    int8_t   need_s, sign;
-} DiracGolombLUT;
-
-av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx);
-
-int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
-                               int bytes, uint8_t *dst, int coeffs);
-
-int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
-                               int bytes, uint8_t *_dst, int coeffs);
-
-av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx);
+int ff_dirac_golomb_read_16bit(const uint8_t *buf, int bytes,
+                               uint8_t *_dst, int coeffs);
+int ff_dirac_golomb_read_32bit(const uint8_t *buf, int bytes,
+                               uint8_t *_dst, int coeffs);

 #endif /* AVCODEC_DIRAC_VLC_H */
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@ -136,7 +136,6 @@ typedef struct DiracContext {
    MpegvideoEncDSPContext mpvencdsp;
    VideoDSPContext vdsp;
    DiracDSPContext diracdsp;
-    DiracGolombLUT *reader_ctx;
    DiracVersionInfo version;
    GetBitContext gb;
    AVDiracSeqHeader seq;
@ -395,7 +394,6 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
    s->threads_num_buf = -1;
    s->thread_buf_size = -1;

-    ff_dirac_golomb_reader_init(&s->reader_ctx);
    ff_diracdsp_init(&s->diracdsp);
    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
    ff_videodsp_init(&s->vdsp, 8);
@ -428,8 +426,6 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
    DiracContext *s = avctx->priv_data;
    int i;

-    ff_dirac_golomb_reader_end(&s->reader_ctx);
-
    dirac_decode_flush(avctx);
    for (i = 0; i < MAX_FRAMES; i++)
        av_frame_free(&s->all_frames[i].avframe);
@ -881,11 +877,11 @@ static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
        coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);

        if (s->pshift)
-            coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr,
-                                                  length, tmp_buf, coef_num);
+            coef_par = ff_dirac_golomb_read_32bit(addr, length,
+                                                  tmp_buf, coef_num);
        else
-            coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr,
-                                                  length, tmp_buf, coef_num);
+            coef_par = ff_dirac_golomb_read_16bit(addr, length,
+                                                  tmp_buf, coef_num);

        if (coef_num > coef_par) {
            const int start_b = coef_par * (1 << (s->pshift + 1));