From 08a7510fcad552bfcda644e4ba047605de15fab5 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet Date: Fri, 25 Sep 2015 08:56:43 +0200 Subject: [PATCH] dnxhddec: implement slice multithreading Around 3x speedup with 4 threads. Maybe more mb lines should be batched per thread, but that's good enough for a first try. Signed-off-by: Michael Niedermayer --- libavcodec/dnxhddec.c | 219 +++++++++++++++++++++++++----------------- 1 file changed, 131 insertions(+), 88 deletions(-) diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c index 22d8c0b785..fa9340721c 100644 --- a/libavcodec/dnxhddec.c +++ b/libavcodec/dnxhddec.c @@ -32,10 +32,21 @@ #include "internal.h" #include "thread.h" +typedef struct RowContext { + DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; + int luma_scale[64]; + int chroma_scale[64]; + GetBitContext gb; + int last_dc[3]; + int last_qscale; +} RowContext; + typedef struct DNXHDContext { AVCodecContext *avctx; - GetBitContext gb; + RowContext *rows; BlockDSPContext bdsp; + const uint8_t* buf; + int buf_size; int64_t cid; ///< compression id unsigned int width, height; enum AVPixelFormat pix_fmt; @@ -43,30 +54,29 @@ typedef struct DNXHDContext { uint32_t mb_scan_index[68]; /* max for 1080p */ int cur_field; ///< current interlaced field VLC ac_vlc, dc_vlc, run_vlc; - int last_dc[3]; IDCTDSPContext idsp; - DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; ScanTable scantable; const CIDEntry *cid_table; int bit_depth; // 8, 10 or 0 if not initialized at all. int is_444; int mbaff; int act; - void (*decode_dct_block)(struct DNXHDContext *ctx, int16_t *block, + void (*decode_dct_block)(const struct DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale); - int last_qscale; - int luma_scale[64]; - int chroma_scale[64]; } DNXHDContext; #define DNXHD_VLC_BITS 9 #define DNXHD_DC_VLC_BITS 7 -static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_8(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale); -static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_10(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale); -static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale); static av_cold int dnxhd_decode_init(AVCodecContext *avctx) @@ -76,6 +86,11 @@ static av_cold int dnxhd_decode_init(AVCodecContext *avctx) ctx->avctx = avctx; ctx->cid = -1; avctx->colorspace = AVCOL_SPC_BT709; + + ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext)); + if (!ctx->rows) + return AVERROR(ENOMEM); + return 0; } @@ -116,6 +131,20 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, uint32_t cid) return 0; } +static av_cold int dnxhd_decode_init_thread_copy(AVCodecContext *avctx) +{ + DNXHDContext *ctx = avctx->priv_data; + + // make sure VLC tables will be loaded when cid is parsed + ctx->cid = -1; + + ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext)); + if (!ctx->rows) + return AVERROR(ENOMEM); + + return 0; +} + static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, const uint8_t *buf, int buf_size, int first_field) @@ -240,7 +269,8 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, return 0; } -static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, +static av_always_inline void dnxhd_decode_dct_block(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale, int index_bits, @@ -254,61 +284,61 @@ static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, const uint8_t *ac_level = ctx->cid_table->ac_level; const uint8_t *ac_flags = ctx->cid_table->ac_flags; const int eob_index = ctx->cid_table->eob_index; - OPEN_READER(bs, &ctx->gb); + OPEN_READER(bs, &row->gb); if (!ctx->is_444) { if (n & 2) { component = 1 + (n & 1); - scale = ctx->chroma_scale; + scale = row->chroma_scale; weight_matrix = ctx->cid_table->chroma_weight; } else { component = 0; - scale = ctx->luma_scale; + scale = row->luma_scale; weight_matrix = ctx->cid_table->luma_weight; } } else { component = (n >> 1) % 3; if (component) { - scale = ctx->chroma_scale; + scale = row->chroma_scale; weight_matrix = ctx->cid_table->chroma_weight; } else { - scale = ctx->luma_scale; + scale = row->luma_scale; weight_matrix = ctx->cid_table->luma_weight; } } - UPDATE_CACHE(bs, &ctx->gb); - GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1); + UPDATE_CACHE(bs, &row->gb); + GET_VLC(len, bs, &row->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1); if (len) { - level = GET_CACHE(bs, &ctx->gb); - LAST_SKIP_BITS(bs, &ctx->gb, len); + level = GET_CACHE(bs, &row->gb); + LAST_SKIP_BITS(bs, &row->gb, len); sign = ~level >> 31; level = (NEG_USR32(sign ^ level, len) ^ sign) - sign; - ctx->last_dc[component] += level; + row->last_dc[component] += level; } - block[0] = ctx->last_dc[component]; + block[0] = row->last_dc[component]; i = 0; - UPDATE_CACHE(bs, &ctx->gb); - GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table, + UPDATE_CACHE(bs, &row->gb); + GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2); while (index1 != eob_index) { level = ac_level[index1]; flags = ac_flags[index1]; - sign = SHOW_SBITS(bs, &ctx->gb, 1); - SKIP_BITS(bs, &ctx->gb, 1); + sign = SHOW_SBITS(bs, &row->gb, 1); + SKIP_BITS(bs, &row->gb, 1); if (flags & 1) { - level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 7; - SKIP_BITS(bs, &ctx->gb, index_bits); + level += SHOW_UBITS(bs, &row->gb, index_bits) << 7; + SKIP_BITS(bs, &row->gb, index_bits); } if (flags & 2) { - UPDATE_CACHE(bs, &ctx->gb); - GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table, + UPDATE_CACHE(bs, &row->gb); + GET_VLC(index2, bs, &row->gb, ctx->run_vlc.table, DNXHD_VLC_BITS, 2); i += ctx->cid_table->run[index2]; } @@ -326,34 +356,37 @@ static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, block[j] = (level ^ sign) - sign; - UPDATE_CACHE(bs, &ctx->gb); - GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table, + UPDATE_CACHE(bs, &row->gb); + GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2); } - CLOSE_READER(bs, &ctx->gb); + CLOSE_READER(bs, &row->gb); } -static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_8(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale) { - dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6); + dnxhd_decode_dct_block(ctx, row, block, n, qscale, 4, 32, 6); } -static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_10(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale) { - dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4); + dnxhd_decode_dct_block(ctx, row, block, n, qscale, 6, 8, 4); } -static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block, +static void dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx, + RowContext *row, int16_t *block, int n, int qscale) { - dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 32, 6); + dnxhd_decode_dct_block(ctx, row, block, n, qscale, 6, 32, 6); } -static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, - int x, int y) +static int dnxhd_decode_macroblock(const DNXHDContext *ctx, RowContext *row, + AVFrame *frame, int x, int y) { int shift1 = ctx->bit_depth == 10; int dct_linesize_luma = frame->linesize[0]; @@ -364,11 +397,11 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, int interlaced_mb = 0; if (ctx->mbaff) { - interlaced_mb = get_bits1(&ctx->gb); - qscale = get_bits(&ctx->gb, 10); + interlaced_mb = get_bits1(&row->gb); + qscale = get_bits(&row->gb, 10); } else - qscale = get_bits(&ctx->gb, 11); - act = get_bits1(&ctx->gb); + qscale = get_bits(&row->gb, 11); + act = get_bits1(&row->gb); if (act) { static int warned = 0; if (!warned) { @@ -378,22 +411,22 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, } } - if (qscale != ctx->last_qscale) { + if (qscale != row->last_qscale) { for (i = 0; i < 64; i++) { - ctx->luma_scale[i] = qscale * ctx->cid_table->luma_weight[i]; - ctx->chroma_scale[i] = qscale * ctx->cid_table->chroma_weight[i]; + row->luma_scale[i] = qscale * ctx->cid_table->luma_weight[i]; + row->chroma_scale[i] = qscale * ctx->cid_table->chroma_weight[i]; } - ctx->last_qscale = qscale; + row->last_qscale = qscale; } for (i = 0; i < 8; i++) { - ctx->bdsp.clear_block(ctx->blocks[i]); - ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale); + ctx->bdsp.clear_block(row->blocks[i]); + ctx->decode_dct_block(ctx, row, row->blocks[i], i, qscale); } if (ctx->is_444) { for (; i < 12; i++) { - ctx->bdsp.clear_block(ctx->blocks[i]); - ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale); + ctx->bdsp.clear_block(row->blocks[i]); + ctx->decode_dct_block(ctx, row, row->blocks[i], i, qscale); } } @@ -419,55 +452,58 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, dct_y_offset = interlaced_mb ? frame->linesize[0] : (dct_linesize_luma << 3); dct_x_offset = 8 << shift1; if (!ctx->is_444) { - ctx->idsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); - ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); - ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[4]); - ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]); + ctx->idsp.idct_put(dest_y, dct_linesize_luma, row->blocks[0]); + ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, row->blocks[1]); + ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, row->blocks[4]); + ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[5]); if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) { dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3); - ctx->idsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); - ctx->idsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[3]); - ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]); - ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]); + ctx->idsp.idct_put(dest_u, dct_linesize_chroma, row->blocks[2]); + ctx->idsp.idct_put(dest_v, dct_linesize_chroma, row->blocks[3]); + ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, row->blocks[6]); + ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, row->blocks[7]); } } else { - ctx->idsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); - ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); - ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[6]); - ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]); + ctx->idsp.idct_put(dest_y, dct_linesize_luma, row->blocks[0]); + ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, row->blocks[1]); + ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, row->blocks[6]); + ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[7]); if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) { dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3); - ctx->idsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); - ctx->idsp.idct_put(dest_u + dct_x_offset, dct_linesize_chroma, ctx->blocks[3]); - ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[8]); - ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]); - ctx->idsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[4]); - ctx->idsp.idct_put(dest_v + dct_x_offset, dct_linesize_chroma, ctx->blocks[5]); - ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[10]); - ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]); + ctx->idsp.idct_put(dest_u, dct_linesize_chroma, row->blocks[2]); + ctx->idsp.idct_put(dest_u + dct_x_offset, dct_linesize_chroma, row->blocks[3]); + ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, row->blocks[8]); + ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[9]); + ctx->idsp.idct_put(dest_v, dct_linesize_chroma, row->blocks[4]); + ctx->idsp.idct_put(dest_v + dct_x_offset, dct_linesize_chroma, row->blocks[5]); + ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, row->blocks[10]); + ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[11]); } } return 0; } -static int dnxhd_decode_macroblocks(DNXHDContext *ctx, AVFrame *frame, - const uint8_t *buf, int buf_size) +static int dnxhd_decode_row(AVCodecContext *avctx, void *data, + int rownb, int threadnb) { - int x, y; - for (y = 0; y < ctx->mb_height; y++) { - ctx->last_dc[0] = - ctx->last_dc[1] = - ctx->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1) - init_get_bits(&ctx->gb, buf + ctx->mb_scan_index[y], (buf_size - ctx->mb_scan_index[y]) << 3); - for (x = 0; x < ctx->mb_width; x++) { - //START_TIMER; - dnxhd_decode_macroblock(ctx, frame, x, y); - //STOP_TIMER("decode macroblock"); - } + const DNXHDContext *ctx = avctx->priv_data; + uint32_t offset = ctx->mb_scan_index[rownb]; + RowContext *row = ctx->rows + threadnb; + int x; + + row->last_dc[0] = + row->last_dc[1] = + row->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1) + init_get_bits(&row->gb, ctx->buf + offset, (ctx->buf_size - offset) << 3); + for (x = 0; x < ctx->mb_width; x++) { + //START_TIMER; + dnxhd_decode_macroblock(ctx, row, data, x, rownb); + //STOP_TIMER("decode macroblock"); } + return 0; } @@ -512,7 +548,9 @@ decode_coding_unit: picture->key_frame = 1; } - dnxhd_decode_macroblocks(ctx, picture, buf + 0x280, buf_size - 0x280); + ctx->buf_size = buf_size - 0x280; + ctx->buf = buf + 0x280; + avctx->execute2(avctx, dnxhd_decode_row, picture, NULL, ctx->mb_height); if (first_field && picture->interlaced_frame) { buf += ctx->cid_table->coding_unit_size; @@ -532,6 +570,9 @@ static av_cold int dnxhd_decode_close(AVCodecContext *avctx) ff_free_vlc(&ctx->ac_vlc); ff_free_vlc(&ctx->dc_vlc); ff_free_vlc(&ctx->run_vlc); + + av_freep(&ctx->rows); + return 0; } @@ -544,5 +585,7 @@ AVCodec ff_dnxhd_decoder = { .init = dnxhd_decode_init, .close = dnxhd_decode_close, .decode = dnxhd_decode_frame, - .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | + AV_CODEC_CAP_SLICE_THREADS, + .init_thread_copy = ONLY_IF_THREADS_ENABLED(dnxhd_decode_init_thread_copy), };