avcodec/magicyuvenc: put some slice work under parallel execution

Speeds up slice threaded encoding.
This commit is contained in:
Paul B Mahol 2023-06-03 10:49:08 +02:00
parent ba32f28498
commit 2342c05e43
1 changed files with 74 additions and 22 deletions

View File

@ -65,8 +65,12 @@ typedef struct MagicYUVContext {
int hshift[4]; int hshift[4];
int vshift[4]; int vshift[4];
uint8_t **slices; uint8_t **slices;
uint8_t **bitslices;
unsigned bitslice_size;
unsigned *slice_pos; unsigned *slice_pos;
unsigned *slice_size;
unsigned tables_size; unsigned tables_size;
PTable *counts;
uint8_t *decorrelate_buf[2]; uint8_t *decorrelate_buf[2];
HuffEntry he[4][256]; HuffEntry he[4][256];
LLVidEncDSPContext llvidencdsp; LLVidEncDSPContext llvidencdsp;
@ -206,15 +210,20 @@ static av_cold int magy_encode_init(AVCodecContext *avctx)
s->nb_slices = FFMAX(1, s->nb_slices); s->nb_slices = FFMAX(1, s->nb_slices);
s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]); s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]);
s->slice_pos = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_pos)); s->slice_pos = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_pos));
s->slice_size = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_size));
s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices)); s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices));
if (!s->slices || !s->slice_pos) s->bitslices = av_calloc(s->nb_slices * s->planes, sizeof(*s->bitslices));
s->counts = av_calloc(s->nb_slices * s->planes * 256, sizeof(*s->counts));
if (!s->slices || !s->slice_pos || !s->counts || !s->slice_size)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
s->bitslice_size = avctx->width * (s->slice_height + 2) + AV_INPUT_BUFFER_PADDING_SIZE;
for (int n = 0; n < s->nb_slices; n++) { for (int n = 0; n < s->nb_slices; n++) {
for (int i = 0; i < s->planes; i++) { for (int i = 0; i < s->planes; i++) {
s->bitslices[n * s->planes + i] = av_malloc(s->bitslice_size);
s->slices[n * s->planes + i] = av_malloc(avctx->width * (s->slice_height + 2) + s->slices[n * s->planes + i] = av_malloc(avctx->width * (s->slice_height + 2) +
AV_INPUT_BUFFER_PADDING_SIZE); AV_INPUT_BUFFER_PADDING_SIZE);
if (!s->slices[n * s->planes + i]) { if (!s->slices[n * s->planes + i] || !s->bitslices[n * s->planes + i]) {
av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n"); av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
} }
@ -358,6 +367,20 @@ static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts,
} }
} }
static int count_plane_slice(AVCodecContext *avctx, int n, int plane)
{
MagicYUVContext *s = avctx->priv_data;
const uint8_t *dst = s->slices[n * s->planes + plane];
PTable *counts = s->counts + 256 * (n * s->planes + plane);
memset(counts, 0, sizeof(*counts) * 256);
count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]),
AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts);
return 0;
}
static int encode_table(AVCodecContext *avctx, static int encode_table(AVCodecContext *avctx,
PutBitContext *pb, HuffEntry *he, int plane) PutBitContext *pb, HuffEntry *he, int plane)
{ {
@ -366,16 +389,16 @@ static int encode_table(AVCodecContext *avctx,
uint16_t codes_counts[33] = { 0 }; uint16_t codes_counts[33] = { 0 };
for (int n = 0; n < s->nb_slices; n++) { for (int n = 0; n < s->nb_slices; n++) {
const uint8_t *dst = s->slices[n * s->planes + plane]; PTable *slice_counts = s->counts + 256 * (n * s->planes + plane);
count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]), for (int i = 0; i < 256; i++)
AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts); counts[i].prob = slice_counts[i].prob;
}
for (int i = 0; i < 256; i++) { for (int i = 0; i < 256; i++) {
counts[i].prob++; counts[i].prob++;
counts[i].value = i; counts[i].value = i;
} }
}
magy_huffman_compute_bits(counts, he, codes_counts, 256, 12); magy_huffman_compute_bits(counts, he, codes_counts, 256, 12);
@ -389,7 +412,7 @@ static int encode_table(AVCodecContext *avctx,
return 0; return 0;
} }
static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size, static int encode_plane_slice(uint8_t *src, uint8_t *dst, int dst_size,
int width, int height, HuffEntry *he, int prediction) int width, int height, HuffEntry *he, int prediction)
{ {
PutBitContext pb; PutBitContext pb;
@ -420,6 +443,31 @@ static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size,
return put_bytes_output(&pb); return put_bytes_output(&pb);
} }
static int encode_slice(AVCodecContext *avctx, void *tdata,
int n, int threadnr)
{
MagicYUVContext *s = avctx->priv_data;
const int slice_height = s->slice_height;
const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;
PutByteContext pb;
for (int i = 0; i < s->planes; i++) {
bytestream2_init_writer(&pb, s->bitslices[n + s->planes + i],
s->bitslice_size);
s->slice_size[n * s->planes + i] =
encode_plane_slice(s->slices[n * s->planes + i],
s->bitslices[n * s->planes + i],
bytestream2_get_bytes_left_p(&pb),
AV_CEIL_RSHIFT(avctx->width, s->hshift[i]),
AV_CEIL_RSHIFT(height, s->vshift[i]),
s->he[i], s->frame_pred);
}
return 0;
}
static int predict_slice(AVCodecContext *avctx, void *tdata, static int predict_slice(AVCodecContext *avctx, void *tdata,
int n, int threadnr) int n, int threadnr)
{ {
@ -469,6 +517,9 @@ static int predict_slice(AVCodecContext *avctx, void *tdata,
} }
} }
for (int p = 0; p < s->planes; p++)
count_plane_slice(avctx, n, p);
return 0; return 0;
} }
@ -528,18 +579,14 @@ static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
s->tables_size = put_bytes_count(&s->pb, 1); s->tables_size = put_bytes_count(&s->pb, 1);
bytestream2_skip_p(&pb, s->tables_size); bytestream2_skip_p(&pb, s->tables_size);
avctx->execute2(avctx, encode_slice, NULL, NULL, s->nb_slices);
for (int n = 0; n < s->nb_slices; n++) { for (int n = 0; n < s->nb_slices; n++) {
for (int i = 0; i < s->planes; i++) { for (int i = 0; i < s->planes; i++) {
unsigned slice_size;
s->slice_pos[n * s->planes + i] = bytestream2_tell_p(&pb); s->slice_pos[n * s->planes + i] = bytestream2_tell_p(&pb);
slice_size = encode_slice(s->slices[n * s->planes + i],
pkt->data + bytestream2_tell_p(&pb), bytestream2_put_buffer(&pb, s->bitslices[n * s->planes + i],
bytestream2_get_bytes_left_p(&pb), s->slice_size[n * s->planes + i]);
AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
AV_CEIL_RSHIFT(slice_height, s->vshift[i]),
s->he[i], s->frame_pred);
bytestream2_skip_p(&pb, slice_size);
} }
} }
@ -564,9 +611,14 @@ static av_cold int magy_encode_close(AVCodecContext *avctx)
MagicYUVContext *s = avctx->priv_data; MagicYUVContext *s = avctx->priv_data;
av_freep(&s->slice_pos); av_freep(&s->slice_pos);
for (int i = 0; i < s->planes && s->slices; i++) av_freep(&s->slice_size);
for (int i = 0; i < s->planes * s->nb_slices && s->slices; i++)
av_freep(&s->slices[i]); av_freep(&s->slices[i]);
for (int i = 0; i < s->planes * s->nb_slices && s->bitslices; i++)
av_freep(&s->bitslices[i]);
av_freep(&s->counts);
av_freep(&s->slices); av_freep(&s->slices);
av_freep(&s->bitslices);
av_freep(&s->decorrelate_buf); av_freep(&s->decorrelate_buf);
return 0; return 0;