ffmpeg/libavcodec/bitpacked_dec.c
Devin Heitmueller b2c82b23b9 avcodec/bitpacked_dec: optimize bitpacked_decode_yuv422p10
Rework the code a bit to speed up the 10-bit bitpacked decoding
routine.  This is probably about as fast as I can get it without
switching to assembly language.

Demonstrable with:

./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2 -frames:v 1 source.yuv
./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le out.yuv

On my development system, it went from 80ms for a 2160p frame
down to 20ms (i.e. a 4X speedup).  Good enough for now, I hope...

Comments from Marton:

Originally on my system better performance could be achieved by simply
switching to the cached bitstream reader, but for Devin it was slower than
his direct byte operations.

I changed the order of writing output from u/y/v/y to u/v/y/y, and that made
the code faster than the cached bitstream reader on my system as well.

TIMER measurement of the decode loop on Ryzen 5 3600 with command line:

./ffmpeg -stream_loop 256 -threads 1 -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le -f null none -loglevel error

Before: 823204127 decicycles in YUV,     256 runs,      0 skips
After:  315070524 decicycles in YUV,     256 runs,      0 skips

Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
Signed-off-by: Marton Balint <cus@passwd.hu>
2023-12-28 23:56:14 +01:00

155 lines
4.5 KiB
C

/*
* Unpack bit-packed streams to formats supported by FFmpeg
* Copyright (c) 2017 Savoir-faire Linux, Inc
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Development sponsored by CBC/Radio-Canada */
/**
* @file
* Bitpacked
*/
#include "avcodec.h"
#include "codec_internal.h"
#include "libavutil/imgutils.h"
#include "thread.h"
/**
 * Private decoder state.
 *
 * Holds the unpacking routine selected by bitpacked_init_decoder(); the
 * decode callback forwards every packet to it.
 */
struct BitpackedContext {
    /* Returns 0 on success; any other value is passed back to the caller. */
    int (*decode)(AVCodecContext *ctx, AVFrame *out, const AVPacket *in);
};
/**
 * Pass a UYVY 4:2:2 packet through without copying.
 *
 * The payload already matches avctx->pix_fmt, so the frame takes a new
 * reference to the packet buffer and its plane pointers are aimed at the
 * packet data.
 *
 * @param avctx codec context supplying pix_fmt, width and height
 * @param frame output frame; buf[0], data[] and linesize[] are filled in
 * @param avpkt input packet whose buffer is shared with the frame
 * @return 0 on success, AVERROR(ENOMEM) if the buffer reference cannot be
 *         created, AVERROR_INVALIDDATA if the packet holds fewer bytes than
 *         the image needs, or the negative error code reported by
 *         av_image_fill_arrays()
 */
static int bitpacked_decode_uyvy422(AVCodecContext *avctx, AVFrame *frame,
                                    const AVPacket *avpkt)
{
    int ret;

    /* there is no need to copy as the data already match
     * a known pixel format */
    frame->buf[0] = av_buffer_ref(avpkt->buf);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    ret = av_image_fill_arrays(frame->data, frame->linesize, avpkt->data,
                               avctx->pix_fmt, avctx->width, avctx->height, 1);

    /* On success the return value is the number of bytes the image occupies.
     * A shorter packet would leave the plane pointers running past the end
     * of the shared buffer, so reject it instead of handing out the frame. */
    if (ret >= 0 && ret > avpkt->size)
        ret = AVERROR_INVALIDDATA;

    if (ret < 0) {
        av_buffer_unref(&frame->buf[0]);
        return ret;
    }

    return 0;
}
/**
 * Unpack bit-packed 10-bit 4:2:2 video into planar 16-bit Y, U and V planes.
 *
 * Every 5 input bytes hold four 10-bit samples, most significant bits first,
 * in the order U, Y0, V, Y1; together they describe two horizontally adjacent
 * pixels. The input is read as one continuous stream with no padding between
 * rows.
 *
 * @param avctx codec context supplying width and height
 * @param frame output frame; its buffer is obtained here
 * @param avpkt input packet holding at least width * height * 20 bits
 * @return 0 on success, AVERROR_INVALIDDATA if the packet is too small,
 *         AVERROR_PATCHWELCOME for odd widths, or the negative error code
 *         from ff_thread_get_buffer()
 */
static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
                                      const AVPacket *avpkt)
{
    uint64_t frame_size = (uint64_t)avctx->width * (uint64_t)avctx->height * 20;
    uint64_t packet_size = (uint64_t)avpkt->size * 8;
    const uint8_t *src = avpkt->data;
    int ret, i, j;

    /* Run the cheap input checks first so that a packet which is going to be
     * rejected does not cost a frame buffer allocation. */
    if (frame_size > packet_size)
        return AVERROR_INVALIDDATA;

    /* Samples come in pairs of pixels, so odd widths are not handled. */
    if (avctx->width % 2)
        return AVERROR_PATCHWELCOME;

    ret = ff_thread_get_buffer(avctx, frame, 0);
    if (ret < 0)
        return ret;

    for (i = 0; i < avctx->height; i++) {
        uint16_t *y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
        uint16_t *u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
        uint16_t *v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);

        for (j = 0; j < avctx->width; j += 2) {
            /* The u/v/y/y store order is deliberate: it measured faster than
             * writing in bitstream order (u/y/v/y). */
            *u++ = (src[0] << 2) | (src[1] >> 6);
            *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
            *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
            *y++ = ((src[3] << 8) | (src[4])) & 0x3ff;
            src += 5;
        }
    }

    return 0;
}
/**
 * Select the unpacking routine that matches the stream parameters.
 *
 * Only the 'UYVY' codec tag is accepted, in two flavours:
 *  - 16 bits per coded sample with AV_PIX_FMT_UYVY422
 *  - 20 bits per coded sample with AV_PIX_FMT_YUV422P10
 *
 * @param avctx codec context; priv_data receives the chosen routine
 * @return 0 on success, AVERROR_INVALIDDATA when the tag or a dimension is
 *         zero or the tag / depth / pixel format combination is unsupported
 */
static av_cold int bitpacked_init_decoder(AVCodecContext *avctx)
{
    struct BitpackedContext *priv = avctx->priv_data;

    if (!avctx->codec_tag || !avctx->width || !avctx->height)
        return AVERROR_INVALIDDATA;

    if (avctx->codec_tag != MKTAG('U', 'Y', 'V', 'Y'))
        return AVERROR_INVALIDDATA;

    switch (avctx->bits_per_coded_sample) {
    case 16:
        if (avctx->pix_fmt != AV_PIX_FMT_UYVY422)
            return AVERROR_INVALIDDATA;
        priv->decode = bitpacked_decode_uyvy422;
        break;
    case 20:
        if (avctx->pix_fmt != AV_PIX_FMT_YUV422P10)
            return AVERROR_INVALIDDATA;
        priv->decode = bitpacked_decode_yuv422p10;
        break;
    default:
        return AVERROR_INVALIDDATA;
    }

    return 0;
}
/**
 * Decode callback: forward the packet to the routine chosen at init time.
 *
 * @param avctx     codec context whose priv_data holds the routine
 * @param frame     output frame; marked as an intra key frame on success
 * @param got_frame set to 1 when a frame has been produced
 * @param avpkt     input packet
 * @return the packet size on success, otherwise the non-zero value returned
 *         by the unpacking routine
 */
static int bitpacked_decode(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *avpkt)
{
    struct BitpackedContext *priv = avctx->priv_data;
    /* Capture the size up front; it is the value reported as consumed. */
    const int consumed = avpkt->size;
    const int err = priv->decode(avctx, frame, avpkt);

    if (err != 0)
        return err;

    *got_frame = 1;
    frame->flags |= AV_FRAME_FLAG_KEY;
    frame->pict_type = AV_PICTURE_TYPE_I;

    return consumed;
}
/**
 * Codec descriptor for the "bitpacked" video decoder.
 *
 * Ties together the init and decode entry points defined in this file and
 * restricts the decoder to the 'UYVY' codec tag.
 */
const FFCodec ff_bitpacked_decoder = {
.p.name = "bitpacked",
CODEC_LONG_NAME("Bitpacked"),
.p.type = AVMEDIA_TYPE_VIDEO,
.p.id = AV_CODEC_ID_BITPACKED,
/* Declares the frame-threading capability flag for this decoder. */
.p.capabilities = AV_CODEC_CAP_FRAME_THREADS,
/* Size of the private state found in avctx->priv_data. */
.priv_data_size = sizeof(struct BitpackedContext),
.init = bitpacked_init_decoder,
/* NOTE(review): presumably registers bitpacked_decode as the per-packet
 * decode callback; confirm against the macro definition. */
FF_CODEC_DECODE_CB(bitpacked_decode),
/* Accepted codec tags; the list ends with FF_CODEC_TAGS_END. */
.codec_tags = (const uint32_t []){
MKTAG('U', 'Y', 'V', 'Y'),
FF_CODEC_TAGS_END,
},
};