mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-03-08 05:28:40 +00:00
avcodec/bitpacked_dec: optimize bitpacked_decode_yuv422p10
Rework the code a bit to speed up the 10-bit bitpacked decoding routine. This is probably about as fast as I can get it without switching to assembly language.

Demonstrable with:

./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2 -frames:v 1 source.yuv
./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le out.yuv

On my development system, it went from 80ms for a 2160p frame down to 20ms (i.e. a 4X speedup). Good enough for now, I hope...

Comments from Marton:

Originally, on my system, better performance could be achieved by simply switching to the cached bitstream reader, but for Devin it was slower than his direct byte operations. I changed the order of writing output from u/y/v/y to u/v/y/y, and that made the code faster than the cached bitstream reader on my system as well.

TIMER measurement of the decode loop on Ryzen 5 3600 with command line:

./ffmpeg -stream_loop 256 -threads 1 -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i source.yuv -pix_fmt yuv422p10le -f null none -loglevel error

Before: 823204127 decicycles in YUV, 256 runs, 0 skips
After: 315070524 decicycles in YUV, 256 runs, 0 skips

Signed-off-by: Devin Heitmueller <dheitmueller@ltnglobal.com>
Signed-off-by: Marton Balint <cus@passwd.hu>
This commit is contained in:
parent
059ea1d6f6
commit
b2c82b23b9
@ -28,7 +28,6 @@
|
||||
|
||||
#include "avcodec.h"
|
||||
#include "codec_internal.h"
|
||||
#include "get_bits.h"
|
||||
#include "libavutil/imgutils.h"
|
||||
#include "thread.h"
|
||||
|
||||
@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
|
||||
{
|
||||
uint64_t frame_size = (uint64_t)avctx->width * (uint64_t)avctx->height * 20;
|
||||
uint64_t packet_size = (uint64_t)avpkt->size * 8;
|
||||
GetBitContext bc;
|
||||
uint8_t *src;
|
||||
uint16_t *y, *u, *v;
|
||||
int ret, i, j;
|
||||
|
||||
@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
|
||||
if (avctx->width % 2)
|
||||
return AVERROR_PATCHWELCOME;
|
||||
|
||||
ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height * 20);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
src = avpkt->data;
|
||||
for (i = 0; i < avctx->height; i++) {
|
||||
y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
|
||||
u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
|
||||
v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
|
||||
|
||||
for (j = 0; j < avctx->width; j += 2) {
|
||||
*u++ = get_bits(&bc, 10);
|
||||
*y++ = get_bits(&bc, 10);
|
||||
*v++ = get_bits(&bc, 10);
|
||||
*y++ = get_bits(&bc, 10);
|
||||
*u++ = (src[0] << 2) | (src[1] >> 6);
|
||||
*v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff;
|
||||
*y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff;
|
||||
*y++ = ((src[3] << 8) | (src[4])) & 0x3ff;
|
||||
src += 5;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user