ffmpeg/libswscale/hscale.c
Ramiro Polla 384fe39623 swscale/range_convert: fix mpeg ranges in yuv range conversion for non-8-bit pixel formats
There is an issue with the constants used in YUV to YUV range conversion,
where the upper bound is not respected when converting to mpeg range.

With this commit, the constants are calculated at runtime, depending on
the bit depth. This approach also allows us to more easily understand how
the constants are derived.

For bit depths <= 14, the number of fixed-point bits has been set to 14
for all conversions, to simplify the code.
For bit depths > 14, the number of fixed-point bits has been raised to
18, so that the conversion is accurate enough for the MPEG range to be
respected.

The convert functions now take the conversion constants (coeff and offset)
as function arguments.
For bit depths <= 14, coeff is unsigned 16-bit and offset is 32-bit.
For bit depths > 14, coeff is unsigned 32-bit and offset is 64-bit.

x86_64:
chrRangeFromJpeg8_1920_c:    2127.4   2125.0  (1.00x)
chrRangeFromJpeg16_1920_c:   2325.2   2127.2  (1.09x)
chrRangeToJpeg8_1920_c:      3166.9   3168.7  (1.00x)
chrRangeToJpeg16_1920_c:     2152.4   3164.8  (0.68x)
lumRangeFromJpeg8_1920_c:    1263.0   1302.5  (0.97x)
lumRangeFromJpeg16_1920_c:   1080.5   1299.2  (0.83x)
lumRangeToJpeg8_1920_c:      1886.8   2112.2  (0.89x)
lumRangeToJpeg16_1920_c:     1077.0   1906.5  (0.56x)

aarch64 A55:
chrRangeFromJpeg8_1920_c:   28835.2  28835.6  (1.00x)
chrRangeFromJpeg16_1920_c:  28839.8  32680.8  (0.88x)
chrRangeToJpeg8_1920_c:     23074.7  23075.4  (1.00x)
chrRangeToJpeg16_1920_c:    17318.9  24996.0  (0.69x)
lumRangeFromJpeg8_1920_c:   15389.7  15384.5  (1.00x)
lumRangeFromJpeg16_1920_c:  15388.2  17306.7  (0.89x)
lumRangeToJpeg8_1920_c:     19227.8  19226.6  (1.00x)
lumRangeToJpeg16_1920_c:    15387.0  21146.3  (0.73x)

aarch64 A76:
chrRangeFromJpeg8_1920_c:    6324.4   6268.1  (1.01x)
chrRangeFromJpeg16_1920_c:   6339.9  11521.5  (0.55x)
chrRangeToJpeg8_1920_c:      9656.0   9612.8  (1.00x)
chrRangeToJpeg16_1920_c:     6340.4  11651.8  (0.54x)
lumRangeFromJpeg8_1920_c:    4422.0   4420.8  (1.00x)
lumRangeFromJpeg16_1920_c:   4420.9   5762.0  (0.77x)
lumRangeToJpeg8_1920_c:      5949.1   5977.5  (1.00x)
lumRangeToJpeg16_1920_c:     4446.8   5946.2  (0.75x)

NOTE: all SIMD optimizations for range_convert have been disabled.
      They will be re-enabled once they are fixed for each architecture.

NOTE2: the same issue still exists in rgb2yuv conversions, which is not
       addressed in this commit.
2024-12-05 21:10:29 +01:00

293 lines
9.5 KiB
C

/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "swscale_internal.h"
/**
 * Scaler instance data.
 *
 * Stored in SwsFilterDescriptor.instance by ff_init_desc_hscale() and
 * ff_init_desc_chscale(), and read back by the horizontal scaling steps.
 * The struct only borrows the tables it points to; nothing in this file
 * allocates or frees them.
 */
typedef struct FilterContext {
    uint16_t *filter;      ///< filter table passed on to hyScale() / hcScale()
    int      *filter_pos;  ///< position table passed on to hyScale() / hcScale()
    int       filter_size; ///< filter size passed on to hyScale() / hcScale()
    int       xInc;        ///< increment passed on to hyscale_fast() / hcscale_fast()
} FilterContext;
/**
 * Color conversion instance data.
 *
 * Stored in SwsFilterDescriptor.instance by ff_init_desc_fmt_convert() and
 * ff_init_desc_cfmt_convert(), and read back by the input conversion steps.
 */
typedef struct ColorContext {
    uint32_t *pal; ///< palette pointer forwarded to the *ToYV12 input callbacks
} ColorContext;
static int lum_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = desc->src->width;
int dstW = desc->dst->width;
int xInc = instance->xInc;
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
uint8_t ** dst = desc->dst->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
if (c->lumConvertRange)
c->lumConvertRange((int16_t*)dst[dst_pos], dstW,
c->lumConvertRange_coeff, c->lumConvertRange_offset);
desc->dst->plane[0].sliceH += 1;
if (desc->alpha) {
src = desc->src->plane[3].line;
dst = desc->dst->plane[3].line;
src_pos = sliceY+i - desc->src->plane[3].sliceY;
dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
desc->dst->plane[3].sliceH += 1;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
}
}
return sliceH;
}
/**
 * Descriptor callback: convert input lines into plane 0 (and plane 3 when
 * desc->alpha is set) of the destination slice.
 *
 * The destination's plane 0 and plane 3 are first marked as holding exactly
 * [sliceY, sliceY + sliceH); converted lines are then written to
 * line[0 .. sliceH-1] of those planes.
 *
 * Plane 0 is produced by c->lumToYV12 if set, else by c->readLumPlanar if
 * set, else left untouched. Plane 3 follows the same pattern with
 * c->alpToYV12 / c->readAlpPlanar.
 *
 * @param c      scaler context providing the input callbacks
 * @param desc   descriptor whose instance is a ColorContext
 * @param sliceY first line to process
 * @param sliceH number of lines to process
 * @return sliceH
 */
static int lum_convert(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
    const ColorContext *ctx = desc->instance;
    uint32_t *palette = ctx->pal;
    const int width = desc->src->width;

    desc->dst->plane[0].sliceY = sliceY;
    desc->dst->plane[0].sliceH = sliceH;
    desc->dst->plane[3].sliceY = sliceY;
    desc->dst->plane[3].sliceH = sliceH;

    for (int n = 0; n < sliceH; n++) {
        const int row = sliceY + n;
        /* planes 0/3 are indexed at full vertical resolution,
         * planes 1/2 at the vertically subsampled one */
        const int full_idx = row - desc->src->plane[0].sliceY;
        const int sub_idx  = (row >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
        const uint8_t *in[4] = {
            desc->src->plane[0].line[full_idx],
            desc->src->plane[1].line[sub_idx],
            desc->src->plane[2].line[sub_idx],
            desc->src->plane[3].line[full_idx],
        };
        uint8_t *out = desc->dst->plane[0].line[n];

        if (c->lumToYV12) {
            c->lumToYV12(out, in[0], in[1], in[2], width, palette, c->input_opaque);
        } else if (c->readLumPlanar) {
            c->readLumPlanar(out, in, width, c->input_rgb2yuv_table, c->input_opaque);
        }

        if (!desc->alpha) {
            continue;
        }

        out = desc->dst->plane[3].line[n];
        if (c->alpToYV12) {
            c->alpToYV12(out, in[3], in[1], in[2], width, palette, c->input_opaque);
        } else if (c->readAlpPlanar) {
            c->readAlpPlanar(out, in, width, NULL, c->input_opaque);
        }
    }

    return sliceH;
}
/**
 * Set up @p desc as a luma/alpha input conversion step (lum_convert).
 *
 * A ColorContext holding @p pal is allocated with av_malloc() and stored in
 * desc->instance; this function never frees it. desc->alpha is set only
 * when both the source and destination formats have alpha.
 *
 * @param desc descriptor to fill in
 * @param src  source slice
 * @param dst  destination slice
 * @param pal  palette pointer forwarded to the conversion callbacks
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails (in which
 *         case @p desc is left unmodified)
 */
int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
    ColorContext *ctx = av_malloc(sizeof(*ctx));

    if (!ctx) {
        return AVERROR(ENOMEM);
    }
    ctx->pal = pal;

    desc->instance = ctx;
    desc->alpha    = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src      = src;
    desc->dst      = dst;
    desc->process  = &lum_convert;

    return 0;
}
/**
 * Set up @p desc as a luma/alpha horizontal scaling step (lum_h_scale).
 *
 * A FilterContext recording the given filter parameters is allocated with
 * av_malloc() and stored in desc->instance; this function never frees it.
 * The filter tables themselves are only referenced, not copied.
 * desc->alpha is set only when both the source and destination formats
 * have alpha.
 *
 * @param desc        descriptor to fill in
 * @param src         source slice
 * @param dst         destination slice
 * @param filter      filter table handed to the scaler
 * @param filter_pos  position table handed to the scaler
 * @param filter_size filter size handed to the scaler
 * @param xInc        increment handed to the fast scaler
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails (in which
 *         case @p desc is left unmodified)
 */
int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
    FilterContext *ctx = av_malloc(sizeof(*ctx));

    if (!ctx) {
        return AVERROR(ENOMEM);
    }
    ctx->filter      = filter;
    ctx->filter_pos  = filter_pos;
    ctx->filter_size = filter_size;
    ctx->xInc        = xInc;

    desc->instance = ctx;
    desc->alpha    = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src      = src;
    desc->dst      = dst;
    desc->process  = &lum_h_scale;

    return 0;
}
/**
 * Descriptor callback: horizontally scale planes 1 and 2 for sliceH lines
 * starting at sliceY.
 *
 * Widths are the slice widths shifted right (rounding up) by the
 * horizontal chroma subsampling of the respective slice. For each line the
 * two planes are scaled together through c->hcscale_fast when it is set,
 * or one after the other through c->hcScale otherwise; the result is then
 * passed to c->chrConvertRange if that callback is set, and both
 * destination planes' sliceH are incremented.
 *
 * @param c      scaler context providing the scaling / range callbacks
 * @param desc   descriptor whose instance is a FilterContext
 * @param sliceY first line to process
 * @param sliceH number of lines to process
 * @return sliceH
 */
static int chr_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
    const FilterContext *filt = desc->instance;
    const int in_w  = AV_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
    const int out_w = AV_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
    const int x_inc = filt->xInc;

    uint8_t **in_lines1  = desc->src->plane[1].line;
    uint8_t **out_lines1 = desc->dst->plane[1].line;
    uint8_t **in_lines2  = desc->src->plane[2].line;
    uint8_t **out_lines2 = desc->dst->plane[2].line;

    /* index of the first line to process, offset by each plane's sliceY */
    const int in_base1  = sliceY - desc->src->plane[1].sliceY;
    const int out_base1 = sliceY - desc->dst->plane[1].sliceY;
    const int in_base2  = sliceY - desc->src->plane[2].sliceY;
    const int out_base2 = sliceY - desc->dst->plane[2].sliceY;

    for (int n = 0; n < sliceH; n++) {
        uint16_t *out1 = (uint16_t *)out_lines1[out_base1 + n];
        uint16_t *out2 = (uint16_t *)out_lines2[out_base2 + n];
        uint8_t  *in1  = in_lines1[in_base1 + n];
        uint8_t  *in2  = in_lines2[in_base2 + n];

        if (c->hcscale_fast) {
            c->hcscale_fast(c, out1, out2, out_w, in1, in2, in_w, x_inc);
        } else {
            c->hcScale(c, out1, out_w, in1, filt->filter, filt->filter_pos, filt->filter_size);
            c->hcScale(c, out2, out_w, in2, filt->filter, filt->filter_pos, filt->filter_size);
        }

        if (c->chrConvertRange) {
            c->chrConvertRange(out1, out2, out_w,
                               c->chrConvertRange_coeff, c->chrConvertRange_offset);
        }

        desc->dst->plane[1].sliceH += 1;
        desc->dst->plane[2].sliceH += 1;
    }

    return sliceH;
}
/**
 * Descriptor callback: convert input lines into planes 1 and 2 of the
 * destination slice.
 *
 * The destination's plane 1 and plane 2 are first marked as holding exactly
 * [sliceY, sliceY + sliceH); converted lines are then written to
 * line[0 .. sliceH-1] of those planes. Each line is produced by
 * c->chrToYV12 if set, else by c->readChrPlanar if set, else left untouched.
 *
 * @param c      scaler context providing the input callbacks
 * @param desc   descriptor whose instance is a ColorContext
 * @param sliceY first line to process
 * @param sliceH number of lines to process
 * @return sliceH
 */
static int chr_convert(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
    const ColorContext *ctx = desc->instance;
    uint32_t *palette = ctx->pal;
    const int width = AV_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);

    /* Source line offsets are taken before the destination bookkeeping is
     * touched: base0 indexes planes 0/3, base1 indexes planes 1/2. */
    const int base0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
    const int base1 = sliceY - desc->src->plane[1].sliceY;

    desc->dst->plane[1].sliceY = sliceY;
    desc->dst->plane[1].sliceH = sliceH;
    desc->dst->plane[2].sliceY = sliceY;
    desc->dst->plane[2].sliceH = sliceH;

    for (int n = 0; n < sliceH; n++) {
        const uint8_t *in[4] = {
            desc->src->plane[0].line[base0 + n],
            desc->src->plane[1].line[base1 + n],
            desc->src->plane[2].line[base1 + n],
            desc->src->plane[3].line[base0 + n],
        };
        uint8_t *out1 = desc->dst->plane[1].line[n];
        uint8_t *out2 = desc->dst->plane[2].line[n];

        if (c->chrToYV12) {
            c->chrToYV12(out1, out2, in[0], in[1], in[2], width, palette, c->input_opaque);
        } else if (c->readChrPlanar) {
            c->readChrPlanar(out1, out2, in, width, c->input_rgb2yuv_table, c->input_opaque);
        }
    }

    return sliceH;
}
/**
 * Set up @p desc as a chroma input conversion step (chr_convert).
 *
 * A ColorContext holding @p pal is allocated with av_malloc() and stored in
 * desc->instance; this function never frees it. desc->alpha is not written
 * here and keeps whatever value the caller left in it.
 *
 * @param desc descriptor to fill in
 * @param src  source slice
 * @param dst  destination slice
 * @param pal  palette pointer forwarded to the conversion callbacks
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails (in which
 *         case @p desc is left unmodified)
 */
int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
    ColorContext *ctx = av_malloc(sizeof(*ctx));

    if (!ctx) {
        return AVERROR(ENOMEM);
    }
    ctx->pal = pal;

    desc->instance = ctx;
    desc->src      = src;
    desc->dst      = dst;
    desc->process  = &chr_convert;

    return 0;
}
/**
 * Set up @p desc as a chroma horizontal scaling step (chr_h_scale).
 *
 * A FilterContext recording the given filter parameters is allocated with
 * av_malloc() and stored in desc->instance; this function never frees it.
 * The filter tables themselves are only referenced, not copied.
 * desc->alpha is set only when both the source and destination formats
 * have alpha.
 *
 * @param desc        descriptor to fill in
 * @param src         source slice
 * @param dst         destination slice
 * @param filter      filter table handed to the scaler
 * @param filter_pos  position table handed to the scaler
 * @param filter_size filter size handed to the scaler
 * @param xInc        increment handed to the fast scaler
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails (in which
 *         case @p desc is left unmodified)
 */
int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
    FilterContext *ctx = av_malloc(sizeof(*ctx));

    if (!ctx) {
        return AVERROR(ENOMEM);
    }
    ctx->filter      = filter;
    ctx->filter_pos  = filter_pos;
    ctx->filter_size = filter_size;
    ctx->xInc        = xInc;

    desc->instance = ctx;
    desc->alpha    = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src      = src;
    desc->dst      = dst;
    desc->process  = &chr_h_scale;

    return 0;
}
/**
 * Descriptor callback used when no chroma processing is done.
 *
 * No pixel data is touched. For destination planes 1 and 2 the slice
 * bookkeeping is rewritten so that each plane spans available_lines lines
 * and ends at sliceY + sliceH.
 *
 * @param c      unused
 * @param desc   descriptor whose dst planes 1 and 2 are updated
 * @param sliceY first line of the current slice
 * @param sliceH number of lines in the current slice
 * @return always 0
 */
static int no_chr_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
    for (int p = 1; p <= 2; p++) {
        desc->dst->plane[p].sliceY = sliceY + sliceH - desc->dst->plane[p].available_lines;
        desc->dst->plane[p].sliceH = desc->dst->plane[p].available_lines;
    }

    return 0;
}
/**
 * Set up @p desc as a chroma step that performs no scaling (no_chr_scale).
 *
 * Nothing is allocated: desc->instance is set to NULL and desc->alpha to 0.
 *
 * @param desc descriptor to fill in
 * @param src  source slice
 * @param dst  destination slice
 * @return always 0
 */
int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
{
    desc->instance = NULL;
    desc->alpha    = 0;
    desc->src      = src;
    desc->dst      = dst;
    desc->process  = &no_chr_scale;

    return 0;
}