mirror of https://git.ffmpeg.org/ffmpeg.git
reorganize and simplify the VP3 IDCT stuff
Originally committed as revision 3071 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
4ea4b27469
commit
116824d0aa
|
@ -3126,8 +3126,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||||
|
|
||||||
/* VP3 DSP support */
|
/* VP3 DSP support */
|
||||||
c->vp3_dsp_init = vp3_dsp_init_c;
|
c->vp3_dsp_init = vp3_dsp_init_c;
|
||||||
c->vp3_idct_put = vp3_idct_put_c;
|
c->vp3_idct = vp3_idct_c;
|
||||||
c->vp3_idct_add = vp3_idct_add_c;
|
|
||||||
|
|
||||||
c->get_pixels = get_pixels_c;
|
c->get_pixels = get_pixels_c;
|
||||||
c->diff_pixels = diff_pixels_c;
|
c->diff_pixels = diff_pixels_c;
|
||||||
|
|
|
@ -62,23 +62,16 @@ extern uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
|
||||||
|
|
||||||
/* VP3 DSP functions */
|
/* VP3 DSP functions */
|
||||||
void vp3_dsp_init_c(void);
|
void vp3_dsp_init_c(void);
|
||||||
void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix,
|
void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
int coeff_count, DCTELEM *output_data);
|
||||||
void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
|
||||||
|
|
||||||
void vp3_dsp_init_mmx(void);
|
void vp3_dsp_init_mmx(void);
|
||||||
void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
int coeff_count, DCTELEM *output_data);
|
||||||
void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
|
||||||
|
|
||||||
void vp3_dsp_init_sse2(void);
|
void vp3_dsp_init_sse2(void);
|
||||||
void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
int coeff_count, DCTELEM *output_data);
|
||||||
void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
|
||||||
|
|
||||||
|
|
||||||
/* minimum alignment rules ;)
|
/* minimum alignment rules ;)
|
||||||
if u notice errors in the align stuff, need more alignment for some asm code for some cpu
|
if u notice errors in the align stuff, need more alignment for some asm code for some cpu
|
||||||
|
@ -318,32 +311,16 @@ typedef struct DSPContext {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function is responsible for taking a block of zigzag'd,
|
* This function is responsible for taking a block of zigzag'd,
|
||||||
* quantized DCT coefficients, reconstructing the original block of
|
* quantized DCT coefficients and reconstructing the original block of
|
||||||
* samples, and placing it into the output.
|
* samples.
|
||||||
* @param input_data 64 zigzag'd, quantized DCT coefficients
|
* @param input_data 64 zigzag'd, quantized DCT coefficients
|
||||||
* @param dequant_matrix 64 zigzag'd quantizer coefficients
|
* @param dequant_matrix 64 zigzag'd quantizer coefficients
|
||||||
* @param coeff_count index of the last coefficient
|
* @param coeff_count index of the last coefficient
|
||||||
* @param dest the final output location where the transformed samples
|
* @param output_samples space for 64 DCTELEMs where the transformed
|
||||||
* are to be placed
|
* samples will be stored
|
||||||
* @param stride the width in 8-bit samples of a line on this plane
|
|
||||||
*/
|
*/
|
||||||
void (*vp3_idct_put)(int16_t *input_data, int16_t *dequant_matrix,
|
void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
int coeff_count, DCTELEM *output_samples);
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is responsible for taking a block of zigzag'd,
|
|
||||||
* quantized DCT coefficients, reconstructing the original block of
|
|
||||||
* samples, and adding the transformed samples to an existing block of
|
|
||||||
* samples in the output.
|
|
||||||
* @param input_data 64 zigzag'd, quantized DCT coefficients
|
|
||||||
* @param dequant_matrix 64 zigzag'd quantizer coefficients
|
|
||||||
* @param coeff_count index of the last coefficient
|
|
||||||
* @param dest the final output location where the transformed samples
|
|
||||||
* are to be placed
|
|
||||||
* @param stride the width in 8-bit samples of a line on this plane
|
|
||||||
*/
|
|
||||||
void (*vp3_idct_add)(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride);
|
|
||||||
|
|
||||||
} DSPContext;
|
} DSPContext;
|
||||||
|
|
||||||
|
|
|
@ -2149,14 +2149,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||||
/* VP3 optimized DSP functions */
|
/* VP3 optimized DSP functions */
|
||||||
if (mm_flags & MM_SSE2) {
|
if (mm_flags & MM_SSE2) {
|
||||||
c->vp3_dsp_init = vp3_dsp_init_sse2;
|
c->vp3_dsp_init = vp3_dsp_init_sse2;
|
||||||
c->vp3_idct_put = vp3_idct_put_sse2;
|
c->vp3_idct = vp3_idct_sse2;
|
||||||
c->vp3_idct_add = vp3_idct_add_sse2;
|
|
||||||
} else {
|
} else {
|
||||||
c->vp3_dsp_init = vp3_dsp_init_mmx;
|
c->vp3_dsp_init = vp3_dsp_init_mmx;
|
||||||
c->vp3_idct_put = vp3_idct_put_mmx;
|
c->vp3_idct = vp3_idct_mmx;
|
||||||
c->vp3_idct_add = vp3_idct_add_mmx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_ENCODERS
|
#ifdef CONFIG_ENCODERS
|
||||||
c->get_pixels = get_pixels_mmx;
|
c->get_pixels = get_pixels_mmx;
|
||||||
c->diff_pixels = diff_pixels_mmx;
|
c->diff_pixels = diff_pixels_mmx;
|
||||||
|
|
|
@ -279,8 +279,8 @@ void vp3_dsp_init_mmx(void)
|
||||||
idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
|
idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int16_t *output_data)
|
int coeff_count, int16_t *output_data)
|
||||||
{
|
{
|
||||||
/* eax = quantized input
|
/* eax = quantized input
|
||||||
* ebx = dequantizer matrix
|
* ebx = dequantizer matrix
|
||||||
|
@ -563,79 +563,3 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
#undef J
|
#undef J
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
|
||||||
|
|
||||||
vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
movq_m2r(*vector128, mm0);
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
#if 1
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
if (*op < -128)
|
|
||||||
*dest = 0;
|
|
||||||
else if (*op > 127)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(*op + 128);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
#else
|
|
||||||
/* prototype optimization */
|
|
||||||
pxor_r2r(mm1, mm1);
|
|
||||||
packsswb_m2r(*(op + 4), mm1);
|
|
||||||
movq_r2r(mm1, mm2);
|
|
||||||
psrlq_i2r(32, mm2);
|
|
||||||
packsswb_m2r(*(op + 0), mm1);
|
|
||||||
op += 8;
|
|
||||||
por_r2r(mm2, mm1);
|
|
||||||
paddb_r2r(mm0, mm1);
|
|
||||||
movq_r2m(mm1, *dest);
|
|
||||||
dest += stride;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* be a good MMX citizen */
|
|
||||||
emms();
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
int16_t sample;
|
|
||||||
|
|
||||||
vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
sample = *dest + *op;
|
|
||||||
if (sample < 0)
|
|
||||||
*dest = 0;
|
|
||||||
else if (sample > 255)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(sample & 0xFF);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* be a good MMX citizen */
|
|
||||||
emms();
|
|
||||||
}
|
|
||||||
|
|
|
@ -799,11 +799,12 @@ static unsigned short __align16 SSE2_idct_data[7 * 8] =
|
||||||
void vp3_dsp_init_sse2(void)
|
void vp3_dsp_init_sse2(void)
|
||||||
{
|
{
|
||||||
/* nop */
|
/* nop */
|
||||||
|
av_log(NULL, AV_LOG_INFO, "Hey! SSE2!\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
int16_t *output_data)
|
int coeff_count, int16_t *output_data)
|
||||||
{
|
{
|
||||||
unsigned char *input_bytes = (unsigned char *)input_data;
|
unsigned char *input_bytes = (unsigned char *)input_data;
|
||||||
unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
|
unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
|
||||||
|
@ -832,59 +833,3 @@ static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
|
|
||||||
SSE2_Column_IDCT();
|
SSE2_Column_IDCT();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
if (*op < -128)
|
|
||||||
*dest = 0;
|
|
||||||
else if (*op > 127)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(*op + 128);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
int16_t sample;
|
|
||||||
|
|
||||||
vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
sample = *dest + *op;
|
|
||||||
if (sample < 0)
|
|
||||||
*dest = 0;
|
|
||||||
else if (sample > 255)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(sample & 0xFF);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -2051,6 +2051,7 @@ static void render_fragments(Vp3DecodeContext *s,
|
||||||
int m, n;
|
int m, n;
|
||||||
int i = first_fragment;
|
int i = first_fragment;
|
||||||
int16_t *dequantizer;
|
int16_t *dequantizer;
|
||||||
|
DCTELEM __align16 output_samples[64];
|
||||||
unsigned char *output_plane;
|
unsigned char *output_plane;
|
||||||
unsigned char *last_plane;
|
unsigned char *last_plane;
|
||||||
unsigned char *golden_plane;
|
unsigned char *golden_plane;
|
||||||
|
@ -2060,6 +2061,10 @@ static void render_fragments(Vp3DecodeContext *s,
|
||||||
int motion_halfpel_index;
|
int motion_halfpel_index;
|
||||||
uint8_t *motion_source;
|
uint8_t *motion_source;
|
||||||
|
|
||||||
|
int16_t *op;
|
||||||
|
uint8_t *dest;
|
||||||
|
int j, k;
|
||||||
|
|
||||||
debug_vp3(" vp3: rendering final fragments for %s\n",
|
debug_vp3(" vp3: rendering final fragments for %s\n",
|
||||||
(plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");
|
(plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");
|
||||||
|
|
||||||
|
@ -2176,16 +2181,29 @@ av_log(s->avctx, AV_LOG_ERROR, " help! got beefy vector! (%X, %X)\n", motion_x,
|
||||||
s->all_fragments[i].coeffs[0], dequantizer[0]);
|
s->all_fragments[i].coeffs[0], dequantizer[0]);
|
||||||
|
|
||||||
/* invert DCT and place (or add) in final output */
|
/* invert DCT and place (or add) in final output */
|
||||||
|
s->dsp.vp3_idct(s->all_fragments[i].coeffs,
|
||||||
|
dequantizer,
|
||||||
|
s->all_fragments[i].coeff_count,
|
||||||
|
output_samples);
|
||||||
if (s->all_fragments[i].coding_method == MODE_INTRA) {
|
if (s->all_fragments[i].coding_method == MODE_INTRA) {
|
||||||
s->dsp.vp3_idct_put(s->all_fragments[i].coeffs,
|
/* this really needs to be optimized sooner or later */
|
||||||
dequantizer,
|
op = output_samples;
|
||||||
s->all_fragments[i].coeff_count,
|
dest = output_plane + s->all_fragments[i].first_pixel;
|
||||||
output_plane + s->all_fragments[i].first_pixel,
|
for (j = 0; j < 8; j++) {
|
||||||
stride);
|
for (k = 0; k < 8; k++) {
|
||||||
|
if (*op < -128)
|
||||||
|
*dest = 0;
|
||||||
|
else if (*op > 127)
|
||||||
|
*dest = 255;
|
||||||
|
else
|
||||||
|
*dest = (uint8_t)(*op + 128);
|
||||||
|
op++;
|
||||||
|
dest++;
|
||||||
|
}
|
||||||
|
dest += (stride - 8);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
s->dsp.vp3_idct_add(s->all_fragments[i].coeffs,
|
s->dsp.add_pixels_clamped(output_samples,
|
||||||
dequantizer,
|
|
||||||
s->all_fragments[i].coeff_count,
|
|
||||||
output_plane + s->all_fragments[i].first_pixel,
|
output_plane + s->all_fragments[i].first_pixel,
|
||||||
stride);
|
stride);
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,8 +40,10 @@ void vp3_dsp_init_c(void)
|
||||||
/* nop */
|
/* nop */
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
|
void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
|
||||||
|
int coeff_count, int16_t *output_data)
|
||||||
{
|
{
|
||||||
|
int32_t dequantized_data[64];
|
||||||
int32_t *ip = dequantized_data;
|
int32_t *ip = dequantized_data;
|
||||||
int16_t *op = output_data;
|
int16_t *op = output_data;
|
||||||
|
|
||||||
|
@ -49,7 +51,13 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
|
||||||
int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
|
int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
|
||||||
int32_t t1, t2;
|
int32_t t1, t2;
|
||||||
|
|
||||||
int i;
|
int i, j;
|
||||||
|
|
||||||
|
/* de-zigzag and dequantize */
|
||||||
|
for (i = 0; i < coeff_count; i++) {
|
||||||
|
j = dezigzag_index[i];
|
||||||
|
dequantized_data[j] = dequant_matrix[i] * input_data[i];
|
||||||
|
}
|
||||||
|
|
||||||
/* Inverse DCT on the rows now */
|
/* Inverse DCT on the rows now */
|
||||||
for (i = 0; i < 8; i++) {
|
for (i = 0; i < 8; i++) {
|
||||||
|
@ -248,71 +256,3 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
|
||||||
op++;
|
op++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int32_t dequantized_data[64];
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
/* de-zigzag and dequantize */
|
|
||||||
for (i = 0; i < coeff_count; i++) {
|
|
||||||
j = dezigzag_index[i];
|
|
||||||
dequantized_data[j] = dequant_matrix[i] * input_data[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
vp3_idct_c(dequantized_data, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
if (*op < -128)
|
|
||||||
*dest = 0;
|
|
||||||
else if (*op > 127)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(*op + 128);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix,
|
|
||||||
int coeff_count, uint8_t *dest, int stride)
|
|
||||||
{
|
|
||||||
int32_t dequantized_data[64];
|
|
||||||
int16_t transformed_data[64];
|
|
||||||
int16_t *op;
|
|
||||||
int i, j;
|
|
||||||
int16_t sample;
|
|
||||||
|
|
||||||
/* de-zigzag and dequantize */
|
|
||||||
for (i = 0; i < coeff_count; i++) {
|
|
||||||
j = dezigzag_index[i];
|
|
||||||
dequantized_data[j] = dequant_matrix[i] * input_data[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
vp3_idct_c(dequantized_data, transformed_data);
|
|
||||||
|
|
||||||
/* place in final output */
|
|
||||||
op = transformed_data;
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
for (j = 0; j < 8; j++) {
|
|
||||||
sample = *dest + *op;
|
|
||||||
if (sample < 0)
|
|
||||||
*dest = 0;
|
|
||||||
else if (sample > 255)
|
|
||||||
*dest = 255;
|
|
||||||
else
|
|
||||||
*dest = (uint8_t)(sample & 0xFF);
|
|
||||||
op++;
|
|
||||||
dest++;
|
|
||||||
}
|
|
||||||
dest += (stride - 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in New Issue