diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index fce0b81634..7b554b1fd0 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3126,8 +3126,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) /* VP3 DSP support */ c->vp3_dsp_init = vp3_dsp_init_c; - c->vp3_idct_put = vp3_idct_put_c; - c->vp3_idct_add = vp3_idct_add_c; + c->vp3_idct = vp3_idct_c; c->get_pixels = get_pixels_c; c->diff_pixels = diff_pixels_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index b5468724f1..0307dbd6ab 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -62,23 +62,16 @@ extern uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; /* VP3 DSP functions */ void vp3_dsp_init_c(void); -void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); +void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); void vp3_dsp_init_mmx(void); -void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); +void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); void vp3_dsp_init_sse2(void); -void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); - +void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); /* minimum alignment rules ;) if u notice errors in the align stuff, need more alignment for some asm code for some cpu @@ -318,32 +311,16 @@ typedef struct DSPContext { /** * This function is responsible for taking a block of zigzag'd, - * quantized DCT coefficients, reconstructing the original block of - * samples, and placing it into the output. + * quantized DCT coefficients and reconstructing the original block of + * samples. * @param input_data 64 zigzag'd, quantized DCT coefficients * @param dequant_matrix 64 zigzag'd quantizer coefficients * @param coeff_count index of the last coefficient - * @param dest the final output location where the transformed samples - * are to be placed - * @param stride the width in 8-bit samples of a line on this plane + * @param output_samples space for 64 DCTELEMs where the transformed + * samples will be stored */ - void (*vp3_idct_put)(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); - - /** - * This function is responsible for taking a block of zigzag'd, - * quantized DCT coefficients, reconstructing the original block of - * samples, and adding the transformed samples to an existing block of - * samples in the output. - * @param input_data 64 zigzag'd, quantized DCT coefficients - * @param dequant_matrix 64 zigzag'd quantizer coefficients - * @param coeff_count index of the last coefficient - * @param dest the final output location where the transformed samples - * are to be placed - * @param stride the width in 8-bit samples of a line on this plane - */ - void (*vp3_idct_add)(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); + void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_samples); } DSPContext; diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 772c9c1f03..61bfc89ac5 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -2149,14 +2149,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) /* VP3 optimized DSP functions */ if (mm_flags & MM_SSE2) { c->vp3_dsp_init = vp3_dsp_init_sse2; - c->vp3_idct_put = vp3_idct_put_sse2; - c->vp3_idct_add = vp3_idct_add_sse2; + c->vp3_idct = vp3_idct_sse2; } else { c->vp3_dsp_init = vp3_dsp_init_mmx; - c->vp3_idct_put = vp3_idct_put_mmx; - c->vp3_idct_add = vp3_idct_add_mmx; + c->vp3_idct = vp3_idct_mmx; } - + #ifdef CONFIG_ENCODERS c->get_pixels = get_pixels_mmx; c->diff_pixels = diff_pixels_mmx; diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/i386/vp3dsp_mmx.c index 76007a1d16..319e57f1bb 100644 --- a/libavcodec/i386/vp3dsp_mmx.c +++ b/libavcodec/i386/vp3dsp_mmx.c @@ -279,8 +279,8 @@ void vp3_dsp_init_mmx(void) idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift; } -static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, - int16_t *output_data) +void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { /* eax = quantized input * ebx = dequantizer matrix @@ -563,79 +563,3 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, #undef J } - -void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - - vp3_idct_mmx(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - movq_m2r(*vector128, mm0); - for (i = 0; i < 8; i++) { -#if 1 - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); -#else -/* prototype optimization */ - pxor_r2r(mm1, mm1); - packsswb_m2r(*(op + 4), mm1); - movq_r2r(mm1, mm2); - psrlq_i2r(32, mm2); - packsswb_m2r(*(op + 0), mm1); - op += 8; - por_r2r(mm2, mm1); - paddb_r2r(mm0, mm1); - movq_r2m(mm1, *dest); - dest += stride; -#endif - } - - /* be a good MMX citizen */ - emms(); -} - -void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - vp3_idct_mmx(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } - - /* be a good MMX citizen */ - emms(); -} diff --git a/libavcodec/i386/vp3dsp_sse2.c b/libavcodec/i386/vp3dsp_sse2.c index c8f9158afb..6adfd2f9fa 100644 --- a/libavcodec/i386/vp3dsp_sse2.c +++ b/libavcodec/i386/vp3dsp_sse2.c @@ -799,11 +799,12 @@ static unsigned short __align16 SSE2_idct_data[7 * 8] = void vp3_dsp_init_sse2(void) { /* nop */ +av_log(NULL, AV_LOG_INFO, "Hey! SSE2!\n"); } -static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, - int16_t *output_data) +void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { unsigned char *input_bytes = (unsigned char *)input_data; unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix; @@ -832,59 +833,3 @@ static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, SSE2_Column_IDCT(); } - - -void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - - vp3_idct_sse2(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); - } -} - - -void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - vp3_idct_sse2(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } -} diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index 0667d99eb8..cf22ee6ce0 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -2051,6 +2051,7 @@ static void render_fragments(Vp3DecodeContext *s, int m, n; int i = first_fragment; int16_t *dequantizer; + DCTELEM __align16 output_samples[64]; unsigned char *output_plane; unsigned char *last_plane; unsigned char *golden_plane; @@ -2060,6 +2061,10 @@ static void render_fragments(Vp3DecodeContext *s, int motion_halfpel_index; uint8_t *motion_source; + int16_t *op; + uint8_t *dest; + int j, k; + debug_vp3(" vp3: rendering final fragments for %s\n", (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane"); @@ -2176,16 +2181,29 @@ av_log(s->avctx, AV_LOG_ERROR, " help! got beefy vector! (%X, %X)\n", motion_x, s->all_fragments[i].coeffs[0], dequantizer[0]); /* invert DCT and place (or add) in final output */ + s->dsp.vp3_idct(s->all_fragments[i].coeffs, + dequantizer, + s->all_fragments[i].coeff_count, + output_samples); if (s->all_fragments[i].coding_method == MODE_INTRA) { - s->dsp.vp3_idct_put(s->all_fragments[i].coeffs, - dequantizer, - s->all_fragments[i].coeff_count, - output_plane + s->all_fragments[i].first_pixel, - stride); + /* this really needs to be optimized sooner or later */ + op = output_samples; + dest = output_plane + s->all_fragments[i].first_pixel; + for (j = 0; j < 8; j++) { + for (k = 0; k < 8; k++) { + if (*op < -128) + *dest = 0; + else if (*op > 127) + *dest = 255; + else + *dest = (uint8_t)(*op + 128); + op++; + dest++; + } + dest += (stride - 8); + } } else { - s->dsp.vp3_idct_add(s->all_fragments[i].coeffs, - dequantizer, - s->all_fragments[i].coeff_count, + s->dsp.add_pixels_clamped(output_samples, output_plane + s->all_fragments[i].first_pixel, stride); } diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c index ec62d9456d..3ead732803 100644 --- a/libavcodec/vp3dsp.c +++ b/libavcodec/vp3dsp.c @@ -40,8 +40,10 @@ void vp3_dsp_init_c(void) /* nop */ } -static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data) +void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { + int32_t dequantized_data[64]; int32_t *ip = dequantized_data; int16_t *op = output_data; @@ -49,7 +51,13 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data) int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd; int32_t t1, t2; - int i; + int i, j; + + /* de-zigzag and dequantize */ + for (i = 0; i < coeff_count; i++) { + j = dezigzag_index[i]; + dequantized_data[j] = dequant_matrix[i] * input_data[i]; + } /* Inverse DCT on the rows now */ for (i = 0; i < 8; i++) { @@ -248,71 +256,3 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data) op++; } } - -void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int32_t dequantized_data[64]; - int16_t transformed_data[64]; - int16_t *op; - int i, j; - - /* de-zigzag and dequantize */ - for (i = 0; i < coeff_count; i++) { - j = dezigzag_index[i]; - dequantized_data[j] = dequant_matrix[i] * input_data[i]; - } - - vp3_idct_c(dequantized_data, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); - } -} - -void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int32_t dequantized_data[64]; - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - /* de-zigzag and dequantize */ - for (i = 0; i < coeff_count; i++) { - j = dezigzag_index[i]; - dequantized_data[j] = dequant_matrix[i] * input_data[i]; - } - - vp3_idct_c(dequantized_data, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } -}