diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 30a4c7d80a..e4a44b0dce 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -835,8 +835,6 @@ static void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb int nnz_pred, nnz, nnz_total = 0; int segment = s->segment; - s->dsp.clear_blocks((DCTELEM *)s->block); - if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { AV_ZERO128(dc); AV_ZERO128(dc+8); diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index 482d62af1b..3e6463d598 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -69,6 +69,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride) t1 = block[0*4+i] - block[2*4+i]; t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]); t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]); + block[0*4+i] = 0; + block[1*4+i] = 0; + block[2*4+i] = 0; + block[3*4+i] = 0; tmp[i*4+0] = t0 + t3; tmp[i*4+1] = t1 + t2; @@ -94,6 +98,7 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride) { int i, dc = (block[0] + 4) >> 3; uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; + block[0] = 0; for (i = 0; i < 4; i++) { dst[0] = cm[dst[0]]; diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index fad399fba8..6cf1704594 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -222,6 +222,7 @@ extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); +extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); #define DECLARE_LOOP_FILTER(NAME)\ extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ @@ -328,6 +329,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) } if (mm_flags & FF_MM_SSE) { + c->vp8_idct_add = ff_vp8_idct_add_sse; c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 843873167d..0cf4771abd 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -913,6 +913,7 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 paddw mm0, [pw_4] pxor mm1, mm1 psraw mm0, 3 + movd [r1], mm1 psubw mm1, mm0 packuswb mm0, mm0 packuswb mm1, mm1 @@ -944,11 +945,12 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 ; load data movd xmm0, [r1] - lea r1, [r0+r2*2] pxor xmm1, xmm1 ; calculate DC paddw xmm0, [pw_4] + movd [r1], xmm1 + lea r1, [r0+r2*2] movd xmm2, [r0] movd xmm3, [r0+r2] movd xmm4, [r1] @@ -1005,14 +1007,26 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 %endmacro INIT_MMX -cglobal vp8_idct_add_mmx, 3, 3 +%macro VP8_IDCT_ADD 1 +cglobal vp8_idct_add_%1, 3, 3 ; load block data - movq m0, [r1] - movq m1, [r1+8] + movq m0, [r1+ 0] + movq m1, [r1+ 8] movq m2, [r1+16] movq m3, [r1+24] movq m6, [pw_20091] movq m7, [pw_17734] +%ifidn %1, sse + xorps xmm0, xmm0 + movaps [r1+ 0], xmm0 + movaps [r1+16], xmm0 +%else + pxor m4, m4 + movq [r1+ 0], m4 + movq [r1+ 8], m4 + movq [r1+16], m4 + movq [r1+24], m4 +%endif ; actual IDCT VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 @@ -1028,6 +1042,10 @@ cglobal vp8_idct_add_mmx, 3, 3 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 RET +%endmacro + +VP8_IDCT_ADD mmx +VP8_IDCT_ADD sse ;----------------------------------------------------------------------------- ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])