mirror of https://git.ffmpeg.org/ffmpeg.git
MMX idct_add for VP8.
Originally committed as revision 23886 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
29e719377f
commit
2dd2f71692
|
@ -196,6 +196,7 @@ HVBILIN(ssse3, 8, 16, 16)
|
|||
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
#endif
|
||||
|
||||
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
|
||||
|
@ -229,6 +230,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
|||
#if HAVE_YASM
|
||||
if (mm_flags & FF_MM_MMX) {
|
||||
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
|
||||
c->vp8_idct_add = ff_vp8_idct_add_mmx;
|
||||
c->put_vp8_epel_pixels_tab[0][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
|
||||
c->put_vp8_epel_pixels_tab[1][0][0] =
|
||||
|
|
|
@ -142,6 +142,9 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
|||
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
||||
|
||||
; word constants for the VP8 IDCT multiplies (expected in m6/m7 by the IDCT macros):
; 20091, and 17734 which is half of 35468 (the input is doubled before multiplying by it)
pw_20091: times 4 dw 20091
|
||||
pw_17734: times 4 dw 17734
|
||||
|
||||
cextern pw_3
|
||||
cextern pw_4
|
||||
cextern pw_64
|
||||
|
@ -923,6 +926,92 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
|||
pextrd [r1+r2], xmm2, 3
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; in-place sum and difference of two registers of packed words
; calculate %1=%2+%1; %2=%2-%1 (the %1 subtracted is its value on entry), with %3=temp register
; NOTE(review): the transform macro visible in this section invokes SUMSUB_BA, not SUMSUB;
; confirm whether this local macro has any users
|
||||
%macro SUMSUB 3
|
||||
; keep the entry value of %1 in %3, because paddw overwrites %1 before the subtraction
mova %3, %1
|
||||
paddw %1, %2
|
||||
psubw %2, %3
|
||||
%endmacro
|
||||
|
||||
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; (both results use the values %1 and %2 had on entry)
|
||||
; this macro assumes that m6/m7 have words for 20091/17734 loaded
; %3/%4 are scratch registers and are overwritten
; mul_20091(x) is formed as x + pmulhw(x, m6)
; mul_35468(x) is formed as pmulhw(x<<1, m7); 35468 = 2*17734 and would not fit in a signed word
|
||||
%macro VP8_MULTIPLY_SUMSUB 4
|
||||
mova %3, %1
|
||||
mova %4, %2
|
||||
pmulhw %3, m6 ;high word of (entry %1)*20091
|
||||
pmulhw %4, m6 ;high word of (entry %2)*20091
|
||||
; adding the input itself completes mul_20091: %3 = mul_20091(%1), %4 = mul_20091(%2)
paddw %3, %1
|
||||
paddw %4, %2
|
||||
; double the inputs so that multiplying by 17734 acts as multiplying by 35468
psllw %1, 1
|
||||
psllw %2, 1
|
||||
pmulhw %1, m7 ;%1 = mul_35468(entry %1)
|
||||
pmulhw %2, m7 ;%2 = mul_35468(entry %2)
|
||||
; cross-combine: %1 takes the 20091 term of %2, %2 takes the 20091 term of %1
psubw %1, %4
|
||||
paddw %2, %3
|
||||
%endmacro
|
||||
|
||||
; one 1-D pass of the VP8 4x4 inverse transform over four registers of packed words
; arguments are register numbers (used as m%N), not register names
; calculate x0=%1+%3; x1=%1-%3
|
||||
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
|
||||
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
|
||||
; %5/%6 are temporary registers
|
||||
; we assume m6/m7 have constant words 20091/17734 loaded in them
; NOTE(review): SUMSUB_BA and SWAP are not defined in this section; the comments below assume
; SUMSUB_BA a, b, tmp gives a=a+b, b=b-a(entry), and SWAP exchanges two register names at
; assembly time -- confirm against x86util.asm / x86inc.asm
|
||||
%macro VP8_IDCT_TRANSFORM4x4_1D 6
|
||||
SUMSUB_BA m%3, m%1, m%5 ;t0, t1 (x0 and x1 of the header comment)
|
||||
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 (x2 and x3 of the header comment)
|
||||
SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
|
||||
SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
|
||||
; rename registers so that tmp0..tmp3 end up under the names %1..%4, as the header promises
SWAP %4, %1
|
||||
SWAP %4, %3
|
||||
%endmacro
|
||||
|
||||
; transpose a 4x4 table
; %1-%4 are the register numbers of the four inputs, %5 is a temporary
; NOTE(review): vp8_idct_add_mmx in this section calls TRANSPOSE4x4W, not this macro;
; confirm whether TRANSPOSE4x4 has any users
|
||||
%macro TRANSPOSE4x4 5 ; columns are produced in %1/%4/%5/%3, then renamed by the SWAPs at the end
|
||||
mova m%5, m%1
|
||||
punpcklwd m%1, m%2
|
||||
punpckhwd m%5, m%2
|
||||
mova m%2, m%3
|
||||
punpcklwd m%3, m%4
|
||||
punpckhwd m%2, m%4
|
||||
mova m%4, m%1
|
||||
punpckldq m%1, m%3 ;col0
|
||||
punpckhdq m%4, m%3 ;col1
|
||||
mova m%3, m%5
|
||||
punpckldq m%5, m%2 ;col2
|
||||
punpckhdq m%3, m%2 ;col3
|
||||
; assuming SWAP exchanges register names (x86inc -- confirm), the three SWAPs below leave
; col0..col3 under the names %1..%4 and the leftover scratch register under %5
SWAP %4, %2
|
||||
SWAP %4, %5
|
||||
SWAP %4, %3
|
||||
%endmacro
|
||||
|
||||
; vp8_idct_add_mmx: run the 4x4 inverse transform on block and add the result to dst
; register use in the body: r0 = destination rows, r1 = block (later reused as r0+2*stride),
; r2 = row stride; m0-m3 = data rows, m4/m5 = temporaries, m6/m7 = constants then temporaries
; NOTE(review): the C prototype declares stride as int, while r2 is used as a full-width
; register in address computations; confirm the upper bits are valid on 64-bit targets
INIT_MMX
|
||||
cglobal vp8_idct_add_mmx, 3, 3
|
||||
; load block data
; (32 bytes in total, one 8-byte group of four words per register)
|
||||
movq m0, [r1]
|
||||
movq m1, [r1+8]
|
||||
movq m2, [r1+16]
|
||||
movq m3, [r1+24]
|
||||
; constants that the transform macros expect in m6/m7
movq m6, [pw_20091]
|
||||
movq m7, [pw_17734]
|
||||

||||
; actual IDCT
; two 1-D passes, each followed by a transpose (TRANSPOSE4x4W is defined outside this section)
|
||||
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
; rounding bias for the final >>3: m0 feeds both x0 and x1 of the second pass,
; so adding 4 here reaches all four of its outputs
paddw m0, [pw_4]
|
||||
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||

||||
; store
; m4 = zero for byte<->word conversion; m6/m7 no longer hold constants and serve as scratch;
; r1 is repointed at the third destination row because the block pointer is no longer needed
|
||||
pxor m4, m4
|
||||
lea r1, [r0+2*r2]
|
||||
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
|
||||
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
|
||||

||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
|
||||
;-----------------------------------------------------------------------------
|
||||
|
|
|
@ -365,3 +365,18 @@
|
|||
packuswb %1, %1
|
||||
movh %4, %1
|
||||
%endmacro
|
||||
|
||||
; add two registers of word residuals to two rows of bytes in memory and store the rows back
; %1/%2 = residuals for the rows at [%7] and [%7+%8]; they are shifted in place, so their
;         entry values are lost
; %3/%4 = scratch registers, %5 = register holding zero, %6 = arithmetic right-shift count,
; %7 = address of the first row, %8 = offset from the first row to the second
; the amount of data moved per row is whatever movh transfers (defined outside this section)
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
|
||||
movh %3, [%7]
|
||||
movh %4, [%7+%8]
|
||||
; interleave with zero to widen the loaded bytes to words
punpcklbw %3, %5
|
||||
punpcklbw %4, %5
|
||||
psraw %1, %6
|
||||
psraw %2, %6
|
||||
paddw %3, %1
|
||||
paddw %4, %2
|
||||
; pack the words back to bytes with unsigned saturation
packuswb %3, %5
|
||||
packuswb %4, %5
|
||||
movh [%7], %3
|
||||
movh [%7+%8], %4
|
||||
%endmacro
|
||||
|
|
Loading…
Reference in New Issue