mirror of https://git.ffmpeg.org/ffmpeg.git
Add mmxext version of VP8 DC Hadamard transform
Originally committed as revision 23878 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
37355fe823
commit
004cda8e79
|
@ -195,6 +195,7 @@ HVBILIN(ssse3, 8, 16, 16)
|
||||||
|
|
||||||
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||||
|
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
|
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
|
||||||
|
@ -237,6 +238,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||||
/* note that 4-tap width=16 functions are missing because w=16
|
/* note that 4-tap width=16 functions are missing because w=16
|
||||||
* is only used for luma, and luma is always a copy or sixtap. */
|
* is only used for luma, and luma is always a copy or sixtap. */
|
||||||
if (mm_flags & FF_MM_MMX2) {
|
if (mm_flags & FF_MM_MMX2) {
|
||||||
|
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;
|
||||||
VP8_LUMA_MC_FUNC(0, 16, mmxext);
|
VP8_LUMA_MC_FUNC(0, 16, mmxext);
|
||||||
VP8_MC_FUNC(1, 8, mmxext);
|
VP8_MC_FUNC(1, 8, mmxext);
|
||||||
VP8_MC_FUNC(1, 4, mmxext);
|
VP8_MC_FUNC(1, 4, mmxext);
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
;******************************************************************************
|
;******************************************************************************
|
||||||
|
|
||||||
%include "x86inc.asm"
|
%include "x86inc.asm"
|
||||||
|
%include "x86util.asm"
|
||||||
|
|
||||||
SECTION_RODATA
|
SECTION_RODATA
|
||||||
|
|
||||||
|
@ -141,6 +142,7 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
||||||
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||||
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
||||||
|
|
||||||
|
cextern pw_3
|
||||||
cextern pw_4
|
cextern pw_4
|
||||||
cextern pw_64
|
cextern pw_64
|
||||||
|
|
||||||
|
@ -920,3 +922,47 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
|
||||||
pextrd [r1], xmm2, 2
|
pextrd [r1], xmm2, 2
|
||||||
pextrd [r1+r2], xmm2, 3
|
pextrd [r1+r2], xmm2, 3
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; SCATTER_WHT lane
;   %1 = index of the 16-bit lane to extract from each of m0..m3.
; Copies lane %1 of m0, m1, m2 and m3 to four word-sized destinations at
; r0 + 2*16*0, r0 + 2*16*1, r0 + 2*16*2 and r0 + 2*16*3, i.e. 32 bytes apart.
; With the 16-bit stores used here, 32 bytes is 16 elements, which matches the
; innermost dimension of block[4][4][16] in the prototype comment, so each
; store lands on element 0 of a consecutive 16-element group.
; r1 and r2 are used as scratch registers and are overwritten.
; r0 itself is not modified; the caller advances it between invocations.
%macro SCATTER_WHT 1
|
||||||
|
; pextrw writes a 32-bit register; only the low word (r1w/r2w) is stored below.
pextrw r1d, m0, %1
|
||||||
|
pextrw r2d, m1, %1
|
||||||
|
mov [r0+2*16*0], r1w
|
||||||
|
mov [r0+2*16*1], r2w
|
||||||
|
; Same pattern for the remaining two registers, reusing the scratch registers.
pextrw r1d, m2, %1
|
||||||
|
pextrw r2d, m3, %1
|
||||||
|
mov [r0+2*16*2], r1w
|
||||||
|
mov [r0+2*16*3], r2w
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; HADAMARD4_1D a, b, c, d
;   %1..%4 = register numbers (used as m%1..m%4) holding the four inputs.
; Applies two SUMSUB_BADC stages to the four registers and then renames
; registers with SWAP so the results are addressed through the same four
; register numbers that were passed in.
; NOTE(review): SUMSUB_BADC and SWAP are defined outside this view
; (presumably x86util.asm / x86inc.asm); the exact sum/difference assignment
; and the resulting output order depend on their definitions - confirm there
; before reordering any operand.
%macro HADAMARD4_1D 4
|
||||||
|
; First stage: operates on the register pairs (%2, %1) and (%4, %3).
SUMSUB_BADC m%2, m%1, m%4, m%3
|
||||||
|
; Second stage: combines the first-stage results across the two pairs,
; pairing (%4, %2) and (%3, %1).
SUMSUB_BADC m%4, m%2, m%3, m%1
|
||||||
|
; Assembly-time renaming only; presumably emits no instructions - TODO confirm.
SWAP %1, %4, %3
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; vp8_luma_dc_wht_mmxext(block, dc)
; Reads 16 word-sized values from dc (four 8-byte loads), runs HADAMARD4_1D
; over them twice with a 4x4 word transpose in between, shifts every result
; right arithmetically by 3, and scatters the 16 results into block at a
; stride of 2*16 bytes, one result per 16-element group.
; NOTE(review): r0/r1 are taken to be the first/second argument (block/dc)
; under x86inc's cglobal naming, and "2,3" to mean 2 arguments and 3
; general-purpose registers - x86inc.asm is not part of this view; confirm.
; dc is only read here; it is not cleared by this function.
INIT_MMX
|
||||||
|
cglobal vp8_luma_dc_wht_mmxext, 2,3
|
||||||
|
; Load the 4x4 input as four rows of four words each.
movq m0, [r1]
|
||||||
|
movq m1, [r1+8]
|
||||||
|
movq m2, [r1+16]
|
||||||
|
movq m3, [r1+24]
|
||||||
|
; First 1-D pass across the four row registers.
HADAMARD4_1D 0, 1, 2, 3
|
||||||
|
; Transpose so the second pass works along the other dimension; m4 is scratch.
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||||
|
; Add the external constant pw_3 to m0 only, before the second pass.
; NOTE(review): presumably the rounding bias for the >>3 below, relying on the
; second pass to carry it into all four outputs - confirm against the C version.
paddw m0, [pw_3]
|
||||||
|
; Second 1-D pass.
HADAMARD4_1D 0, 1, 2, 3
|
||||||
|
; Arithmetic (sign-preserving) shift of every word by 3 bits.
psraw m0, 3
|
||||||
|
psraw m1, 3
|
||||||
|
psraw m2, 3
|
||||||
|
psraw m3, 3
|
||||||
|
; Scatter: each SCATTER_WHT n writes lane n of m0..m3 to four destinations
; 2*16 bytes apart; r0 then advances by 2*16*4 bytes (four such strides), so
; lane n of register k ends up at group index 4*n + k from the original r0.
SCATTER_WHT 0
|
||||||
|
add r0, 2*16*4
|
||||||
|
SCATTER_WHT 1
|
||||||
|
add r0, 2*16*4
|
||||||
|
SCATTER_WHT 2
|
||||||
|
add r0, 2*16*4
|
||||||
|
; Last group of four; no pointer advance is needed afterwards.
SCATTER_WHT 3
|
||||||
|
RET
|
||||||
|
|
Loading…
Reference in New Issue