H.264: split luma dc idct out and implement MMX/SSE2 versions

About 2.5x the speed.

NOTE: the way the asm code handles large qmul values is somewhat suboptimal.
If x264-style dequant were used (separate shift and qmul values), it might
be possible to get some extra speed.

Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2011-01-14 21:34:25 +00:00
parent 6c18f1cda2
commit 19fb234e4a
12 changed files with 227 additions and 65 deletions

View File

@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
const float *win, float add_bias, int len);
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);

View File

@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
return 0;
}
/**
 * In-place inverse transform and dequantization of the 16 luma DC values.
 *
 * The DC values are read from, and the results written back to, entries of
 * block[] whose indices are multiples of "stride" (16).
 *
 * @param block coefficient buffer, transformed in place
 * @param qp    quantization parameter (not used by this implementation)
 * @param qmul  dequantization multiplier; every result is (v*qmul + 128) >> 8
 */
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
#define stride 16
    static const int col_base[4] = { 0, 1*stride, 4*stride,  5*stride };
    static const int row_base[4] = { 0, 2*stride, 8*stride, 10*stride };
    int tmp[16]; //FIXME check if this is a good idea
    int n;

    /* first pass: combine the four DC values reachable from each row base */
    for (n = 0; n < 4; n++) {
        const DCTELEM *src = block + row_base[n];
        const int sum04  = src[stride*0] + src[stride*4];
        const int diff04 = src[stride*0] - src[stride*4];
        const int diff15 = src[stride*1] - src[stride*5];
        const int sum15  = src[stride*1] + src[stride*5];

        tmp[4*n + 0] = sum04  + sum15;
        tmp[4*n + 1] = diff04 + diff15;
        tmp[4*n + 2] = diff04 - diff15;
        tmp[4*n + 3] = sum04  - sum15;
    }

    /* second pass: combine across the intermediate rows, scale and store */
    for (n = 0; n < 4; n++) {
        DCTELEM *dst = block + col_base[n];
        const int even_sum  = tmp[4*0 + n] + tmp[4*2 + n];
        const int even_diff = tmp[4*0 + n] - tmp[4*2 + n];
        const int odd_diff  = tmp[4*1 + n] - tmp[4*3 + n];
        const int odd_sum   = tmp[4*1 + n] + tmp[4*3 + n];

        //FIXME think about merging this into decode_residual
        dst[stride*0 ] = ((even_sum  + odd_sum ) * qmul + 128) >> 8;
        dst[stride*2 ] = ((even_diff + odd_diff) * qmul + 128) >> 8;
        dst[stride*8 ] = ((even_diff - odd_diff) * qmul + 128) >> 8;
        dst[stride*10] = ((even_sum  - odd_sum ) * qmul + 128) >> 8;
    }
}
#if 0
/**
* DCT transforms the 16 dc values.
@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
if(is_h264){
if(!transform_bypass)
h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
else{
static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
for(i = 0; i < 16; i++)
h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
}
}else
ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
}
if(h->deblocking_filter)
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);

View File

@ -406,6 +406,7 @@ typedef struct H264Context{
GetBitContext *inter_gb_ptr;
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
/**
@ -600,10 +601,6 @@ typedef struct H264Context{
extern const uint8_t ff_h264_chroma_qp[52];
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
/**
* Decode SEI
*/

View File

@ -1597,17 +1597,15 @@ decode_intra_mb:
s->current_picture.mb_type[mb_xy]= mb_type;
if( cbp || IS_INTRA16x16( mb_type ) ) {
const uint8_t *scan, *scan8x8, *dc_scan;
const uint8_t *scan, *scan8x8;
const uint32_t *qmul;
if(IS_INTERLACED(mb_type)){
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
scan= s->qscale ? h->field_scan : h->field_scan_q0;
dc_scan= luma_dc_field_scan;
}else{
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
dc_scan= luma_dc_zigzag_scan;
}
// decode_cabac_mb_dqp
@ -1642,7 +1640,9 @@ decode_intra_mb:
if( IS_INTRA16x16( mb_type ) ) {
int i;
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
AV_ZERO128(h->mb_luma_dc+0);
AV_ZERO128(h->mb_luma_dc+8);
decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
if( cbp&15 ) {
qmul = h->dequant4_coeff[0][s->qscale];

View File

@ -911,16 +911,14 @@ decode_intra_mb:
int i8x8, i4x4, chroma_idx;
int dquant;
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
const uint8_t *scan, *scan8x8, *dc_scan;
const uint8_t *scan, *scan8x8;
if(IS_INTERLACED(mb_type)){
scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
scan= s->qscale ? h->field_scan : h->field_scan_q0;
dc_scan= luma_dc_field_scan;
}else{
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
dc_scan= luma_dc_zigzag_scan;
}
dquant= get_se_golomb(&s->gb);
@ -939,7 +937,9 @@ decode_intra_mb:
h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
if(IS_INTRA16x16(mb_type)){
if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
AV_ZERO128(h->mb_luma_dc+0);
AV_ZERO128(h->mb_luma_dc+8);
if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
return -1; //FIXME continue if partitioned and other return -1 too
}

View File

@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
c->h264_idct8_add4 = ff_h264_idct8_add4_c;
c->h264_idct_add8 = ff_h264_idct_add8_c;
c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;

View File

@ -65,11 +65,13 @@ typedef struct H264DSPContext{
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_dct)(DCTELEM block[4][4]);
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
}H264DSPContext;
void ff_h264dsp_init(H264DSPContext *c);

View File

@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
}
}
/**
 * IDCT transforms the 16 luma DC values and dequantizes them.
 *
 * @param output destination coefficient buffer; the 16 results are scattered
 *               to the indices 0*16, 1*16, ..., 15*16 (one per 16-entry block)
 * @param input  the 16 DC coefficients, read as a packed array of 16 entries
 * @param qmul   dequantization multiplier; every result is
 *               (value*qmul + 128) >> 8
 */
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
#define stride 16
    int i;
    int temp[16];
    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};

    /* first pass: butterflies over each group of four consecutive inputs */
    for(i=0; i<4; i++){
        const int z0= input[4*i+0] + input[4*i+1];
        const int z1= input[4*i+0] - input[4*i+1];
        const int z2= input[4*i+2] - input[4*i+3];
        const int z3= input[4*i+2] + input[4*i+3];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z0-z3;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z1+z2;
    }

    /* second pass: butterflies across the groups, then scale by qmul with
     * rounding and scatter each value to its destination block */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    }
    /* do not leak the helper macro: "stride" is a common parameter name */
#undef stride
}

View File

@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
};
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
{
const int qmul = svq3_dequant_coeff[qp];
#define stride 16
int i;
int temp[16];
static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
for (i = 0; i < 4; i++){
const int offset = y_offset[i];
const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]);
const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]);
const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5];
const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5];
const int z0= 13*(input[4*i+0] + input[4*i+1]);
const int z1= 13*(input[4*i+0] - input[4*i+1]);
const int z2= 7* input[4*i+2] - 17*input[4*i+3];
const int z3= 17* input[4*i+2] + 7*input[4*i+3];
temp[4*i+0] = z0+z3;
temp[4*i+1] = z1+z2;
@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
}
}
#undef stride

View File

@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
/* 16-byte xmm_reg constant: aligned to 16 like ff_pw_4/ff_pw_5, so that it can be used as an aligned SSE memory operand */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};

View File

@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
%endif
cextern pw_32
cextern pw_1
SECTION .text
@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
add8_sse2_cycle 2, 0x21
add8_sse2_cycle 3, 0x29
RET
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
; One 1-D transform pass over the four registers m%1..m%4; m%5 is handed to
; SUMSUB_BADC as its fifth operand.
; NOTE(review): the macro name and the two SUMSUB_BADC stages suggest a
; 4-point Walsh-Hadamard butterfly with m%5 as scratch; SUMSUB_BADC is defined
; outside this chunk, so confirm its exact semantics there.
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
; permute the register numbering of %1, %4 and %3 -- presumably so that the
; results are again addressed in order as %1..%4 (confirm against SWAP)
SWAP %1, %4, %3
%endmacro
; Dequantize the eight 16-bit values held in %1 and %2 (four words each).
;   %1, %2 : MMX registers; overwritten with the dequantized words
;   %3     : right-shift count applied after the multiply (immediate or register)
; Reads t3d: its low word is the multiplier, its high word is added to every
; product (the caller stores the rounding term there).
; Clobbers m4, m5 and m7.
%macro DEQUANT_MMX 3
; interleave every value with a constant 1, giving (value, 1) word pairs;
; low halves stay in %1/%2, high halves go to m4/m5
mova m7, [pw_1]
mova m4, %1
punpcklwd %1, m7
punpckhwd m4, m7
mova m5, %2
punpcklwd %2, m7
punpckhwd m5, m7
; broadcast t3d into both dwords of m7
movd m7, t3d
punpckldq m7, m7
; per pair: value*lo16(t3d) + 1*hi16(t3d), as a 32-bit result
pmaddwd %1, m7
pmaddwd %2, m7
pmaddwd m4, m7
pmaddwd m5, m7
; arithmetic right shift of the 32-bit results
psrad %1, %3
psrad %2, %3
psrad m4, %3
psrad m5, %3
; narrow back to 16 bits with signed saturation, restoring the word order
packssdw %1, m4
packssdw %2, m5
%endmacro
; Scatter the four words of MMX register %1 to memory based at t2.
;   %2..%5 : destination indices for words 0..3 (in that order); each index
;            is scaled by 32 bytes
; Destroys %1, t0 and t1.
%macro STORE_WORDS_MMX 5
; t0d = words 0,1   t1d = words 2,3
movd t0d, %1
psrlq %1, 32
movd t1d, %1
; low halves: word 0 -> %2, word 2 -> %4
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
; high halves: word 1 -> %3, word 3 -> %5
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
%endmacro
; MMX path: dequantize m0..m3 (two registers at a time) with shift %1 and
; scatter the 16 resulting words to the output indices listed below
; (each index is scaled by 32 bytes inside STORE_WORDS_MMX).
;   %1 : shift count forwarded to DEQUANT_MMX
%macro DEQUANT_STORE_MMX 1
DEQUANT_MMX m0, m1, %1
STORE_WORDS_MMX m0, 0, 1, 4, 5
STORE_WORDS_MMX m1, 2, 3, 6, 7
DEQUANT_MMX m2, m3, %1
STORE_WORDS_MMX m2, 8, 9, 12, 13
STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro
; Scatter the eight words of XMM register %1 to memory based at t2.
;   %2..%9 : destination indices for words 0..7 (in that order); each index
;            is scaled by 32 bytes
; Destroys %1, t0 and t1.
%macro STORE_WORDS_SSE 9
; t0d = words 0,1   t1d = words 2,3
movd t0d, %1
psrldq %1, 4
movd t1d, %1
psrldq %1, 4
; word 0 -> %2, word 2 -> %4, then word 1 -> %3, word 3 -> %5
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
; t0d = words 4,5   t1d = words 6,7
movd t0d, %1
psrldq %1, 4
movd t1d, %1
; word 4 -> %6, word 6 -> %8, then word 5 -> %7, word 7 -> %9
mov [t2+%6*32], t0w
mov [t2+%8*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%7*32], t0w
mov [t2+%9*32], t1w
%endmacro
; SSE2 path: dequantize the 16 words held in MMX registers m0..m3 and scatter
; them to the output.
;   %1 : right-shift count applied after the multiply (immediate or register)
; Reads t3d (low word = multiplier, high word = term added to every product).
; Uses xmm0..xmm5.
%macro DEQUANT_STORE_SSE2 1
; xmm4 = t3d broadcast to all four dwords; xmm5 low half = words of 1
movd xmm4, t3d
movq xmm5, [pw_1]
pshufd xmm4, xmm4, 0
; move the four transformed rows from the MMX registers into XMM registers
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
; interleave every value with a constant 1, giving (value, 1) word pairs
punpcklwd xmm0, xmm5
punpcklwd xmm1, xmm5
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm5
; per pair: value*lo16(t3d) + 1*hi16(t3d), as a 32-bit result
pmaddwd xmm0, xmm4
pmaddwd xmm1, xmm4
pmaddwd xmm2, xmm4
pmaddwd xmm3, xmm4
psrad xmm0, %1
psrad xmm1, %1
psrad xmm2, %1
psrad xmm3, %1
; narrow with signed saturation: xmm0 = rows 0,1   xmm2 = rows 2,3
packssdw xmm0, xmm1
packssdw xmm2, xmm3
STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro
; Emits the function h264_luma_dc_dequant_idct_<%1>.
;   %1 : variant suffix, mmx or sse2 (selects the dequant/store macro)
;   %2 : last argument forwarded to cglobal
;        NOTE(review): presumably the number of XMM registers used -- confirm
;        against the cglobal definition in x86inc.
; On entry r0 = output, r1 = input, r2 = qmul (see the register mapping below).
%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
; load the four rows (4 words each) of the input into m0..m3
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
movq m0, [r1+ 0]
; 2-D transform: 1-D pass, transpose, 1-D pass
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
WALSH4_1D 0,1,2,3,4
; temporaries, in order t0..t3:
; shift, tmp, output, qmul
%ifdef WIN64
DECLARE_REG_TMP 0,3,1,2
; we can't avoid this, because r0 is the shift register (ecx) on win64
xchg r0, t2
%elifdef ARCH_X86_64
DECLARE_REG_TMP 3,1,0,2
%else
DECLARE_REG_TMP 1,3,0,2
%endif
; the dequant macros use the low word of t3d as a pmaddwd multiplier, so
; qmul values above 32767 take the separate path below
cmp t3d, 32767
jg .big_qmul
; put the rounding constant 128 into the high word of t3d; the dequant macros
; add that word to every product before shifting right by 8
add t3d, 128 << 16
%ifidn %1,mmx
DEQUANT_STORE_MMX 8
%else
DEQUANT_STORE_SSE2 8
%endif
RET
.big_qmul:
; t0 = min(index of the highest set bit of qmul, 7)
bsr t0d, t3d
add t3d, 128 << 16
mov t1d, 7
cmp t0d, t1d
cmovg t0d, t1d
inc t1d
; scale qmul and the rounding term down together by t0 bits ...
shr t3d, t0b
; ... and apply only the remaining 8 - t0 bits of shift after the multiply
sub t1d, t0d
%ifidn %1,mmx
movd m6, t1d
DEQUANT_STORE_MMX m6
%else
movd xmm6, t1d
DEQUANT_STORE_SSE2 xmm6
%endif
RET
%endmacro
; Instantiate both variants under INIT_MMX: the transform part of
; IDCT_DC_DEQUANT works on m0..m3 loaded with movq, and the sse2 variant names
; its xmm registers explicitly (it copies m0..m3 over with movq2dq).
INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7

View File

@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
int stride, const uint8_t nnzc[6*8]);
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
/***********************************/
/* deblocking */
@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
if (mm_flags&AV_CPU_FLAG_SSE2) {
c->h264_idct8_add = ff_h264_idct8_add_sse2;
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;