From 6a786b15c34765ec00be3cd808dafbb041fd5881 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Mon, 21 Feb 2011 09:07:13 -0500
Subject: [PATCH] VC1: merge idct8x8, coeff adjustments and put_pixels.

Merging these functions allows merging some loops, which makes the
results (particularly after SIMD optimizations) much faster.
(cherry picked from commit f8bed30d8b176fa030f6737765338bb4a2bcabc9)

[review note: in the AltiVec range-reduction path the results of
vec_sub()/vec_sl() are now stored back into the row vectors; the
intrinsics return their result and leave their operands untouched, so
the bare calls had no effect. The 64 bias is built from a signed splat
so that its type matches the "vector signed short" declaration.]
---
 libavcodec/ppc/vc1dsp_altivec.c | 64 +++++++++++++++++++++++++++++++--
 libavcodec/vc1.c                | 28 ++++++++++++++-
 libavcodec/vc1dec.c             | 58 +++++++++++++++++-------------
 libavcodec/vc1dsp.c             | 54 ++++++++++++++++++++++------
 libavcodec/vc1dsp.h             |  6 +++-
 5 files changed, 170 insertions(+), 40 deletions(-)

diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index 5a0dddbe1d..b5cc71c3cf 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -130,7 +130,8 @@ do { \
 
 /** Do inverse transform on 8x8 block
 */
-static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
+static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
+                                      int sign, int rangered)
 {
     vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
     vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -144,7 +145,9 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
     const vector unsigned int vec_2 = vec_splat_u32(2);
     const vector signed int vec_1s = vec_splat_s32(1);
     const vector unsigned int vec_1 = vec_splat_u32(1);
-
+    const vector unsigned short rangered_shift = vec_splat_u16(1);
+    const vector signed short signed_bias = vec_sl(vec_splat_s16(4),
+                                                   vec_splat_u16(4)); /* 4 << 4 = 64 */
 
     src0 = vec_ld(  0, block);
     src1 = vec_ld( 16, block);
@@ -214,6 +217,27 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
     src6 = vec_pack(sE, s6);
     src7 = vec_pack(sF, s7);
 
+    if (rangered) { /* same adjustment as the scalar (x - sub) << shl path */
+        if (!sign) { /* unsigned put: remove the 64 bias before doubling */
+            src0 = vec_sub(src0, signed_bias);
+            src1 = vec_sub(src1, signed_bias);
+            src2 = vec_sub(src2, signed_bias);
+            src3 = vec_sub(src3, signed_bias);
+            src4 = vec_sub(src4, signed_bias);
+            src5 = vec_sub(src5, signed_bias);
+            src6 = vec_sub(src6, signed_bias);
+            src7 = vec_sub(src7, signed_bias);
+        }
+        src0 = vec_sl(src0, rangered_shift); /* intrinsics return the result; store it back */
+        src1 = vec_sl(src1, rangered_shift);
+        src2 = vec_sl(src2, rangered_shift);
+        src3 = vec_sl(src3, rangered_shift);
+        src4 = vec_sl(src4, rangered_shift);
+        src5 = vec_sl(src5, rangered_shift);
+        src6 = vec_sl(src6, rangered_shift);
+        src7 = vec_sl(src7, rangered_shift);
+    }
+
     vec_st(src0,  0, block);
     vec_st(src1, 16, block);
     vec_st(src2, 32, block);
@@ -224,6 +248,36 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
     vec_st(src7,112, block);
 }
 
+static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride, DCTELEM *b)
+{
+    vc1_inv_trans_8x8_altivec(b, 0, 0);
+    ff_add_pixels_clamped_c(b, dest, stride);
+}
+
+static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride, DCTELEM *b)
+{
+    vc1_inv_trans_8x8_altivec(b, 1, 0);
+    ff_put_signed_pixels_clamped_c(b, dest, stride);
+}
+
+static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
+{
+    vc1_inv_trans_8x8_altivec(b, 1, 1);
+    ff_put_signed_pixels_clamped_c(b, dest, stride);
+}
+
+static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride, DCTELEM *b)
+{
+    vc1_inv_trans_8x8_altivec(b, 0, 0);
+    ff_put_pixels_clamped_c(b, dest, stride);
+}
+
+static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
+{
+    vc1_inv_trans_8x8_altivec(b, 0, 1);
+    ff_put_pixels_clamped_c(b, dest, stride);
+}
+
 /** Do inverse transform on 8x4 part of block
 */
 static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
@@ -342,7 +396,11 @@ void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
         return;
 
-    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
+    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
+    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
+    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
+    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
+    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
     dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 8bd6647f13..27cd0108a5 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -280,6 +280,28 @@ static int vop_dquant_decoding(VC1Context *v)
 
 static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
 
+static void simple_idct_put_rangered(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    int i;
+    ff_simple_idct(block);
+    for (i = 0; i < 64; i++) block[i] = (block[i] - 64) << 1;
+    ff_put_pixels_clamped_c(block, dest, line_size);
+}
+
+static void simple_idct_put_signed(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_simple_idct(block);
+    ff_put_signed_pixels_clamped_c(block, dest, line_size);
+}
+
+static void simple_idct_put_signed_rangered(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    int i;
+    ff_simple_idct(block);
+    for (i = 0; i < 64; i++) block[i] <<= 1;
+    ff_put_signed_pixels_clamped_c(block, dest, line_size);
+}
+
 /**
  * Decode Simple/Main Profiles sequence header
  * @see Figure 7-8, p16-17
@@ -337,7 +359,11 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
     v->res_fasttx = get_bits1(gb);
     if (!v->res_fasttx)
     {
-        v->vc1dsp.vc1_inv_trans_8x8 = ff_simple_idct;
+        v->vc1dsp.vc1_inv_trans_8x8_add = ff_simple_idct_add;
+        v->vc1dsp.vc1_inv_trans_8x8_put[0] = ff_simple_idct_put;
+        v->vc1dsp.vc1_inv_trans_8x8_put[1] = simple_idct_put_rangered;
+        v->vc1dsp.vc1_inv_trans_8x8_put_signed[0] = simple_idct_put_signed;
+        v->vc1dsp.vc1_inv_trans_8x8_put_signed[1] = simple_idct_put_signed_rangered;
         v->vc1dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
         v->vc1dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
         v->vc1dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index a3db6fe70b..ed92d8cadd 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -2009,8 +2009,7 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
             if(i==1)
                 v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
             else{
-                v->vc1dsp.vc1_inv_trans_8x8(block);
-                s->dsp.add_pixels_clamped(block, dst, linesize);
+                v->vc1dsp.vc1_inv_trans_8x8_add(dst, linesize, block);
             }
             if(apply_filter && cbp_top & 0xC)
                 v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
@@ -2117,7 +2116,7 @@ static int vc1_decode_p_mb(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
-    int i, j;
+    int i;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
@@ -2149,6 +2148,8 @@ static int vc1_decode_p_mb(VC1Context *v)
     {
         if (!skipped)
         {
+            vc1_idct_func idct8x8_fn;
+
             GET_MVDATA(dmv_x, dmv_y);
 
             if (s->mb_intra) {
@@ -2183,6 +2184,7 @@ static int vc1_decode_p_mb(VC1Context *v)
                                 VC1_TTMB_VLC_BITS, 2);
                 if(!s->mb_intra) vc1_mc_1mv(v, 0);
                 dst_idx = 0;
+                idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
                 for (i=0; i<6; i++)
                 {
                     s->dc_val[0][s->block_index[i]] = 0;
@@ -2200,9 +2202,9 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                         vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
                         if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
-                        v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
-                        if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
-                        s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+                        idct8x8_fn(s->dest[dst_idx] + off,
+                                   i & 4 ? s->uvlinesize : s->linesize,
+                                   s->block[i]);
                         if(v->pq >= 9 && v->overlap) {
                             if(v->c_avail)
                                 v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@@ -2267,6 +2269,7 @@ static int vc1_decode_p_mb(VC1Context *v)
         {
             int intra_count = 0, coded_inter = 0;
             int is_intra[6], is_coded[6];
+            vc1_idct_func idct8x8_fn;
             /* Get CBPCY */
             cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
             for (i=0; i<6; i++)
@@ -2316,6 +2319,7 @@ static int vc1_decode_p_mb(VC1Context *v)
             }
             if (!v->ttmbf && coded_inter)
                 ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
+            idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
             for (i=0; i<6; i++)
             {
                 dst_idx += i >> 2;
@@ -2331,9 +2335,9 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                     vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant, (i&4)?v->codingset2:v->codingset);
                     if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
-                    v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
-                    if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
-                    s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize);
+                    idct8x8_fn(s->dest[dst_idx] + off,
+                               (i&4)?s->uvlinesize:s->linesize,
+                               s->block[i]);
                     if(v->pq >= 9 && v->overlap) {
                         if(v->c_avail)
                             v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@@ -2409,7 +2413,7 @@ static void vc1_decode_b_mb(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     GetBitContext *gb = &s->gb;
-    int i, j;
+    int i;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp = 0; /* cbp decoding stuff */
     int mqdiff, mquant; /* MB quantization */
@@ -2422,6 +2426,7 @@ static void vc1_decode_b_mb(VC1Context *v)
     int skipped, direct;
     int dmv_x[2], dmv_y[2];
     int bmvtype = BMV_TYPE_BACKWARD;
+    vc1_idct_func idct8x8_fn;
 
     mquant = v->pq; /* Loosy initialization */
     s->mb_intra = 0;
@@ -2519,6 +2524,7 @@ static void vc1_decode_b_mb(VC1Context *v)
         }
     }
     dst_idx = 0;
+    idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
     for (i=0; i<6; i++)
     {
         s->dc_val[0][s->block_index[i]] = 0;
@@ -2536,9 +2542,9 @@ static void vc1_decode_b_mb(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
             if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
-            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
-            if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
-            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+            idct8x8_fn(s->dest[dst_idx] + off,
+                       i & 4 ? s->uvlinesize : s->linesize,
+                       s->block[i]);
         } else if(val) {
             vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize, (i&4) && (s->flags & CODEC_FLAG_GRAY), 0, 0, 0);
             if(!v->ttmbf && ttmb < 8) ttmb = -1;
@@ -2551,11 +2557,12 @@ static void vc1_decode_b_mb(VC1Context *v)
  */
 static void vc1_decode_i_blocks(VC1Context *v)
 {
-    int k, j;
+    int k;
     MpegEncContext *s = &v->s;
     int cbp, val;
     uint8_t *coded_val;
     int mb_pos;
+    vc1_idct_func idct8x8_fn;
 
     /* select codingmode used for VLC tables selection */
     switch(v->y_ac_table_index){
@@ -2590,6 +2597,10 @@ static void vc1_decode_i_blocks(VC1Context *v)
     s->mb_x = s->mb_y = 0;
     s->mb_intra = 1;
     s->first_slice_line = 1;
+    if(v->pq >= 9 && v->overlap) {
+        idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
+    } else
+        idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put[!!v->rangeredfrm];
     for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
         s->mb_x = 0;
         ff_init_block_index(s);
@@ -2626,14 +2637,9 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 vc1_decode_i_block(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2);
 
                 if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
-                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
-                if(v->pq >= 9 && v->overlap) {
-                    if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] <<= 1;
-                    s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
-                } else {
-                    if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1;
-                    s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
-                }
+                idct8x8_fn(dst[k],
+                           k & 4 ? s->uvlinesize : s->linesize,
+                           s->block[k]);
             }
 
             if(v->pq >= 9 && v->overlap) {
@@ -2691,6 +2697,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     int mqdiff;
     int overlap;
     GetBitContext *gb = &s->gb;
+    vc1_idct_func idct8x8_fn;
 
     /* select codingmode used for VLC tables selection */
     switch(v->y_ac_table_index){
@@ -2721,6 +2728,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     s->mb_x = s->mb_y = 0;
     s->mb_intra = 1;
     s->first_slice_line = 1;
+    idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[0];
     for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
         s->mb_x = 0;
         ff_init_block_index(s);
@@ -2777,9 +2785,9 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                 vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2, mquant);
 
                 if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
-                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
-                s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
-                                                 k & 4 ? s->uvlinesize : s->linesize);
+                idct8x8_fn(dst[k],
+                           k & 4 ? s->uvlinesize : s->linesize,
+                           s->block[k]);
             }
 
             if(overlap) {
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 000dad7d26..dbe2120829 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -199,7 +199,7 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
     }
 }
 
-static void vc1_inv_trans_8x8_c(DCTELEM block[64])
+static av_always_inline void vc1_inv_trans_8x8_c(DCTELEM block[64], int shl, int sub)
 {
     int i;
     register int t1,t2,t3,t4,t5,t6,t7,t8;
@@ -254,20 +254,50 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64])
         t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
         t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
 
-        dst[ 0] = (t5 + t1) >> 7;
-        dst[ 8] = (t6 + t2) >> 7;
-        dst[16] = (t7 + t3) >> 7;
-        dst[24] = (t8 + t4) >> 7;
-        dst[32] = (t8 - t4 + 1) >> 7;
-        dst[40] = (t7 - t3 + 1) >> 7;
-        dst[48] = (t6 - t2 + 1) >> 7;
-        dst[56] = (t5 - t1 + 1) >> 7;
+        dst[ 0] = (((t5 + t1    ) >> 7) - sub) << shl; /* callers pass shl/sub = 0/0, 1/0 or 1/64 */
+        dst[ 8] = (((t6 + t2    ) >> 7) - sub) << shl;
+        dst[16] = (((t7 + t3    ) >> 7) - sub) << shl;
+        dst[24] = (((t8 + t4    ) >> 7) - sub) << shl;
+        dst[32] = (((t8 - t4 + 1) >> 7) - sub) << shl;
+        dst[40] = (((t7 - t3 + 1) >> 7) - sub) << shl;
+        dst[48] = (((t6 - t2 + 1) >> 7) - sub) << shl;
+        dst[56] = (((t5 - t1 + 1) >> 7) - sub) << shl;
 
         src++;
         dst++;
     }
 }
 
+static void vc1_inv_trans_8x8_add_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+    vc1_inv_trans_8x8_c(block, 0, 0);
+    ff_add_pixels_clamped_c(block, dest, linesize);
+}
+
+static void vc1_inv_trans_8x8_put_signed_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+    vc1_inv_trans_8x8_c(block, 0, 0);
+    ff_put_signed_pixels_clamped_c(block, dest, linesize);
+}
+
+static void vc1_inv_trans_8x8_put_signed_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+    vc1_inv_trans_8x8_c(block, 1, 0);
+    ff_put_signed_pixels_clamped_c(block, dest, linesize);
+}
+
+static void vc1_inv_trans_8x8_put_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+    vc1_inv_trans_8x8_c(block, 0, 0);
+    ff_put_pixels_clamped_c(block, dest, linesize);
+}
+
+static void vc1_inv_trans_8x8_put_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+    vc1_inv_trans_8x8_c(block, 1, 64);
+    ff_put_pixels_clamped_c(block, dest, linesize);
+}
+
 /** Do inverse transform on 8x4 part of block
 */
 static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
@@ -662,7 +692,11 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
 }
 
 av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
-    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_c;
+    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_c;
+    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_c;
+    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_c;
+    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_c;
+    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_c;
     dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
     dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
     dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index a1f3d90574..db9d892a23 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -30,9 +30,13 @@
 
 #include "dsputil.h"
 
+typedef void (*vc1_idct_func)(uint8_t *dest, int line_size, DCTELEM *block);
+
 typedef struct VC1DSPContext {
     /* vc1 functions */
-    void (*vc1_inv_trans_8x8)(DCTELEM *b);
+    vc1_idct_func vc1_inv_trans_8x8_add;
+    vc1_idct_func vc1_inv_trans_8x8_put_signed[2]; /* [0] plain, [1] range-reduced frame */
+    vc1_idct_func vc1_inv_trans_8x8_put[2]; /* [0] plain, [1] range-reduced frame */
     void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
     void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);