mirror of https://git.ffmpeg.org/ffmpeg.git
Reorganize how values are stored in h->non_zero_count.
~1% faster Originally committed as revision 21273 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
c36a4294dc
commit
b6303e6d2a
|
@ -300,6 +300,13 @@ typedef struct H264Context{
|
||||||
* is 64 if not available.
|
* is 64 if not available.
|
||||||
*/
|
*/
|
||||||
DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
|
DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
.UU.YYYY
|
||||||
|
.UU.YYYY
|
||||||
|
.vv.YYYY
|
||||||
|
.VV.YYYY
|
||||||
|
*/
|
||||||
uint8_t (*non_zero_count)[32];
|
uint8_t (*non_zero_count)[32];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -727,11 +734,11 @@ static av_always_inline void fill_caches(H264Context *h, int mb_type, int for_de
|
||||||
const uint8_t * left_block;
|
const uint8_t * left_block;
|
||||||
int topleft_partition= -1;
|
int topleft_partition= -1;
|
||||||
int i;
|
int i;
|
||||||
static const uint8_t left_block_options[4][8]={
|
static const uint8_t left_block_options[4][16]={
|
||||||
{0,1,2,3,7,10,8,11},
|
{0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
|
||||||
{2,2,3,3,8,11,8,11},
|
{2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
|
||||||
{0,0,1,1,7,10,7,10},
|
{0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
|
||||||
{0,2,0,2,7,10,7,10}
|
{0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
|
||||||
};
|
};
|
||||||
|
|
||||||
top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
|
top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
|
||||||
|
@ -788,6 +795,12 @@ static av_always_inline void fill_caches(H264Context *h, int mb_type, int for_de
|
||||||
h->left_mb_xy[0] = left_xy[0];
|
h->left_mb_xy[0] = left_xy[0];
|
||||||
h->left_mb_xy[1] = left_xy[1];
|
h->left_mb_xy[1] = left_xy[1];
|
||||||
if(for_deblock){
|
if(for_deblock){
|
||||||
|
*((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]);
|
||||||
|
*((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]);
|
||||||
|
*((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]);
|
||||||
|
*((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]);
|
||||||
|
*((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]);
|
||||||
|
|
||||||
topleft_type = 0;
|
topleft_type = 0;
|
||||||
topright_type = 0;
|
topright_type = 0;
|
||||||
top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
|
top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
|
||||||
|
@ -922,16 +935,13 @@ static av_always_inline void fill_caches(H264Context *h, int mb_type, int for_de
|
||||||
*/
|
*/
|
||||||
//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
|
//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
|
||||||
if(top_type){
|
if(top_type){
|
||||||
h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
|
*(uint32_t*)&h->non_zero_count_cache[4+8*0]= *(uint32_t*)&h->non_zero_count[top_xy][4+3*8];
|
||||||
h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
|
|
||||||
h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
|
|
||||||
h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
|
|
||||||
|
|
||||||
h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
|
h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8];
|
||||||
h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
|
h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8];
|
||||||
|
|
||||||
h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
|
h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8];
|
||||||
h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
|
h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8];
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
h->non_zero_count_cache[4+8*0]=
|
h->non_zero_count_cache[4+8*0]=
|
||||||
|
@ -949,10 +959,10 @@ static av_always_inline void fill_caches(H264Context *h, int mb_type, int for_de
|
||||||
|
|
||||||
for (i=0; i<2; i++) {
|
for (i=0; i<2; i++) {
|
||||||
if(left_type[i]){
|
if(left_type[i]){
|
||||||
h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
|
h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
|
||||||
h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
|
h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
|
||||||
h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
|
h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
|
||||||
h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
|
h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
|
||||||
}else{
|
}else{
|
||||||
h->non_zero_count_cache[3+8*1 + 2*8*i]=
|
h->non_zero_count_cache[3+8*1 + 2*8*i]=
|
||||||
h->non_zero_count_cache[3+8*2 + 2*8*i]=
|
h->non_zero_count_cache[3+8*2 + 2*8*i]=
|
||||||
|
@ -1204,38 +1214,11 @@ static inline int pred_intra_mode(H264Context *h, int n){
|
||||||
static inline void write_back_non_zero_count(H264Context *h){
|
static inline void write_back_non_zero_count(H264Context *h){
|
||||||
const int mb_xy= h->mb_xy;
|
const int mb_xy= h->mb_xy;
|
||||||
|
|
||||||
h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
|
*((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]);
|
||||||
h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
|
*((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]);
|
||||||
h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
|
*((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]);
|
||||||
h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
|
*((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]);
|
||||||
h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
|
*((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]);
|
||||||
h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
|
|
||||||
h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
|
|
||||||
|
|
||||||
h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
|
|
||||||
h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
|
|
||||||
h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
|
|
||||||
|
|
||||||
h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
|
|
||||||
h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
|
|
||||||
h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
|
|
||||||
|
|
||||||
//FIXME sort better how things are stored in non_zero_count
|
|
||||||
|
|
||||||
|
|
||||||
h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
|
|
||||||
h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
|
|
||||||
h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
|
|
||||||
h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
|
|
||||||
h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
|
|
||||||
h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
|
|
||||||
h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
|
|
||||||
h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
|
|
||||||
h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
|
|
||||||
|
|
||||||
h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
|
|
||||||
h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void write_back_motion(H264Context *h, int mb_type){
|
static inline void write_back_motion(H264Context *h, int mb_type){
|
||||||
|
|
|
@ -472,7 +472,6 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
|
||||||
// be done twice (one each of the field) even if we are in a
|
// be done twice (one each of the field) even if we are in a
|
||||||
// frame macroblock.
|
// frame macroblock.
|
||||||
//
|
//
|
||||||
static const int nnz_idx[4] = {4,5,6,3};
|
|
||||||
unsigned int tmp_linesize = 2 * linesize;
|
unsigned int tmp_linesize = 2 * linesize;
|
||||||
unsigned int tmp_uvlinesize = 2 * uvlinesize;
|
unsigned int tmp_uvlinesize = 2 * uvlinesize;
|
||||||
int mbn_xy = mb_xy - 2 * s->mb_stride;
|
int mbn_xy = mb_xy - 2 * s->mb_stride;
|
||||||
|
@ -488,7 +487,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
|
||||||
const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
|
const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
|
||||||
for( i = 0; i < 4; i++ ) {
|
for( i = 0; i < 4; i++ ) {
|
||||||
if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
|
if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
|
||||||
mbn_nnz[nnz_idx[i]] != 0 )
|
mbn_nnz[i+4+3*8] != 0 )
|
||||||
bS[i] = 2;
|
bS[i] = 2;
|
||||||
else
|
else
|
||||||
bS[i] = 1;
|
bS[i] = 1;
|
||||||
|
@ -663,36 +662,6 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0];
|
|
||||||
h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1];
|
|
||||||
h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2];
|
|
||||||
h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3];
|
|
||||||
h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4];
|
|
||||||
h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5];
|
|
||||||
h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6];
|
|
||||||
|
|
||||||
h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9];
|
|
||||||
h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8];
|
|
||||||
h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7];
|
|
||||||
|
|
||||||
h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12];
|
|
||||||
h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11];
|
|
||||||
h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10];
|
|
||||||
|
|
||||||
h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13];
|
|
||||||
h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14];
|
|
||||||
h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15];
|
|
||||||
h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16];
|
|
||||||
h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17];
|
|
||||||
h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18];
|
|
||||||
h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19];
|
|
||||||
h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20];
|
|
||||||
h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21];
|
|
||||||
|
|
||||||
h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22];
|
|
||||||
h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23];
|
|
||||||
|
|
||||||
// CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
|
// CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
|
||||||
if(!h->pps.cabac && h->pps.transform_8x8_mode){
|
if(!h->pps.cabac && h->pps.transform_8x8_mode){
|
||||||
int top_type, left_type[2];
|
int top_type, left_type[2];
|
||||||
|
@ -762,7 +731,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
|
||||||
((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
|
((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
|
||||||
(h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
|
(h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
|
||||||
:
|
:
|
||||||
h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
|
h->non_zero_count[mbn_xy][7+(MB_FIELD ? (i&3) : (i>>2)+(mb_y&1)*2)*8]))
|
||||||
bS[i] = 2;
|
bS[i] = 2;
|
||||||
else
|
else
|
||||||
bS[i] = 1;
|
bS[i] = 1;
|
||||||
|
|
Loading…
Reference in New Issue