mirror of https://git.ffmpeg.org/ffmpeg.git
truehd: add hand-scheduled ARM asm version of ff_mlp_rematrix_channel.
Profiling results for overall audio decode and the rematrix_channels function in particular are as follows: Before After Mean StdDev Mean StdDev Confidence Change 6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3% 6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant) 8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant) 8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant) 6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9% 6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3% 8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9% 8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3% The assembly version has also been tested with a fuzz tester to ensure that any combinations of inputs not exercised by my available test streams still generate mathematically identical results to the C version. Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
4e5aa080bb
commit
483321fe78
|
@ -431,3 +431,225 @@ endfunc
|
|||
.unreq ST3
|
||||
.unreq I
|
||||
.unreq PSAMP
|
||||
|
||||
/********************************************************************/
|
||||
|
||||
PSA .req a1 // samples
|
||||
PCO .req a2 // coeffs
|
||||
PBL .req a3 // bypassed_lsbs
|
||||
INDEX .req a4
|
||||
CO0 .req v1
|
||||
CO1 .req v2
|
||||
CO2 .req v3
|
||||
CO3 .req v4
|
||||
SA0 .req v5
|
||||
SA1 .req v6
|
||||
SA2 .req sl
|
||||
SA3 .req fp
|
||||
AC0 .req ip
|
||||
AC1 .req lr
|
||||
NOISE .req SA0
|
||||
LSB .req SA1
|
||||
DCH .req SA2 // dest_ch
|
||||
MASK .req SA3
|
||||
|
||||
// INDEX is used as follows:
|
||||
// bits 0..6 index2 (values up to 17, but wider so that we can
|
||||
// add to index field without needing to mask)
|
||||
// bits 7..14 i (values up to 160)
|
||||
// bit 15 underflow detect for i
|
||||
// bits 25..31 (if access_unit_size_pow2 == 128) \ index
|
||||
// bits 26..31 (if access_unit_size_pow2 == 64) /
|
||||
|
||||
.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
|
||||
.if \maxchan == 1
|
||||
// We can just leave the coefficients in registers in this case
|
||||
ldrd CO0, CO1, [PCO]
|
||||
.endif
|
||||
1:
|
||||
.if \maxchan == 1
|
||||
ldrd SA0, SA1, [PSA]
|
||||
smull AC0, AC1, CO0, SA0
|
||||
.elseif \maxchan == 5
|
||||
ldr CO0, [PCO, #0]
|
||||
ldr SA0, [PSA, #0]
|
||||
ldr CO1, [PCO, #4]
|
||||
ldr SA1, [PSA, #4]
|
||||
ldrd CO2, CO3, [PCO, #8]
|
||||
smull AC0, AC1, CO0, SA0
|
||||
ldrd SA2, SA3, [PSA, #8]
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
ldrd CO0, CO1, [PCO, #16]
|
||||
smlal AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #16]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
.else // \maxchan == 7
|
||||
ldr CO2, [PCO, #0]
|
||||
ldr SA2, [PSA, #0]
|
||||
ldr CO3, [PCO, #4]
|
||||
ldr SA3, [PSA, #4]
|
||||
ldrd CO0, CO1, [PCO, #8]
|
||||
smull AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #8]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
ldrd CO2, CO3, [PCO, #16]
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
ldrd SA2, SA3, [PSA, #16]
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
ldrd CO0, CO1, [PCO, #24]
|
||||
smlal AC0, AC1, CO2, SA2
|
||||
ldrd SA0, SA1, [PSA, #24]
|
||||
smlal AC0, AC1, CO3, SA3
|
||||
smlal AC0, AC1, CO0, SA0
|
||||
.endif
|
||||
ldm sp, {NOISE, DCH, MASK}
|
||||
smlal AC0, AC1, CO1, SA1
|
||||
.if \shift != 0
|
||||
.if \index_mask == 63
|
||||
add NOISE, NOISE, INDEX, lsr #32-6
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
ldrsb NOISE, [NOISE]
|
||||
add INDEX, INDEX, INDEX, lsl #32-6
|
||||
.else // \index_mask == 127
|
||||
add NOISE, NOISE, INDEX, lsr #32-7
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
ldrsb NOISE, [NOISE]
|
||||
add INDEX, INDEX, INDEX, lsl #32-7
|
||||
.endif
|
||||
sub INDEX, INDEX, #1<<7
|
||||
adds AC0, AC0, NOISE, lsl #\shift + 7
|
||||
adc AC1, AC1, NOISE, asr #31
|
||||
.else
|
||||
ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
sub INDEX, INDEX, #1<<7
|
||||
.endif
|
||||
add PSA, PSA, #MAX_CHANNELS*4
|
||||
mov AC0, AC0, lsr #14
|
||||
orr AC0, AC0, AC1, lsl #18
|
||||
.if !\mask_minus1
|
||||
and AC0, AC0, MASK
|
||||
.endif
|
||||
add AC0, AC0, LSB
|
||||
tst INDEX, #1<<15
|
||||
str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
|
||||
beq 1b
|
||||
b 98f
|
||||
.endm
|
||||
|
||||
.macro switch_on_maxchan shift, index_mask, mask_minus1
|
||||
cmp v4, #5
|
||||
blo 51f
|
||||
beq 50f
|
||||
implement_rematrix \shift, \index_mask, \mask_minus1, 7
|
||||
50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
|
||||
51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
|
||||
.endm
|
||||
|
||||
.macro switch_on_mask shift, index_mask
|
||||
cmp sl, #-1
|
||||
bne 40f
|
||||
switch_on_maxchan \shift, \index_mask, 1
|
||||
40: switch_on_maxchan \shift, \index_mask, 0
|
||||
.endm
|
||||
|
||||
.macro switch_on_au_size shift
|
||||
.if \shift == 0
|
||||
switch_on_mask \shift, undefined
|
||||
.else
|
||||
teq v6, #64
|
||||
bne 30f
|
||||
orr INDEX, INDEX, v1, lsl #32-6
|
||||
switch_on_mask \shift, 63
|
||||
30: orr INDEX, INDEX, v1, lsl #32-7
|
||||
switch_on_mask \shift, 127
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
* const int32_t *coeffs,
|
||||
* const uint8_t *bypassed_lsbs,
|
||||
* const int8_t *noise_buffer,
|
||||
* int index,
|
||||
* unsigned int dest_ch,
|
||||
* uint16_t blockpos,
|
||||
* unsigned int maxchan,
|
||||
* int matrix_noise_shift,
|
||||
* int access_unit_size_pow2,
|
||||
* int32_t mask);
|
||||
*/
|
||||
function ff_mlp_rematrix_channel_arm, export=1
|
||||
push {v1-fp,lr}
|
||||
add v1, sp, #9*4 // point at arguments on stack
|
||||
ldm v1, {v1-sl}
|
||||
teq v4, #1
|
||||
itt ne
|
||||
teqne v4, #5
|
||||
teqne v4, #7
|
||||
bne 99f
|
||||
teq v6, #64
|
||||
it ne
|
||||
teqne v6, #128
|
||||
bne 99f
|
||||
sub v2, v2, #MAX_CHANNELS
|
||||
push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
|
||||
movs INDEX, v3, lsl #7
|
||||
beq 98f // just in case, do nothing if blockpos = 0
|
||||
subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
|
||||
adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
|
||||
orr INDEX, INDEX, lr
|
||||
// Switch on matrix_noise_shift: values 0 and 1 are
|
||||
// disproportionately common so do those in a form the branch
|
||||
// predictor can accelerate. Values can only go up to 15.
|
||||
cmp v5, #1
|
||||
beq 11f
|
||||
blo 10f
|
||||
A ldr pc, [pc, v5, lsl #2]
|
||||
T tbh [pc, v5, lsl #1]
|
||||
0:
|
||||
A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
|
||||
T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
|
||||
T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
|
||||
T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
|
||||
10: switch_on_au_size 0
|
||||
11: switch_on_au_size 1
|
||||
12: switch_on_au_size 2
|
||||
13: switch_on_au_size 3
|
||||
14: switch_on_au_size 4
|
||||
15: switch_on_au_size 5
|
||||
16: switch_on_au_size 6
|
||||
17: switch_on_au_size 7
|
||||
18: switch_on_au_size 8
|
||||
19: switch_on_au_size 9
|
||||
20: switch_on_au_size 10
|
||||
21: switch_on_au_size 11
|
||||
22: switch_on_au_size 12
|
||||
23: switch_on_au_size 13
|
||||
24: switch_on_au_size 14
|
||||
25: switch_on_au_size 15
|
||||
|
||||
98: add sp, sp, #3*4
|
||||
pop {v1-fp,pc}
|
||||
99: // Can't handle these parameters, drop back to C
|
||||
pop {v1-fp,lr}
|
||||
b X(ff_mlp_rematrix_channel)
|
||||
endfunc
|
||||
|
||||
.unreq PSA
|
||||
.unreq PCO
|
||||
.unreq PBL
|
||||
.unreq INDEX
|
||||
.unreq CO0
|
||||
.unreq CO1
|
||||
.unreq CO2
|
||||
.unreq CO3
|
||||
.unreq SA0
|
||||
.unreq SA1
|
||||
.unreq SA2
|
||||
.unreq SA3
|
||||
.unreq AC0
|
||||
.unreq AC1
|
||||
.unreq NOISE
|
||||
.unreq LSB
|
||||
.unreq DCH
|
||||
.unreq MASK
|
||||
|
|
|
@ -29,6 +29,17 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
|||
int firorder, int iirorder,
|
||||
unsigned int filter_shift, int32_t mask,
|
||||
int blocksize, int32_t *sample_buffer);
|
||||
void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
const int32_t *coeffs,
|
||||
const uint8_t *bypassed_lsbs,
|
||||
const int8_t *noise_buffer,
|
||||
int index,
|
||||
unsigned int dest_ch,
|
||||
uint16_t blockpos,
|
||||
unsigned int maxchan,
|
||||
int matrix_noise_shift,
|
||||
int access_unit_size_pow2,
|
||||
int32_t mask);
|
||||
|
||||
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
|
||||
{
|
||||
|
@ -36,5 +47,6 @@ av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
|
|||
|
||||
if (have_armv5te(cpu_flags)) {
|
||||
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
|
||||
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue