aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

This work is sponsored by, and copyright, Google.

Previously, all subpartitions except the eob=1 (DC) case took
the same amount of time:

vp9_inv_dct_dct_16x16_sub16_add_neon:   1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon:   8089.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub8_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   1372.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   1372.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     555.1
vp9_inv_dct_dct_32x32_sub2_add_neon:    5190.2
vp9_inv_dct_dct_32x32_sub4_add_neon:    5180.0
vp9_inv_dct_dct_32x32_sub8_add_neon:    5183.1
vp9_inv_dct_dct_32x32_sub12_add_neon:   6161.5
vp9_inv_dct_dct_32x32_sub16_add_neon:   6155.5
vp9_inv_dct_dct_32x32_sub20_add_neon:   7136.3
vp9_inv_dct_dct_32x32_sub24_add_neon:   7128.4
vp9_inv_dct_dct_32x32_sub28_add_neon:   8098.9
vp9_inv_dct_dct_32x32_sub32_add_neon:   8098.8

I.e. the full subpartition case pays only a very minor overhead for the
additional cmps, while the cases where only a small part of the actual
input data needs to be processed get a significant speedup.

Signed-off-by: Martin Storsjö <martin@martin.st>
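
For illustration, the 16x16 skip amounts to the following C sketch (the
actual implementation is the NEON assembly in the diff below; the helper
name and temp-buffer layout here are simplified assumptions, and eob is
the value passed in w3). Per the `cmp w3, #38` in the diff, an eob of 38
or less guarantees the second 8x16 slice of input coefficients is all
zero, so its first pass can be replaced by writing zeros:

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for the per-slice NEON first pass
 * (\txfm\()16_1d_8x16_pass1_neon): transforms the 8x16 slice at
 * column offset 8*slice of src into the temp buffer. */
void idct16_1d_8x16_pass1(int16_t *tmp, const int16_t *src, int slice);

static void idct16x16_pass1(int16_t *tmp, const int16_t *src, int eob)
{
    idct16_1d_8x16_pass1(tmp, src, 0);      /* first slice always has data */
    if (eob > 38) {
        idct16_1d_8x16_pass1(tmp, src, 1);  /* second slice is nonzero */
    } else {
        /* eob <= 38: the second 8x16 slice of coefficients is all zero,
         * so zero its half of the 16x16 temp buffer instead of
         * transforming it. */
        memset(tmp + 8 * 16, 0, 8 * 16 * sizeof(*tmp));
    }
}
```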

@@ -588,6 +588,9 @@ endfunc
 .macro store i, dst, inc
         st1             {v\i\().8h},  [\dst], \inc
 .endm
+.macro movi_v i, size, imm
+        movi            v\i\()\size,  \imm
+.endm
 .macro load_clear i, src, inc
         ld1             {v\i\().8h}, [\src]
         st1             {v2.8h},  [\src], \inc
@@ -596,9 +599,8 @@ endfunc
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
 // x2 = src
-// x3 = slice offset
 // x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
@@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

         // Store the transposed 8x8 blocks horizontally.
-        cmp             x3,  #8
+        cmp             x1,  #8
         b.eq            1f
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
         store           \i,  x0,  #16
 .endr
         ret
 1:
-        // Special case: For the last input column (x3 == 8),
+        // Special case: For the last input column (x1 == 8),
         // which would be stored as the last row in the temp buffer,
         // don't store the first 8x8 block, but keep it in registers
         // for the first slice of the second pass (where it is the
@@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .irp i, 0, 8
         add             x0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+        cmp             w3,  #38
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
         add             x2,  x6,  #(\i*2)
-        mov             x3,  #\i
         bl              \txfm1\()16_1d_8x16_pass1_neon
 .endr
 .ifc \txfm1\()_\txfm2,iadst_idct
         ld1             {v0.8h,v1.8h},  [x10]
 .endif
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v24-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2. Since we only do two slices, this can
+        // only ever happen for the second slice. So we only need to store
+        // zeros to the temp buffer for the second half of the buffer.
+        // Move x0 to the second half, and use x9 == 32 as increment.
+        add             x0,  x0,  #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        movi_v          \i,  .16b, #0
+        st1             {v24.8h},  [x0], x9
+.endr
+3:
+.endif
 .irp i, 0, 8
         add             x0,  x4,  #(\i)
         mov             x1,  x5
@@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon
         ret
 endfunc

+const min_eob_idct_idct_32, align=4
+        .short  0, 34, 135, 336
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             w3,  #1
         b.eq            idct32x32_dc_add_neon
         movrel          x10, idct_coeffs
         add             x11, x10, #32
+        movrel          x12, min_eob_idct_idct_32 + 2
         mov             x15, x30
@@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 .irp i, 0, 8, 16, 24
         add             x0,  sp,  #(\i*64)
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4
+        b.le            1f
+.endif
         add             x2,  x6,  #(\i*2)
         bl              idct32_1d_8x32_pass1_neon
 .endr
+        b               3f
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v16.8h-v19.8h},  [x0], #64
+.endr
+        b.ne            2b
+3:
 .irp i, 0, 8, 16, 24
         add             x0,  x4,  #(\i)
         mov             x1,  x5
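
The same idea generalizes to four 8x32 slices in the 32x32 case via the
min_eob_idct_idct_32 table added above: as soon as eob is at or below the
threshold for slice i, that slice and every later one are known to contain
only zero coefficients, so the loop zero-fills the rest of the temp buffer
and stops. A C sketch of that control flow (helper name and row-major
temp-buffer layout are assumptions, not the actual NEON code):

```c
#include <stdint.h>
#include <string.h>

/* Thresholds from the diff above: slice i (i > 0) holds only zero
 * coefficients whenever eob <= min_eob[i]. */
static const int16_t min_eob_idct_idct_32[4] = { 0, 34, 135, 336 };

/* Hypothetical stand-in for idct32_1d_8x32_pass1_neon. */
void idct32_1d_8x32_pass1(int16_t *tmp, const int16_t *src, int slice);

static void idct32x32_pass1(int16_t *tmp, const int16_t *src, int eob)
{
    for (int i = 0; i < 4; i++) {
        if (i > 0 && eob <= min_eob_idct_idct_32[i]) {
            /* This slice and all later ones are known to be zero;
             * zero-fill the rest of the 32x32 temp buffer and stop,
             * mirroring the subs/st1 loop in the assembly. */
            memset(tmp + i * 8 * 32, 0, (4 - i) * 8 * 32 * sizeof(*tmp));
            break;
        }
        idct32_1d_8x32_pass1(tmp, src, i);
    }
}
```

Writing zeros rather than tracking skipped slices keeps the second pass
unchanged: it can read every slice from the temp buffer without knowing
which ones were skipped in the first pass.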