mirror of https://git.ffmpeg.org/ffmpeg.git
aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon:   8089.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub8_add_neon:    1036.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   1372.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   1372.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     555.1
vp9_inv_dct_dct_32x32_sub2_add_neon:    5190.2
vp9_inv_dct_dct_32x32_sub4_add_neon:    5180.0
vp9_inv_dct_dct_32x32_sub8_add_neon:    5183.1
vp9_inv_dct_dct_32x32_sub12_add_neon:   6161.5
vp9_inv_dct_dct_32x32_sub16_add_neon:   6155.5
vp9_inv_dct_dct_32x32_sub20_add_neon:   7136.3
vp9_inv_dct_dct_32x32_sub24_add_neon:   7128.4
vp9_inv_dct_dct_32x32_sub28_add_neon:   8098.9
vp9_inv_dct_dct_32x32_sub32_add_neon:   8098.8

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
9c8bc74c2b
commit
cad42fadcd
|
@ -588,6 +588,9 @@ endfunc
|
||||||
.macro store i, dst, inc
|
.macro store i, dst, inc
|
||||||
st1 {v\i\().8h}, [\dst], \inc
|
st1 {v\i\().8h}, [\dst], \inc
|
||||||
.endm
|
.endm
|
||||||
|
.macro movi_v i, size, imm
|
||||||
|
movi v\i\()\size, \imm
|
||||||
|
.endm
|
||||||
.macro load_clear i, src, inc
|
.macro load_clear i, src, inc
|
||||||
ld1 {v\i\().8h}, [\src]
|
ld1 {v\i\().8h}, [\src]
|
||||||
st1 {v2.8h}, [\src], \inc
|
st1 {v2.8h}, [\src], \inc
|
||||||
|
@ -596,9 +599,8 @@ endfunc
|
||||||
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
|
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
|
||||||
// transpose into a horizontal 16x8 slice and store.
|
// transpose into a horizontal 16x8 slice and store.
|
||||||
// x0 = dst (temp buffer)
|
// x0 = dst (temp buffer)
|
||||||
// x1 = unused
|
// x1 = slice offset
|
||||||
// x2 = src
|
// x2 = src
|
||||||
// x3 = slice offset
|
|
||||||
// x9 = input stride
|
// x9 = input stride
|
||||||
.macro itxfm16_1d_funcs txfm
|
.macro itxfm16_1d_funcs txfm
|
||||||
function \txfm\()16_1d_8x16_pass1_neon
|
function \txfm\()16_1d_8x16_pass1_neon
|
||||||
|
@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
|
||||||
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
||||||
|
|
||||||
// Store the transposed 8x8 blocks horizontally.
|
// Store the transposed 8x8 blocks horizontally.
|
||||||
cmp x3, #8
|
cmp x1, #8
|
||||||
b.eq 1f
|
b.eq 1f
|
||||||
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
||||||
store \i, x0, #16
|
store \i, x0, #16
|
||||||
.endr
|
.endr
|
||||||
ret
|
ret
|
||||||
1:
|
1:
|
||||||
// Special case: For the last input column (x3 == 8),
|
// Special case: For the last input column (x1 == 8),
|
||||||
// which would be stored as the last row in the temp buffer,
|
// which would be stored as the last row in the temp buffer,
|
||||||
// don't store the first 8x8 block, but keep it in registers
|
// don't store the first 8x8 block, but keep it in registers
|
||||||
// for the first slice of the second pass (where it is the
|
// for the first slice of the second pass (where it is the
|
||||||
|
@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
||||||
|
|
||||||
.irp i, 0, 8
|
.irp i, 0, 8
|
||||||
add x0, sp, #(\i*32)
|
add x0, sp, #(\i*32)
|
||||||
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
|
.if \i == 8
|
||||||
|
cmp w3, #38
|
||||||
|
b.le 1f
|
||||||
|
.endif
|
||||||
|
.endif
|
||||||
|
mov x1, #\i
|
||||||
add x2, x6, #(\i*2)
|
add x2, x6, #(\i*2)
|
||||||
mov x3, #\i
|
|
||||||
bl \txfm1\()16_1d_8x16_pass1_neon
|
bl \txfm1\()16_1d_8x16_pass1_neon
|
||||||
.endr
|
.endr
|
||||||
.ifc \txfm1\()_\txfm2,iadst_idct
|
.ifc \txfm1\()_\txfm2,iadst_idct
|
||||||
ld1 {v0.8h,v1.8h}, [x10]
|
ld1 {v0.8h,v1.8h}, [x10]
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
|
b 3f
|
||||||
|
1:
|
||||||
|
// Set v24-v31 to zero, for the in-register passthrough of
|
||||||
|
// coefficients to pass 2. Since we only do two slices, this can
|
||||||
|
// only ever happen for the second slice. So we only need to store
|
||||||
|
// zeros to the temp buffer for the second half of the buffer.
|
||||||
|
// Move x0 to the second half, and use x9 == 32 as increment.
|
||||||
|
add x0, x0, #16
|
||||||
|
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
|
movi_v \i, .16b, #0
|
||||||
|
st1 {v24.8h}, [x0], x9
|
||||||
|
.endr
|
||||||
|
3:
|
||||||
|
.endif
|
||||||
|
|
||||||
.irp i, 0, 8
|
.irp i, 0, 8
|
||||||
add x0, x4, #(\i)
|
add x0, x4, #(\i)
|
||||||
mov x1, x5
|
mov x1, x5
|
||||||
|
@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
const min_eob_idct_idct_32, align=4
|
||||||
|
.short 0, 34, 135, 336
|
||||||
|
endconst
|
||||||
|
|
||||||
function ff_vp9_idct_idct_32x32_add_neon, export=1
|
function ff_vp9_idct_idct_32x32_add_neon, export=1
|
||||||
cmp w3, #1
|
cmp w3, #1
|
||||||
b.eq idct32x32_dc_add_neon
|
b.eq idct32x32_dc_add_neon
|
||||||
|
|
||||||
movrel x10, idct_coeffs
|
movrel x10, idct_coeffs
|
||||||
add x11, x10, #32
|
add x11, x10, #32
|
||||||
|
movrel x12, min_eob_idct_idct_32 + 2
|
||||||
|
|
||||||
mov x15, x30
|
mov x15, x30
|
||||||
|
|
||||||
|
@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
|
||||||
|
|
||||||
.irp i, 0, 8, 16, 24
|
.irp i, 0, 8, 16, 24
|
||||||
add x0, sp, #(\i*64)
|
add x0, sp, #(\i*64)
|
||||||
|
.if \i > 0
|
||||||
|
ldrh w1, [x12], #2
|
||||||
|
cmp w3, w1
|
||||||
|
mov x1, #(32 - \i)/4
|
||||||
|
b.le 1f
|
||||||
|
.endif
|
||||||
add x2, x6, #(\i*2)
|
add x2, x6, #(\i*2)
|
||||||
bl idct32_1d_8x32_pass1_neon
|
bl idct32_1d_8x32_pass1_neon
|
||||||
.endr
|
.endr
|
||||||
|
b 3f
|
||||||
|
|
||||||
|
1:
|
||||||
|
// Write zeros to the temp buffer for pass 2
|
||||||
|
movi v16.8h, #0
|
||||||
|
movi v17.8h, #0
|
||||||
|
movi v18.8h, #0
|
||||||
|
movi v19.8h, #0
|
||||||
|
2:
|
||||||
|
subs x1, x1, #1
|
||||||
|
.rept 4
|
||||||
|
st1 {v16.8h-v19.8h}, [x0], #64
|
||||||
|
.endr
|
||||||
|
b.ne 2b
|
||||||
|
3:
|
||||||
.irp i, 0, 8, 16, 24
|
.irp i, 0, 8, 16, 24
|
||||||
add x0, x4, #(\i)
|
add x0, x4, #(\i)
|
||||||
mov x1, x5
|
mov x1, x5
|
||||||
|
|
Loading…
Reference in New Issue