diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index 291b4a1880..3257986d30 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -97,6 +97,40 @@ pw_m3196_m16069: times 4 dw -3196, -16069
 pw_m13623_m9102: times 4 dw -13623, -9102
 pw_m6270_m15137: times 4 dw -6270, -15137
 
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
 SECTION .text
 
 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
@@ -636,18 +670,21 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     jg .loop_dc
     RET
 
-    ; FIXME a sub-idct for the top-left 4x4 coefficients would save 1 loop
-    ; iteration in the first idct (2->1) and thus probably a lot of time.
-    ; I haven't implemented that yet, though
-
 .idctfull:
     mova [rsp+16*mmsize], m0
-    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
+    movsxd cntq, cntd
 %endif
-    lea stride3q, [strideq*3]
-    mov cntd, 2
+%ifdef PIC
+    lea ptrq, [default_8x8]
+    movzx cntd, byte [ptrq+cntq-1]
+%else
+    movzx cntd, byte [default_8x8+cntq-1]
+%endif
+    mov skipd, 2
+    sub skipd, cntd
     mov ptrq, rsp
 .loop_1:
     IDCT8_1D blockq
@@ -668,6 +705,24 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     dec cntd
     jg .loop_1
 
+    ; zero-pad the remainder (skipped cols)
+    test skipd, skipd
+    jz .end
+    add skipd, skipd
+    lea blockq, [blockq+skipq*(mmsize/2)]
+    pxor m0, m0
+.loop_z:
+    mova [ptrq+mmsize*0], m0
+    mova [ptrq+mmsize*1], m0
+    mova [ptrq+mmsize*2], m0
+    mova [ptrq+mmsize*3], m0
+    add ptrq, 4 * mmsize
+    dec skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea stride3q, [strideq*3]
     mov cntd, 2
     mov ptrq, rsp
 .loop_2:
@@ -854,20 +909,27 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
     SWAP 2, 7, 6
 %endmacro
 
-%macro IADST8_FN 4
-cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
-                              17 * mmsize + ARCH_X86_32 * 5 * mmsize, \
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
                               dst, stride, block, eob
     mova m0, [pw_1023]
 
 .body:
     mova [rsp+16*mmsize], m0
-    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
+    movsxd cntq, cntd
 %endif
-    lea stride3q, [strideq*3]
-    mov cntd, 2
+%ifdef PIC
+    lea ptrq, [%5_8x8]
+    movzx cntd, byte [ptrq+cntq-1]
+%else
+    movzx cntd, byte [%5_8x8+cntq-1]
+%endif
+    mov skipd, 2
+    sub skipd, cntd
     mov ptrq, rsp
 .loop_1:
     %2_1D blockq
@@ -888,6 +950,24 @@ cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
     dec cntd
     jg .loop_1
 
+    ; zero-pad the remainder (skipped cols)
+    test skipd, skipd
+    jz .end
+    add skipd, skipd
+    lea blockq, [blockq+skipq*(mmsize/2)]
+    pxor m0, m0
+.loop_z:
+    mova [ptrq+mmsize*0], m0
+    mova [ptrq+mmsize*1], m0
+    mova [ptrq+mmsize*2], m0
+    mova [ptrq+mmsize*3], m0
+    add ptrq, 4 * mmsize
+    dec skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea stride3q, [strideq*3]
     mov cntd, 2
     mov ptrq, rsp
 .loop_2:
@@ -913,17 +993,17 @@ cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
     ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
     RET
 
-cglobal vp9_%1_%3_8x8_add_12, 3, 6 + ARCH_X86_64, 13, \
-                              17 * mmsize + ARCH_X86_32 * 5 * mmsize, \
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
                               dst, stride, block, eob
     mova m0, [pw_4095]
     jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
 %endmacro
 
 INIT_XMM sse2
-IADST8_FN idct, IDCT8, iadst, IADST8
-IADST8_FN iadst, IADST8, idct, IDCT8
-IADST8_FN iadst, IADST8, iadst, IADST8
+IADST8_FN idct, IDCT8, iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct, IDCT8, col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
 
 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
     IDCT8_1D %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
@@ -1040,12 +1120,19 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 
 .idctfull:
     mova [rsp+64*mmsize], m0
-    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
+    movsxd cntq, cntd
 %endif
-    lea stride3q, [strideq*3]
-    mov cntd, 4
+%ifdef PIC
+    lea ptrq, [default_16x16]
+    movzx cntd, byte [ptrq+cntq-1]
+%else
+    movzx cntd, byte [default_16x16+cntq-1]
+%endif
+    mov skipd, 4
+    sub skipd, cntd
     mov ptrq, rsp
 .loop_1:
     IDCT16_1D blockq
@@ -1084,6 +1171,28 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     dec cntd
     jg .loop_1
 
+    ; zero-pad the remainder (skipped cols)
+    test skipd, skipd
+    jz .end
+    add skipd, skipd
+    lea blockq, [blockq+skipq*(mmsize/2)]
+    pxor m0, m0
+.loop_z:
+    mova [ptrq+mmsize*0], m0
+    mova [ptrq+mmsize*1], m0
+    mova [ptrq+mmsize*2], m0
+    mova [ptrq+mmsize*3], m0
+    mova [ptrq+mmsize*4], m0
+    mova [ptrq+mmsize*5], m0
+    mova [ptrq+mmsize*6], m0
+    mova [ptrq+mmsize*7], m0
+    add ptrq, 8 * mmsize
+    dec skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea stride3q, [strideq*3]
     mov cntd, 4
     mov ptrq, rsp
 .loop_2:
@@ -1318,20 +1427,27 @@ cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
     SWAP 2, 5, 4, 6, 7, 3
 %endmacro
 
-%macro IADST16_FN 6
-cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
                                 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                 dst, stride, block, eob
     mova m0, [pw_1023]
 
 .body:
     mova [rsp+64*mmsize], m0
-    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
+    movsxd cntq, cntd
 %endif
-    lea stride3q, [strideq*3]
-    mov cntd, 4
+%ifdef PIC
+    lea ptrq, [%7_16x16]
+    movzx cntd, byte [ptrq+cntq-1]
+%else
+    movzx cntd, byte [%7_16x16+cntq-1]
+%endif
+    mov skipd, 4
+    sub skipd, cntd
     mov ptrq, rsp
 .loop_1:
     %2_1D blockq
@@ -1370,6 +1486,28 @@ cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
     dec cntd
     jg .loop_1
 
+    ; zero-pad the remainder (skipped cols)
+    test skipd, skipd
+    jz .end
+    add skipd, skipd
+    lea blockq, [blockq+skipq*(mmsize/2)]
+    pxor m0, m0
+.loop_z:
+    mova [ptrq+mmsize*0], m0
+    mova [ptrq+mmsize*1], m0
+    mova [ptrq+mmsize*2], m0
+    mova [ptrq+mmsize*3], m0
+    mova [ptrq+mmsize*4], m0
+    mova [ptrq+mmsize*5], m0
+    mova [ptrq+mmsize*6], m0
+    mova [ptrq+mmsize*7], m0
+    add ptrq, 8 * mmsize
+    dec skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea stride3q, [strideq*3]
     mov cntd, 4
     mov ptrq, rsp
 .loop_2:
@@ -1419,7 +1557,7 @@ cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
     ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
     RET
 
-cglobal vp9_%1_%4_16x16_add_12, 3, 6 + ARCH_X86_64, 16, \
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
                                 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                 dst, stride, block, eob
     mova m0, [pw_4095]
@@ -1427,9 +1565,9 @@ cglobal vp9_%1_%4_16x16_add_12, 3, 6 + ARCH_X86_64, 16, \
 %endmacro
 
 INIT_XMM sse2
-IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70
-IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67
-IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70
+IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
 
 %macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
     IDCT16_1D %2, 2 * %3, 272, 257
@@ -1808,12 +1946,19 @@ cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
 
 .idctfull:
     mova [rsp+256*mmsize], m0
-    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
+    movsxd cntq, cntd
 %endif
-    lea stride3q, [strideq*3]
-    mov cntd, 8
+%ifdef PIC
+    lea ptrq, [default_32x32]
+    movzx cntd, byte [ptrq+cntq-1]
+%else
+    movzx cntd, byte [default_32x32+cntq-1]
+%endif
+    mov skipd, 8
+    sub skipd, cntd
     mov ptrq, rsp
 .loop_1:
     IDCT32_1D 1, blockq
@@ -1823,6 +1968,28 @@ cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
     dec cntd
     jg .loop_1
 
+    ; zero-pad the remainder (skipped cols)
+    test skipd, skipd
+    jz .end
+    shl skipd, 2
+    lea blockq, [blockq+skipq*(mmsize/4)]
+    pxor m0, m0
+.loop_z:
+    mova [ptrq+mmsize*0], m0
+    mova [ptrq+mmsize*1], m0
+    mova [ptrq+mmsize*2], m0
+    mova [ptrq+mmsize*3], m0
+    mova [ptrq+mmsize*4], m0
+    mova [ptrq+mmsize*5], m0
+    mova [ptrq+mmsize*6], m0
+    mova [ptrq+mmsize*7], m0
+    add ptrq, 8 * mmsize
+    dec skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea stride3q, [strideq*3]
     mov cntd, 8
     mov ptrq, rsp
 .loop_2:
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 37a3ca6738..c1e13764e2 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -337,7 +337,7 @@ static void check_itxfm(void)
                 randomize_buffers();
                 ftx(coef, tx, txtp, sz, bit_depth);
 
-                for (sub = (txtp == 0) ? 1 : sz; sub <= sz; sub <<= 1) {
+                for (sub = (txtp == 0) ? 1 : 2; sub <= sz; sub <<= 1) {
                     int eob;
 
                     if (sub < sz) {
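Note on the scheme the new byte tables encode, as a minimal C sketch (illustrative only, not FFmpeg code; the names tab8x8_default, COLS_PER_GROUP, pass1 and first_pass are invented for this example): each table is indexed with eob-1 and yields how many 4-column groups the first 1-D pass actually has to run, since an eob that low guarantees every nonzero coefficient falls inside those groups. The complement of that count (out of 2 groups for 8x8, 4 for 16x16, 8 for 32x32) is kept in skip, and the slice of the intermediate buffer that the skipped iterations would have produced is cleared instead (the .loop_z blocks), which is far cheaper than transforming all-zero columns.

/* Illustrative model of the table-driven first pass; assumes the 8x8
 * case with the default scan. Not FFmpeg API. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define N              8                  /* transform size */
#define COLS_PER_GROUP 4                  /* dwords per xmm register */
#define GROUPS         (N / COLS_PER_GROUP)

static uint8_t tab8x8_default[N * N];

static void init_tables(void)
{
    /* mirrors "default_8x8: times 12 db 1 / times 52 db 2" */
    for (int i = 0; i < N * N; i++)
        tab8x8_default[i] = i < 12 ? 1 : 2;
}

/* stand-in for one IDCT8_1D iteration over a group of 4 columns */
static void pass1(int32_t *rowbuf, const int32_t *coef, int group)
{
    (void)rowbuf; (void)coef; (void)group;
}

static void first_pass(int32_t *rowbuf, const int32_t *coef, int eob)
{
    int cnt  = tab8x8_default[eob - 1]; /* movzx cntd, byte [default_8x8+cntq-1] */
    int skip = GROUPS - cnt;            /* mov skipd, 2; sub skipd, cntd */

    for (int g = 0; g < cnt; g++)
        pass1(rowbuf, coef, g);

    /* .loop_z: the skipped groups would only have produced zeros, so
     * just clear their slice of the scratch buffer */
    if (skip)
        memset(rowbuf + cnt * COLS_PER_GROUP * N, 0,
               (size_t)skip * COLS_PER_GROUP * N * sizeof(*rowbuf));
}

int main(void)
{
    int32_t rowbuf[N * N], coef[N * N] = { 0 };

    init_tables();
    first_pass(rowbuf, coef, 10); /* eob=10 -> 1 group run, 1 group zeroed */
    first_pass(rowbuf, coef, 64); /* eob=64 -> both groups run */
    printf("groups for eob 10: %d\n", tab8x8_default[10 - 1]);
    return 0;
}

The checkasm change at the end complements this: with sub starting at 2 instead of sz for non-idct transform types, the partial-eob paths of the iadst/idct hybrid functions are now exercised as well, not just those of the pure idct.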