diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index 8318a81c5e..3be0ff7757 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -52,7 +52,7 @@ SECTION .text %macro idct_fn 0 cglobal prores_idct_put_10, 4, 4, 15, pixels, lsize, block, qmat - IDCT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3 + IDCT_FN pw_1, 15, pw_88, 18, "put", pw_4, pw_1019, r3 RET %endmacro diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index 7cfd33eaa3..1a5a2eae9b 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -69,24 +69,24 @@ SECTION .text %macro idct_fn 0 cglobal simple_idct10, 1, 1, 16, block - IDCT_FN "", 12, "", 19 + IDCT_FN "", 12, "", 19, "store" RET cglobal simple_idct10_put, 3, 3, 16, pixels, lsize, block - IDCT_FN "", 12, "", 19, 0, pw_1023 + IDCT_FN "", 12, "", 19, "put", 0, pw_1023 RET cglobal simple_idct12, 1, 1, 16, block ; coeffs are already 15bits, adding the offset would cause ; overflow in the input - IDCT_FN "", 15, pw_2, 16 + IDCT_FN "", 15, pw_2, 16, "store" RET cglobal simple_idct12_put, 3, 3, 16, pixels, lsize, block ; range isn't known, so the C simple_idct range is used ; Also, using a bias on input overflows, so use the bias ; on output of the first butterfly instead - IDCT_FN "", 15, pw_2, 16, 0, pw_4095 + IDCT_FN "", 15, pw_2, 16, "put", 0, pw_4095 RET %endmacro diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm index 3f398985a5..8367011dfd 100644 --- a/libavcodec/x86/simple_idct10_template.asm +++ b/libavcodec/x86/simple_idct10_template.asm @@ -218,11 +218,12 @@ ; %2 = row bias macro ; %3 = column shift ; %4 = column bias macro -; %5 = min pixel value -; %6 = max pixel value -; %7 = qmat (for prores) +; %5 = final action (nothing, "store", "put", "add") +; %6 = min pixel value +; %7 = max pixel value +; %8 = qmat (for prores) -%macro IDCT_FN 4-7 +%macro IDCT_FN 4-8 ; for (i = 0; i < 8; i++) ; 
idctRowCondDC(block + i*8); mova m10,[blockq+ 0] ; { row[0] }[0-7] @@ -230,13 +231,13 @@ mova m13,[blockq+64] ; { row[4] }[0-7] mova m12,[blockq+96] ; { row[6] }[0-7] -%if %0 == 7 - pmullw m10,[%7+ 0] - pmullw m8, [%7+32] - pmullw m13,[%7+64] - pmullw m12,[%7+96] +%if %0 == 8 + pmullw m10,[%8+ 0] + pmullw m8, [%8+32] + pmullw m13,[%8+64] + pmullw m12,[%8+96] - IDCT_1D %1, %2, %7 + IDCT_1D %1, %2, %8 %else IDCT_1D %1, %2 %endif @@ -257,7 +258,8 @@ IDCT_1D %3, %4 ; clip/store -%if %0 == 4 +%if %0 >= 5 +%ifidn %5,"store" ; No clamping, means pure idct mova [blockq+ 0], m8 mova [blockq+ 16], m0 @@ -267,13 +269,13 @@ mova [blockq+ 80], m11 mova [blockq+ 96], m9 mova [blockq+112], m10 -%else -%ifidn %5, 0 +%elifidn %5,"put" +%ifidn %6, 0 pxor m3, m3 %else - mova m3, [%5] -%endif - mova m5, [%6] + mova m3, [%6] +%endif ; ifidn %6, 0 + mova m5, [%7] pmaxsw m8, m3 pmaxsw m0, m3 pmaxsw m1, m3 @@ -301,7 +303,8 @@ mova [r0+r1 ], m11 mova [r0+r1*2], m9 mova [r0+r2 ], m10 -%endif +%endif ; %5 action +%endif ; if %0 >= 5 %endmacro %endif