diff --git a/common.mak b/common.mak
index ec706480d0..821e84c4c4 100644
--- a/common.mak
+++ b/common.mak
@@ -29,7 +29,8 @@ CFLAGS += $(ECFLAGS)
 CCFLAGS = $(CPPFLAGS) $(CFLAGS)
 ASFLAGS := $(CPPFLAGS) $(ASFLAGS)
 CXXFLAGS += $(CPPFLAGS) $(CFLAGS)
-YASMFLAGS += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
+YASMFLAGS += $(IFLAGS:%=%/) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
+
 HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS)
 LDFLAGS := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS)

diff --git a/configure b/configure
index ef4298a21f..98444d9758 100755
--- a/configure
+++ b/configure
@@ -1220,6 +1220,7 @@ HAVE_LIST="
     closesocket
     cmov
     cpuid
+    cpunop
     dcbzl
     dev_bktr_ioctl_bt848_h
     dev_bktr_ioctl_meteor_h
@@ -3229,6 +3230,7 @@ EOF
         die "yasm not found, use --disable-yasm for a crippled build"
     check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx
     check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4
+    check_yasm "CPU amdnop" && enable cpunop
 fi

 case "$cpu" in
diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm
index a09473bdae..bcc275be89 100644
--- a/libavcodec/x86/deinterlace.asm
+++ b/libavcodec/x86/deinterlace.asm
@@ -39,7 +39,7 @@ cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1
 %endif
     pxor mm7, mm7
     movq mm6, [pw_4]
-.nextrow
+.nextrow:
     movd mm0, [lum_m4q]
     movd mm1, [lum_m3q]
     movd mm2, [lum_m2q]
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 06d2027c69..19884a36a8 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1143,7 +1143,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
     lea lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
 %if cpuflag(avx)
     vmovaps xmm0, [src1q + 16]
     vinsertf128 m0, m0, [src1q], 1
@@ -1182,7 +1182,7 @@ VECTOR_FMUL_REVERSE
 cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
     lea lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
     mova m0, [src0q + lenq]
     mova m1, [src0q + lenq + mmsize]
     mulps m0, m0, [src1q + lenq]
@@ -1313,7 +1313,7 @@ cglobal bswap32_buf, 3,4,5
     add r0, 4
     dec r2
     jnz .loop2
-.end
+.end:
     RET

 ; %1 = aligned/unaligned
diff --git a/libavcodec/x86/dsputilenc_yasm.asm b/libavcodec/x86/dsputilenc_yasm.asm
index 1be359d667..2d805c7da8 100644
--- a/libavcodec/x86/dsputilenc_yasm.asm
+++ b/libavcodec/x86/dsputilenc_yasm.asm
@@ -184,7 +184,7 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
     call hadamard8x8_diff_%1
     add r5d, eax

-.done
+.done:
     mov eax, r5d
 %ifndef m8
     ADD rsp, pad
@@ -288,7 +288,7 @@ cglobal sse16_sse2, 5, 5, 8
     pxor m0, m0 ; mm0 = 0
     pxor m7, m7 ; mm7 holds the sum

-.next2lines ; FIXME why are these unaligned movs? pix1[] is aligned
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
     movu m1, [r1 ] ; mm1 = pix1[0][0-15]
     movu m2, [r2 ] ; mm2 = pix2[0][0-15]
     movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index faa27a01c6..79c7c183b9 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -608,7 +608,7 @@ cglobal fft_calc, 2,5,8
     add rcx, 3
     shl r2, cl
     sub r4, r2
-.loop
+.loop:
 %if mmsize == 8
     PSWAPD m0, [r4 + r2 + 4]
     mova [r4 + r2 + 4], m0
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 7368b8f518..21e0cce3b9 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -404,7 +404,7 @@ cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
     mov src1q, [srcq+gprsize]
     mov srcq, [srcq ]
     sub src1q, srcq
-.loop
+.loop:
     MOVPS m0, [srcq ]
     MOVPS m1, [srcq+src1q ]
     MOVPS m3, [srcq +mmsize]
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 4155947e58..bee833051f 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -69,7 +69,7 @@ SECTION .text

 %macro mv0_pixels_mc8 0
     lea r4, [r2*2 ]
-.next4rows
+.next4rows:
     movq mm0, [r1 ]
     movq mm1, [r1+r2]
     add r1, r4
@@ -117,7 +117,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     mv0_pixels_mc8
     REP_RET

-.at_least_one_non_zero
+.at_least_one_non_zero:
 %ifidn %2, rv40
 %if ARCH_X86_64
     mov r7, r5
@@ -145,7 +145,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     test r4d, r4d
     mov r6, r2 ; dxy = x ? 1 : stride
     jne .both_non_zero
-.my_is_zero
+.my_is_zero:
     ; mx == 0 XOR my == 0 - 1 dimensional filter only
     or r4d, r5d ; x + y

@@ -166,7 +166,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     pxor m7, m7
     psubw m4, m5 ; mm4 = A = 8-x

-.next1drow
+.next1drow:
     movq m0, [r1 ] ; mm0 = src[0..7]
     movq m2, [r1+r6] ; mm1 = src[1..8]

@@ -197,7 +197,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     jne .next1drow
     REP_RET

-.both_non_zero ; general case, bilinear
+.both_non_zero: ; general case, bilinear
     movd m4, r4d ; x
     movd m6, r5d ; y
 %ifidn %2, rv40
@@ -232,7 +232,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     movq m0, [r1 ] ; mm0 = src[0..7]
     movq m1, [r1+1] ; mm1 = src[1..8]

-.next2drow
+.next2drow:
     add r1, r2

     movq m2, m0
@@ -330,7 +330,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0
     pmullw m6, m2
     paddw m6, m0

-.next2rows
+.next2rows:
     movd m0, [r1 ]
     movd m1, [r1+1]
     add r1, r2
@@ -397,7 +397,7 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
     punpcklbw m2, m7
     pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2]

-.nextrow
+.nextrow:
     add r1, r2
     movq m1, m2
     pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
@@ -474,7 +474,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     mv0_pixels_mc8
     REP_RET

-.at_least_one_non_zero
+.at_least_one_non_zero:
     test r5d, r5d
     je .my_is_zero
     test r4d, r4d
@@ -501,7 +501,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movlhps m7, m7
     movlhps m6, m6

-.next2rows
+.next2rows:
     movq m1, [r1+r2*1 ]
     movq m2, [r1+r2*1+1]
     movq m3, [r1+r2*2 ]
@@ -535,7 +535,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     jg .next2rows
     REP_RET

-.my_is_zero
+.my_is_zero:
     mov r5d, r4d
     shl r4d, 8
     add r4, 8
@@ -545,7 +545,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     pshuflw m7, m7, 0
     movlhps m7, m7

-.next2xrows
+.next2xrows:
     movq m0, [r1 ]
     movq m1, [r1 +1]
     movq m2, [r1+r2 ]
@@ -572,7 +572,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     jg .next2xrows
     REP_RET

-.mx_is_zero
+.mx_is_zero:
     mov r4d, r5d
     shl r5d, 8
     add r5, 8
@@ -582,7 +582,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     pshuflw m7, m7, 0
     movlhps m7, m7

-.next2yrows
+.next2yrows:
     movq m0, [r1 ]
     movq m1, [r1+r2 ]
     movdqa m2, m1
@@ -632,7 +632,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     punpcklbw m0, [r1+1]
     pshufw m6, m6, 0

-.next2rows
+.next2rows:
     movd m1, [r1+r2*1 ]
     movd m3, [r1+r2*2 ]
     punpcklbw m1, [r1+r2*1+1]
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 370c7b5a46..f8a2cff68f 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -38,7 +38,7 @@ SECTION .text
 %macro MV0_PIXELS_MC8 0
     lea r4, [r2*3 ]
     lea r5, [r2*4 ]
-.next4rows
+.next4rows:
     movu m0, [r1 ]
     movu m1, [r1+r2 ]
     CHROMAMC_AVG m0, [r0 ]
@@ -72,14 +72,14 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     MV0_PIXELS_MC8
     REP_RET

-.at_least_one_non_zero
+.at_least_one_non_zero:
     mov r6d, 2
     test r5d, r5d
     je .x_interpolation
     mov r6, r2 ; dxy = x ? 1 : stride
     test r4d, r4d
     jne .xy_interpolation
-.x_interpolation
+.x_interpolation:
     ; mx == 0 XOR my == 0 - 1 dimensional filter only
     or r4d, r5d ; x + y
     movd m5, r4d
@@ -88,7 +88,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     SPLATW m5, m5 ; mm5 = B = x
     psubw m4, m5 ; mm4 = A = 8-x

-.next1drow
+.next1drow:
     movu m0, [r1 ] ; mm0 = src[0..7]
     movu m2, [r1+r6] ; mm2 = src[1..8]

@@ -107,7 +107,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     jne .next1drow
     REP_RET

-.xy_interpolation ; general case, bilinear
+.xy_interpolation: ; general case, bilinear
     movd m4, r4m ; x
     movd m6, r5m ; y

@@ -125,7 +125,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     movu m0, [r1 ] ; mm0 = src[0..7]
     movu m1, [r1+2] ; mm1 = src[1..8]

-.next2drow
+.next2drow:
     add r1, r2

     pmullw m2, m0, m4
@@ -192,7 +192,7 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
     pmullw m6, m2
     paddw m6, m0

-.next2rows
+.next2rows:
     MC4_OP m0, m6
     MC4_OP m6, m0
     sub r3d, 2
@@ -221,7 +221,7 @@ cglobal %1_h264_chroma_mc2_10, 6,7
     pxor m7, m7
     pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2]

-.nextrow
+.nextrow:
     add r1, r2
     movq m1, m2
     pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d625eee4a4..8a6bbb9d9e 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -625,7 +625,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
     shl r2d, 2
     shl r3d, 2
     LOAD_AB aa, bb, r2d, r3d
-.loop
+.loop:
     mova p2, [r4+r1]
     mova p1, [r4+2*r1]
     mova p0, [r4+r5]
@@ -676,7 +676,7 @@ cglobal deblock_h_luma_intra_10, 4,7,16
     mova m0, [pw_2]
     shl r2d, 2
     shl r3d, 2
-.loop
+.loop:
     movu q3, [r0-8]
     movu q2, [r0+r1-8]
     movu q1, [r0+r1*2-8]
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index edd603b793..7003a14add 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -308,7 +308,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -316,7 +316,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
     mov r6d, dword [r1+r5*4]
     lea r6, [r0+r6]
     IDCT4_ADD r6, r2, r3
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     cmp r5, 16
@@ -333,7 +333,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -347,7 +347,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
     mov r6d, dword [r1+r5*4]
     lea r6, [r0+r6+4]
     IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock
+.skipblock:
     add r5, 4
     add r2, 128
     cmp r5, 16
@@ -362,7 +362,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -388,11 +388,11 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     cmp r5, 16
     jl .nextblock
     REP_RET
-.no_dc
+.no_dc:
     mov r6d, dword [r1+r5*4]
     add r6, r0
     IDCT4_ADD r6, r2, r3
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     cmp r5, 16
@@ -406,7 +406,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     or r6w, word [r2]
@@ -415,7 +415,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
     mov r6d, dword [r1+r5*4]
     add r6, r0
     IDCT4_ADD r6, r2, r3
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     cmp r5, 16
@@ -429,7 +429,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -442,7 +442,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
     cmp r5, 16
     jl .nextblock
     REP_RET
-.try_dc
+.try_dc:
     movsx r6, word [r2]
     test r6, r6
     jz .skipblock
@@ -457,7 +457,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
 %if ARCH_X86_64 == 0
     mov r1, r1m
 %endif
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     cmp r5, 16
@@ -474,7 +474,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -504,7 +504,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     ADD rsp, pad
     RET

-.no_dc
+.no_dc:
     mov r6d, dword [r1+r5*4]
     add r6, r0
     add word [r2], 32
@@ -514,7 +514,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     mov r6d, dword [r1+r5*4]
     lea r6, [r0+r6+4]
     IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock
+.skipblock:
     add r5, 4
     add r2, 128
     cmp r5, 16
@@ -531,7 +531,7 @@ cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block,
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -560,7 +560,7 @@ INIT_MMX
     cmp r5, 16
     jl .nextblock
     REP_RET
-.no_dc
+.no_dc:
 INIT_XMM
     mov dst2d, dword [r1+r5*4]
     add dst2q, r0
@@ -568,7 +568,7 @@ INIT_XMM
 %if ARCH_X86_64 == 0
     mov r1, r1m
 %endif
-.skipblock
+.skipblock:
     add r5, 4
     add r2, 128
     cmp r5, 16
@@ -577,7 +577,7 @@ INIT_XMM
 INIT_MMX

 h264_idct_add8_mmx_plane:
-.nextblock
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     or r6w, word [r2]
@@ -592,7 +592,7 @@ h264_idct_add8_mmx_plane:
     add r0, dword [r1+r5*4]
 %endif
     IDCT4_ADD r0, r2, r3
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     test r5, 3
@@ -621,8 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, str
     call h264_idct_add8_mmx_plane
     RET

-h264_idct_add8_mmx2_plane
-.nextblock
+h264_idct_add8_mmx2_plane:
+.nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
     test r6, r6
@@ -641,7 +641,7 @@ h264_idct_add8_mmx2_plane
     test r5, 3
     jnz .nextblock
     rep ret
-.try_dc
+.try_dc:
     movsx r6, word [r2]
     test r6, r6
     jz .skipblock
@@ -655,7 +655,7 @@ h264_idct_add8_mmx2_plane
     add r0, dword [r1+r5*4]
 %endif
     DC_ADD_MMX2_OP movh, r0, r3, r6
-.skipblock
+.skipblock:
     inc r5
     add r2, 32
     test r5, 3
@@ -734,7 +734,7 @@ h264_add8x4_idct_sse2:
     add r0, r0m
 %endif
     call h264_add8x4_idct_sse2
-.cycle%1end
+.cycle%1end:
 %if %1 < 7
     add r2, 64
 %endif
@@ -770,7 +770,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
 %endif
     call h264_add8x4_idct_sse2
     jmp .cycle%1end
-.try%1dc
+.try%1dc:
     movsx r0, word [r2 ]
     or r0w, word [r2+32]
     jz .cycle%1end
@@ -781,7 +781,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
     add r0, r0m
 %endif
     call h264_idct_dc_add8_mmx2
-.cycle%1end
+.cycle%1end:
 %if %1 < 7
     add r2, 64
 %endif
@@ -817,7 +817,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
 %endif
     call h264_add8x4_idct_sse2
     jmp .cycle%1end
-.try%1dc
+.try%1dc:
     movsx r0, word [r2 ]
     or r0w, word [r2+32]
     jz .cycle%1end
@@ -830,7 +830,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
     add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
     call h264_idct_dc_add8_mmx2
-.cycle%1end
+.cycle%1end:
 %if %1 == 1
     add r2, 384+64
 %elif %1 < 3
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 67bc68ec5c..3d4ac5fb71 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -225,7 +225,7 @@ IDCT8_DC_ADD
 ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
 %macro AC 1
-.ac%1
+.ac%1:
     mov r5d, [r1+(%1+0)*4]
     call add4x4_idct %+ SUFFIX
     mov r5d, [r1+(%1+1)*4]
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index fd78456294..609cb2303d 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -484,7 +484,7 @@ cglobal pred16x16_plane_%1, 2,9,7
 %endif
     mov r4, 8

-.loop
+.loop:
     mova m3, m0 ; b[0..7]
     mova m4, m2 ; b[8..15]
     psraw m3, 5
@@ -680,7 +680,7 @@ cglobal pred8x8_plane, 2,9,7

     mov r4, 4
 ALIGN 16
-.loop
+.loop:
 %if mmsize == 16
     mova m3, m0 ; b[0..7]
     paddw m0, m1
@@ -1045,7 +1045,7 @@ cglobal pred8x8l_top_dc_%1, 4,4
     psrlq mm5, 56
     psllq mm5, 56
     pxor mm1, mm5
-.body
+.body:
     PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
     psadbw mm7, mm0
     paddw mm7, [pw_4]
@@ -1141,7 +1141,7 @@ cglobal pred8x8l_dc_%1, 4,5
     jz .fix_lt_2
     test r2, r2
     jz .fix_tr_1
-.body
+.body:
     lea r1, [r0+r3*2]
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     pxor mm0, mm0
@@ -1276,7 +1276,7 @@ cglobal pred8x8l_vertical_%1, 4,4
     psrlq mm5, 56
     psllq mm5, 56
     pxor mm1, mm5
-.body
+.body:
     PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
 %rep 3
     movq [r0+r3*1], mm0
@@ -1576,7 +1576,7 @@ cglobal pred8x8l_down_right_mmxext, 4,5
     psllq mm5, 56
     pxor mm1, mm5
     jmp .do_top
-.body
+.body:
     lea r1, [r0+r3*2]
     movq mm1, mm7
     movq mm7, mm5
@@ -1822,7 +1822,7 @@ cglobal pred8x8l_vertical_right_mmxext, 4,5
     jz .fix_lt_2
     test r2, r2
     jz .fix_tr_1
-.do_top
+.do_top:
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     lea r1, [r0+r3*2]
     movq mm2, mm6
@@ -1931,7 +1931,7 @@ cglobal pred8x8l_vertical_right_%1, 4,5,7
     jz .fix_lt_2
     test r2, r2
     jz .fix_tr_1
-.do_top
+.do_top:
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     lea r1, [r0+r3*2]
     movq2dq xmm4, mm6
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index 788d715d61..f380cfc77b 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -264,7 +264,7 @@ cglobal_mc %1, %2, mc20, %3, 3,4,9
 %else
     %define p16 [pw_16]
 %endif
-.nextrow
+.nextrow:
 %if %0 == 4
     movu m2, [r1-4]
     movu m3, [r1-2]
@@ -330,7 +330,7 @@ MC_CACHE MC30
 %macro MC10 3-4
 cglobal_mc %1, %2, mc10, %3, 3,5,9
     mov r4, r1
-.body
+.body:
     mov r3d, %3
     mova m1, [pw_pixel_max]
 %if num_mmregs > 8
@@ -339,7 +339,7 @@ cglobal_mc %1, %2, mc10, %3, 3,5,9
 %else
     %define p16 [pw_16]
 %endif
-.nextrow
+.nextrow:
 %if %0 == 4
     movu m2, [r1-4]
     movu m3, [r1-2]
@@ -446,7 +446,7 @@ MC MC02
 %macro MC01 3
 cglobal_mc %1, %2, mc01, %3, 3,5,8
     mov r4, r1
-.body
+.body:
     PRELOAD_V

     sub r4, r2
@@ -535,7 +535,7 @@ SWAP 0,1,2,3,4,5
 ; this REALLY needs x86_64
 cglobal_mc %1, %2, mc11, %3, 3,6,8
     mov r4, r1
-.body
+.body:
     PRELOAD_V

     sub r0, r2
@@ -778,7 +778,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12
     call put_hv%3_10_%1
     xor r4d, r4d

-.body
+.body:
     mov r3d, %3
     pxor m0, m0
     mova m7, [pw_pixel_max]
@@ -837,7 +837,7 @@ put_h%2_10_%1:
     mov r3d, %2
     xor r4d, r4d
     mova m6, [pad20]
-.nextrow
+.nextrow:
     movu m2, [r5-4]
     movu m3, [r5-2]
     movu m4, [r5+0]
@@ -864,7 +864,7 @@ H_NRD sse2 , 8
 %macro MC21 3
 cglobal_mc %1, %2, mc21, %3, 3,7,12
     mov r5, r1
-.body
+.body:
 %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
     mov r6, rsp ; backup stack pointer
     and rsp, ~(mmsize-1) ; align stack
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 26233bf080..2ad5a8f458 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -73,7 +73,7 @@ SECTION .text
 INIT_MMX
 cglobal h264_weight_16_mmx2, 6, 6, 0
     WEIGHT_SETUP
-.nextrow
+.nextrow:
     WEIGHT_OP 0, 4
     mova [r0 ], m0
     WEIGHT_OP 8, 12
@@ -86,7 +86,7 @@ cglobal h264_weight_16_mmx2, 6, 6, 0
 %macro WEIGHT_FUNC_MM 3
 cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-.nextrow
+.nextrow:
     WEIGHT_OP 0, mmsize/2
     mova [r0], m0
     add r0, r1
@@ -105,7 +105,7 @@ cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
     sar r2d, 1
     lea r3, [r1*2]
-.nextrow
+.nextrow:
     WEIGHT_OP 0, r1
     movh [r0], m0
 %if mmsize == 16
@@ -178,7 +178,7 @@ INIT_MMX
 cglobal h264_biweight_16_mmx2, 7, 8, 0
     BIWEIGHT_SETUP
     movifnidn r3d, r3m
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, 4
     BIWEIGHT_STEPB
@@ -197,7 +197,7 @@ cglobal h264_biweight_16_mmx2, 7, 8, 0
 cglobal h264_biweight_%1_%3, 7, 8, %2
     BIWEIGHT_SETUP
     movifnidn r3d, r3m
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, mmsize/2
     BIWEIGHT_STEPB
@@ -220,7 +220,7 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
     movifnidn r3d, r3m
     sar r3, 1
     lea r4, [r2*2]
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, r2
     BIWEIGHT_STEPB
@@ -288,7 +288,7 @@ cglobal h264_biweight_16_ssse3, 7, 8, 8
     BIWEIGHT_SSSE3_SETUP
     movifnidn r3d, r3m

-.nextrow
+.nextrow:
     movh m0, [r0]
     movh m2, [r0+8]
     movh m3, [r1+8]
@@ -309,7 +309,7 @@ cglobal h264_biweight_8_ssse3, 7, 8, 8
     sar r3, 1
     lea r4, [r2*2]

-.nextrow
+.nextrow:
     movh m0, [r0]
     movh m1, [r1]
     movh m2, [r0+r2]
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index dec9aba461..24386f882e 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -40,7 +40,7 @@ SECTION .text
 ; int weight, int offset);
 ;-----------------------------------------------------------------------------
 %macro WEIGHT_PROLOGUE 0
-.prologue
+.prologue:
     PROLOGUE 0,6,8
     movifnidn r0, r0mp
     movifnidn r1d, r1m
@@ -93,7 +93,7 @@ SECTION .text
 cglobal h264_weight_16_10_%1
     WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
-.nextrow
+.nextrow:
     WEIGHT_OP %1, 0
     mova [r0 ], m5
     WEIGHT_OP %1, 16
@@ -113,7 +113,7 @@ WEIGHT_FUNC_DBL sse4
 cglobal h264_weight_8_10_%1
     WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
-.nextrow
+.nextrow:
     WEIGHT_OP %1, 0
     mova [r0], m5
     add r0, r1
@@ -133,7 +133,7 @@ cglobal h264_weight_4_10_%1
     sar r2d, 1
     WEIGHT_SETUP %1
     lea r3, [r1*2]
-.nextrow
+.nextrow:
     WEIGHT_OP %1, 0, r1
     movh [r0], m5
     movhps [r0+r1], m5
@@ -159,7 +159,7 @@ DECLARE_REG_TMP 7
 %endif

 %macro BIWEIGHT_PROLOGUE 0
-.prologue
+.prologue:
     PROLOGUE 0,8,8
     movifnidn r0, r0mp
     movifnidn r1, r1mp
@@ -221,7 +221,7 @@ DECLARE_REG_TMP 7
 cglobal h264_biweight_16_10_%1
     BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-.nextrow
+.nextrow:
     BIWEIGHT %1, 0
     mova [r0 ], m0
     BIWEIGHT %1, 16
@@ -241,7 +241,7 @@ BIWEIGHT_FUNC_DBL sse4
 cglobal h264_biweight_8_10_%1
     BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-.nextrow
+.nextrow:
     BIWEIGHT %1, 0
     mova [r0], m0
     add r0, r2
@@ -261,7 +261,7 @@ cglobal h264_biweight_4_10_%1
     BIWEIGHT_SETUP %1
     sar r3d, 1
     lea r4, [r2*2]
-.nextrow
+.nextrow:
     BIWEIGHT %1, 0, r2
     movh [r0 ], m0
     movhps [r0+r2], m0
diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm
index 21ff041d61..bc09903575 100644
--- a/libavcodec/x86/vp56dsp.asm
+++ b/libavcodec/x86/vp56dsp.asm
@@ -139,7 +139,7 @@ cglobal vp6_filter_diag4, 5, 7, 8
     mov r3, rsp
     mov r6, 11

-.nextrow
+.nextrow:
     DIAG4 r1, -1, 0, 1, 2, r3
     add r3, 8
     add r1, r2
@@ -151,7 +151,7 @@ cglobal vp6_filter_diag4, 5, 7, 8
     lea r3, [rsp+8]
     mov r6, 8

-.nextcol
+.nextcol:
     DIAG4 r3, -8, 0, 8, 16, r0
     add r3, 8
     add r0, r2
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 75c0b56f94..19853c494f 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
     mova m6, [sixtap_filter_hb+mxq*8-32]
     mova m7, [sixtap_filter_hb+mxq*8-16]

-.nextrow
+.nextrow:
     movu m0, [srcq-2]
     mova m1, m0
     mova m2, m0
@@ -229,7 +229,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
     mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
     mova m6, [fourtap_filter_hb+mxq]

-.nextrow
+.nextrow:
     movu m0, [srcq-1]
     mova m1, m0
     pshufb m0, m3
@@ -264,7 +264,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     movh m2, [srcq+2*srcstrideq]
     add srcq, srcstrideq

-.nextrow
+.nextrow:
     movh m3, [srcq+2*srcstrideq] ; read new row
     mova m4, m0
     mova m0, m1
@@ -304,7 +304,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     movh m3, [srcq]
     movh m4, [srcq+srcstrideq]

-.nextrow
+.nextrow:
     movh m5, [srcq+2*srcstrideq] ; read new row
     mova m6, m0
     punpcklbw m6, m5
@@ -350,7 +350,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
     movq mm7, [pw_64]
     pxor mm6, mm6

-.nextrow
+.nextrow:
     movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels

     ; first set of 2 pixels
@@ -399,7 +399,7 @@ cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
     movq mm7, [pw_64]
     pxor mm3, mm3

-.nextrow
+.nextrow:
     movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels

     ; first set of 2 pixels
@@ -459,7 +459,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
     mova m8, [mxq+32]
     mova m9, [mxq+48]
 %endif
-.nextrow
+.nextrow:
     movq m0, [srcq-1]
     movq m1, [srcq-0]
     movq m2, [srcq+1]
@@ -510,7 +510,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
     mova m12, [mxq+64]
     mova m13, [mxq+80]
 %endif
-.nextrow
+.nextrow:
     movq m0, [srcq-2]
     movq m1, [srcq-1]
     movq m2, [srcq-0]
@@ -577,7 +577,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     punpcklbw m1, m7
     punpcklbw m2, m7

-.nextrow
+.nextrow:
     ; first calculate negative taps (to prevent losing positive overflows)
     movh m4, [srcq+2*srcstrideq] ; read new row
     punpcklbw m4, m7
@@ -635,7 +635,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     punpcklbw m3, m7
     punpcklbw m4, m7

-.nextrow
+.nextrow:
     ; first calculate negative taps (to prevent losing positive overflows)
     mova m5, m1
     pmullw m5, [myq+16]
@@ -689,7 +689,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
     mova m5, [bilinear_filter_vw+myq-1*16]
     neg myq
     mova m4, [bilinear_filter_vw+myq+7*16]
-.nextrow
+.nextrow:
     movh m0, [srcq+srcstrideq*0]
     movh m1, [srcq+srcstrideq*1]
     movh m3, [srcq+srcstrideq*2]
@@ -733,7 +733,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
     mova m5, [bilinear_filter_vw+mxq-1*16]
     neg mxq
     mova m4, [bilinear_filter_vw+mxq+7*16]
-.nextrow
+.nextrow:
     movh m0, [srcq+srcstrideq*0+0]
     movh m1, [srcq+srcstrideq*0+1]
     movh m2, [srcq+srcstrideq*1+0]
@@ -783,7 +783,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
 %endif
     pxor m4, m4
     mova m3, [bilinear_filter_vb+myq-16]
-.nextrow
+.nextrow:
     movh m0, [srcq+srcstrideq*0]
     movh m1, [srcq+srcstrideq*1]
     movh m2, [srcq+srcstrideq*2]
@@ -820,7 +820,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
     pxor m4, m4
     mova m2, [filter_h2_shuf]
     mova m3, [bilinear_filter_vb+mxq-16]
-.nextrow
+.nextrow:
     movu m0, [srcq+srcstrideq*0]
     movu m1, [srcq+srcstrideq*1]
     pshufb m0, m2
@@ -1488,7 +1488,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %endif

 %if mmsize == 8 ; mmx / mmxext
-.next8px
+.next8px:
 %endif
 %ifidn %1, v
     ; read 4 half/full rows of pixels
diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm
index 8aeac8b242..bab4292e13 100644
--- a/libavresample/x86/audio_mix.asm
+++ b/libavresample/x86/audio_mix.asm
@@ -361,7 +361,7 @@ cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, s
     mov src0q, [src0q]
     add src0q, lenq
     neg lenq
-.loop
+.loop:
     ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
     ; pointers, so we have to load some of them from the stack each time
 %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 7a18a20aca..ce823302f6 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -32,7 +32,7 @@ SECTION .text
 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
     lea lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
     mova m0, [src0q + lenq]
     mova m1, [src0q + lenq + mmsize]
     mulps m0, m0, [src1q + lenq]
@@ -74,7 +74,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 %endif
 %endif
     lea lenq, [lend*4-2*mmsize]
-.loop
+.loop:
     mulps m1, m0, [srcq+lenq ]
     mulps m2, m0, [srcq+lenq+mmsize]
     addps m1, m1, [dstq+lenq ]
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d29740f278..01be0f8fc0 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -110,12 +110,14 @@ default rel
 %endif

+%macro CPUNOP 1
+    %if HAVE_CPUNOP
+        CPU %1
+    %endif
+%endmacro
+
 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-; Not supported by NASM (except via smartalign package + ALIGNMODE k8,
-; however that fails when used together with the -M option)
-%ifdef __YASM_VER__
-CPU amdnop
-%endif
+CPUNOP amdnop


 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
@@ -522,22 +524,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
 ; subsequent uses of the function name automatically refer to the mangled version.
 ; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
-    ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-    %ifndef cpuname
-        cglobal_internal %1
-    %else
-        cglobal_internal %1 %+ SUFFIX
-    %endif
-%else
-    ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-    %ifndef cpuname
-        cglobal_internal %1, %2
-    %else
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
         cglobal_internal %1 %+ SUFFIX, %2
-    %endif
-%endif
 %endmacro
 %macro cglobal_internal 1-2+
     %ifndef cglobaled_%1
@@ -555,7 +543,7 @@
 %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
     %assign stack_offset 0
-    %if %0 > 1
+    %ifnidn %2, ""
         PROLOGUE %2
     %endif
 %endmacro
@@ -622,9 +610,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
 %macro INIT_CPUFLAGS 0-2
-%ifdef __YASM_VER__
-    CPU amdnop
-%endif
+    CPUNOP amdnop
 %if %0 >= 1
     %xdefine cpuname %1
     %assign cpuflags cpuflags_%1
@@ -648,7 +634,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endif
 %ifdef __YASM_VER__
     %if notcpuflag(mmx2)
-        CPU basicnop
+        CPUNOP basicnop
     %endif
 %endif
 %else
@@ -826,18 +812,13 @@ INIT_XMM
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-    %ifndef cpuname
-        call_internal %1, %1
-    %else
-        call_internal %1, %1 %+ SUFFIX
-    %endif
+    call_internal %1 %+ SUFFIX, %1
 %endmacro
 %macro call_internal 2
-    %xdefine %%i %1
-    %ifndef cglobaled_%1
-        %ifdef cglobaled_%2
-            %xdefine %%i %2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
         %endif
     %endif
     call %%i
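-- 
Note, not part of the patch: the label churn above exists because yasm
silently accepts a bare local label, while nasm emits its orphan-labels
warning ("label alone on a line without a colon might be in error"), which
becomes fatal with -Werror; the trailing colon is accepted by both
assemblers. Likewise, "CPU amdnop" is a yasm extension nasm rejects, hence
the new HAVE_CPUNOP configure gate. A minimal sketch of the syntax issue,
assuming nasm/yasm on PATH and an ELF64 target (file and label names are
illustrative):

    ; labels.asm -- nasm -f elf64 labels.asm  (or: yasm -f elf64 labels.asm)
    SECTION .text
    global count_down
    count_down:
    .loop                   ; bare local label: fine in yasm, warns in nasm
        dec rdi
        jnz .loop
    .done:                  ; with trailing colon: accepted by both
        ret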