x86: replace explicit REP_RETs with RETs

From x86inc:
> On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> a branch or a branch target. So switch to a 2-byte form of ret in that case.
> We can automatically detect "follows a branch", but not a branch target.
> (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)

x86inc can automatically determine whether to use REP_RET rather than RET in most of these cases, so impact is minimal. Additionally, a few REP_RETs were used unnecessarily, despite the return being nowhere near a branch.

The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively.

In the future, everyone involved with x86inc should consider dropping REP_RETs altogether.
2025-04-01 22:49:21 +00:00 · 2023-02-01 02:26:20 +01:00 · 2023-02-01 02:26:20 +01:00 · bbe95f7353
commit bbe95f7353
parent fc9a3b584d
61 changed files with 223 additions and 223 deletions
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@ -49,7 +49,7 @@ align 16
    add  dstq, mmsize
    add    nq, mmsize*2
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -83,7 +83,7 @@ align 16
    add   src2q, mmsize
    add      nq, mmsize*2
    jl .loop
-    REP_RET
+    RET

 ;***********************************************************************
 ;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
@ -116,7 +116,7 @@ align 16
    movhps [rq+nq], m2
    add      nq, 8
    jl .loop
-    REP_RET
+    RET

 ;***************************************************************************
 ;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
@ -164,7 +164,7 @@ align 16
    movhps [rq+nq], m2
    add      nq, 8
    jl .loop
-    REP_RET
+    RET

 ;**********************************************************
 ;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
@ -484,7 +484,7 @@ align 16
    add    outq, strideq
    add      nq, 64
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@ -60,7 +60,7 @@ cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    sub        expnq, mmsize
    jg .nextexp
 .end:
-    REP_RET
+    RET
 %endmacro

 %define LOOP_ALIGN ALIGN 16
@ -126,7 +126,7 @@ cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    sub      lenq, 16
 %endif
    ja .loop
-    REP_RET
+    RET

 ;------------------------------------------------------------------------------
 ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
@ -220,7 +220,7 @@ cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len

    add     lenq, 4
    jl .loop
-    REP_RET
+    RET
 %endmacro

 %if HAVE_SSE2_EXTERNAL
--- a/libavcodec/x86/alacdsp.asm
+++ b/libavcodec/x86/alacdsp.asm
@ -100,7 +100,7 @@ align 16

    add     lenq, mmsize*2
    jl .loop
-    REP_RET
+    RET

 %if ARCH_X86_64
 cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
@ -130,4 +130,4 @@ align 16

    add     lenq, mmsize*2
    jl .loop
-    REP_RET
+    RET
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@ -123,7 +123,7 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/dirac_dwt.asm
+++ b/libavcodec/x86/dirac_dwt.asm
@ -75,7 +75,7 @@ cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
-    REP_RET
+    RET

 ; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 ;                                  int width)
@ -93,7 +93,7 @@ cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
-    REP_RET
+    RET

 ; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 ;                               IDWTELEM *b3, IDWTELEM *b4, int width)
@ -110,7 +110,7 @@ cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
-    REP_RET
+    RET

 ; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 ;                                IDWTELEM *b3, IDWTELEM *b4, int width)
@ -139,7 +139,7 @@ cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
-    REP_RET
+    RET

 ; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
 cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
@ -159,7 +159,7 @@ cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
-    REP_RET
+    RET
 %endmacro

 ; extend the left and right edges of the tmp array by %1 and %2 respectively
@ -225,7 +225,7 @@ cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    cmp     xq, w2q
    jl      .highpass_loop
 .end:
-    REP_RET
+    RET
 %endmacro


@ -290,7 +290,7 @@ cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    cmp     xd, w2d
    jl      .highpass_loop
 .end:
-    REP_RET
+    RET


 INIT_XMM
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@ -475,7 +475,7 @@ cglobal fft_calc, 2,5,8
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
-    REP_RET
+    RET

 %endif

@ -510,7 +510,7 @@ cglobal fft_calc, 2,5,8
    add      r2, mmsize*2
    jl       .loop
 .end:
-    REP_RET
+    RET

 cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
@ -543,7 +543,7 @@ cglobal fft_permute, 2,7,1
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
-    REP_RET
+    RET

 INIT_XMM sse
 cglobal imdct_calc, 3,5,3
@ -583,7 +583,7 @@ cglobal imdct_calc, 3,5,3
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
-    REP_RET
+    RET

 %ifdef PIC
 %define SECTION_REL - $$
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@ -79,7 +79,7 @@ ALIGN 16
    movd   [decodedq+4], m1
    jg .loop_sample
 .ret:
-    REP_RET
+    RET
 %endmacro

 %if HAVE_XOP_EXTERNAL
@ -133,7 +133,7 @@ align 16
    mova [outq + lenq], m%2
    add      lenq, 16
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -177,7 +177,7 @@ align 16
    add      outq, mmsize*2
    sub      lend, mmsize/4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -302,7 +302,7 @@ align 16
    add      outq, mmsize*REPCOUNT
    sub      lend, mmsize/4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@ -112,7 +112,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
-    REP_RET
+    RET

 .at_least_one_non_zero:
 %ifidn %2, rv40
@ -192,7 +192,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    add           r1, r2
    dec           r3d
    jne .next1drow
-    REP_RET
+    RET

 .both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
@ -365,7 +365,7 @@ cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
-    REP_RET
+    RET
 %endmacro

 %macro chroma_mc2_mmx_func 2
@ -407,7 +407,7 @@ cglobal %1_%2_chroma_mc2, 6, 7, 0
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 %define rnd_1d_h264 pw_4
@ -453,7 +453,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
-    REP_RET
+    RET

 .at_least_one_non_zero:
    test         r5d, r5d
@ -514,7 +514,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
-    REP_RET
+    RET

 .my_is_zero:
    mov          r5d, r4d
@ -551,7 +551,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
-    REP_RET
+    RET

 .mx_is_zero:
    mov          r4d, r5d
@ -588,7 +588,7 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
-    REP_RET
+    RET
 %endmacro

 %macro chroma_mc4_ssse3_func 2
@ -638,7 +638,7 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
-    REP_RET
+    RET
 %endmacro

 %define CHROMAMC_AVG NOTHING
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@ -67,7 +67,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
-    REP_RET
+    RET

 .at_least_one_non_zero:
    mov          r6d, 2
@ -102,7 +102,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
    add           r1, r2
    dec           r3d
    jne .next1drow
-    REP_RET
+    RET

 .xy_interpolation: ; general case, bilinear
    movd          m4, r4m         ; x
@ -144,7 +144,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
    add           r0, r2
    dec          r3d
    jne .next2drow
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@ -194,7 +194,7 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
    MC4_OP m6, m0
    sub   r3d, 2
    jnz .next2rows
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@ -234,7 +234,7 @@ cglobal %1_h264_chroma_mc2_10, 6,7
    add           r0, r2
    dec          r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 %macro NOTHING 2-3
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@ -372,7 +372,7 @@ cglobal deblock_v_luma_10, 5,5,15
    add         r4, 2
    dec         r3
    jg .loop
-    REP_RET
+    RET

 cglobal deblock_h_luma_10, 5,7,15
    shl        r2d, 2
@ -411,7 +411,7 @@ cglobal deblock_h_luma_10, 5,7,15
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -648,7 +648,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
    add     r4, mmsize
    dec     r6
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@ -354,7 +354,7 @@ INIT_MMX cpuname
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
-    REP_RET
+    RET
 .no_dc:
 INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
@ -368,7 +368,7 @@ INIT_XMM cpuname
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
-    REP_RET
+    RET

 INIT_MMX mmx
 h264_idct_add8_mmx_plane:
@ -508,7 +508,7 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
-REP_RET
+RET

 %macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
@ -555,7 +555,7 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
-REP_RET
+RET

 %macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
@ -610,7 +610,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
 %endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
-REP_RET
+RET

 ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@ -155,7 +155,7 @@ cglobal h264_idct_add16_10, 5,6
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -292,7 +292,7 @@ cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
-    REP_RET
+    RET
    AC 8
    AC 10
    AC 12
@ -335,7 +335,7 @@ cglobal h264_idct_add8_10,5,8,7
 %endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
-    REP_RET
+    RET
    AC 16
    AC 18
    AC 32
@ -384,7 +384,7 @@ cglobal h264_idct_add8_422_10, 5, 8, 7
    ADD16_OP_INTRA 34, 4+12*8
    ADD16_OP_INTRA 40, 4+13*8 ; i+4
    ADD16_OP_INTRA 42, 4+14*8 ; i+4
-REP_RET
+RET
    AC 16
    AC 18
    AC 24 ; i+4
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@ -62,7 +62,7 @@ cglobal pred16x16_vertical_8, 2,3
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
@ -95,7 +95,7 @@ cglobal pred16x16_horizontal_8, 2,3
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -146,7 +146,7 @@ cglobal pred16x16_dc_8, 2,7
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -192,7 +192,7 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
-    REP_RET
+    RET

 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
@ -228,7 +228,7 @@ cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    lea                       dstq, [dstq+strideq*4]
    dec                 iterationd
    jg .loop
-    REP_RET
+    RET
 %endif

 ;-----------------------------------------------------------------------------
@ -427,7 +427,7 @@ cglobal pred16x16_plane_%1_8, 2,9,7
    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -556,7 +556,7 @@ ALIGN 16
    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -599,7 +599,7 @@ cglobal pred8x8_horizontal_8, 2,3
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -737,7 +737,7 @@ cglobal pred8x8_dc_rv40_8, 2,7
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
@ -770,7 +770,7 @@ cglobal pred8x8_tm_vp8_8, 2,6,4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
-    REP_RET
+    RET

 INIT_XMM ssse3
 cglobal pred8x8_tm_vp8_8, 2,3,6
@ -797,7 +797,7 @@ cglobal pred8x8_tm_vp8_8, 2,3,6
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
-    REP_RET
+    RET

 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
@ -1802,7 +1802,7 @@ cglobal pred4x4_tm_vp8_8, 3,6
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
-    REP_RET
+    RET

 INIT_XMM ssse3
 cglobal pred4x4_tm_vp8_8, 3,3
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@ -327,7 +327,7 @@ cglobal pred8x8_horizontal_10, 2, 3
    lea          r0, [r0+r1*2]
    dec          r2d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
@ -481,7 +481,7 @@ cglobal pred8x8_plane_10, 2, 7, 7
    add       r0, r1
    dec r2d
    jg .loop
-    REP_RET
+    RET


 ;-----------------------------------------------------------------------------
@ -994,7 +994,7 @@ cglobal pred16x16_vertical_10, 2, 3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
@ -1012,7 +1012,7 @@ cglobal pred16x16_horizontal_10, 2, 3
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
@ -1048,7 +1048,7 @@ cglobal pred16x16_dc_10, 2, 6
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
@ -1070,7 +1070,7 @@ cglobal pred16x16_top_dc_10, 2, 3
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
@ -1101,7 +1101,7 @@ cglobal pred16x16_left_dc_10, 2, 6
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
@ -1116,4 +1116,4 @@ cglobal pred16x16_128_dc_10, 2,3
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
-    REP_RET
+    RET
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@ -211,7 +211,7 @@ cglobal %1_h264_qpel16_mc00_10, 3,4
    lea            r1, [r1+r2*2]
    dec r3d
    jg .loop
-    REP_RET
+    RET
 %endmacro

 %define OP_MOV mova
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@ -89,7 +89,7 @@ cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    add           r1, r3
    dec          r4d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -149,7 +149,7 @@ cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    add           r1, r3
    dec          r4d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -192,7 +192,7 @@ cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    add           r0, r2
    dec          r4d
    jne        .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
@ -239,7 +239,7 @@ cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    add           r2, r4
    dec          r5d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -303,7 +303,7 @@ cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    add           r2, r4
    dec          r5d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -350,7 +350,7 @@ cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Strid
    add           r2, r4
    dec          r5d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
@ -458,7 +458,7 @@ cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride,
    FILT_V        %1
    FILT_V        %1
 .end:
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -531,7 +531,7 @@ cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    add           r1, r2
    dec          r3d
    jnz        .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -574,7 +574,7 @@ cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    FILT_HV    14*48
    FILT_HV    15*48
 .end:
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -619,7 +619,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    add           r0, r2
    dec          r4d
    jne        .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -710,7 +710,7 @@ cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, s
    dec          r4d
    jne        .op16
 .done:
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
@ -776,7 +776,7 @@ cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    lea           r0, [r0+2*r3]
    sub          r5d, 2
    jne        .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -845,7 +845,7 @@ cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2S
    add           r2, r4
    dec          r5d
    jg         .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@ -79,7 +79,7 @@ cglobal h264_weight_%1, 6, 6, %2
    add        r0, r1
    dec        r2d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -102,7 +102,7 @@ cglobal h264_weight_%1, 6, 6, %2
    add        r0, r3
    dec        r2d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -196,7 +196,7 @@ cglobal h264_biweight_%1, 7, 8, %2
    add        r1, r2
    dec        r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -223,7 +223,7 @@ cglobal h264_biweight_%1, 7, 8, %2
    add        r1, r4
    dec        r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -258,7 +258,7 @@ cglobal h264_biweight_16, 7, 8, 8
    add        r1, r2
    dec        r3d
    jnz .nextrow
-    REP_RET
+    RET

 INIT_XMM ssse3
 cglobal h264_biweight_8, 7, 8, 8
@ -281,4 +281,4 @@ cglobal h264_biweight_8, 7, 8, 8
    add        r1, r4
    dec        r3d
    jnz .nextrow
-    REP_RET
+    RET
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@ -101,7 +101,7 @@ cglobal h264_weight_16_10
    add       r0, r1
    dec       r2d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -120,7 +120,7 @@ cglobal h264_weight_8_10
    add        r0, r1
    dec        r2d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -142,7 +142,7 @@ cglobal h264_weight_4_10
    add         r0, r3
    dec         r2d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -234,7 +234,7 @@ cglobal h264_biweight_16_10
    add       r1, r2
    dec       r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -253,7 +253,7 @@ cglobal h264_biweight_8_10
    add      r1, r2
    dec      r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -275,7 +275,7 @@ cglobal h264_biweight_4_10
    add         r1, r4
    dec         r3d
    jnz .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/hevc_sao.asm
+++ b/libavcodec/x86/hevc_sao.asm
@ -166,7 +166,7 @@ INIT_YMM cpuname
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
-    REP_RET
+    RET
 %endmacro


--- a/libavcodec/x86/hevc_sao_10bit.asm
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@ -145,7 +145,7 @@ align 16
    add             srcq, srcstrideq
    dec          heightd
    jg .loop
-    REP_RET
+    RET
 %endmacro

 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@ -78,7 +78,7 @@ cglobal put_pixels8_x2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -120,7 +120,7 @@ cglobal put_pixels16_x2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -162,7 +162,7 @@ cglobal put_no_rnd_pixels8_x2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET


 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ -194,7 +194,7 @@ cglobal put_pixels8_y2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -232,7 +232,7 @@ cglobal put_no_rnd_pixels8_y2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET


 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ -280,7 +280,7 @@ cglobal avg_pixels8_x2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -323,7 +323,7 @@ cglobal avg_pixels8_y2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -370,7 +370,7 @@ cglobal avg_approx_pixels8_xy2, 4,5
    add          r0, r4
    sub         r3d, 4
    jne .loop
-    REP_RET
+    RET


 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ -448,7 +448,7 @@ cglobal %1_pixels8_xy2, 4,5
    add         r4, r2
    sub        r3d, 2
    jnz .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -514,7 +514,7 @@ cglobal %1_pixels8_xy2, 4,5
    add         r4, r2
    sub        r3d, 2
    jnz .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX ssse3
--- a/libavcodec/x86/hpeldsp_vp3.asm
+++ b/libavcodec/x86/hpeldsp_vp3.asm
@ -60,7 +60,7 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
-    REP_RET
+    RET


 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@ -96,4 +96,4 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
-    REP_RET
+    RET
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@ -74,7 +74,7 @@ cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
-    REP_RET
+    RET


 ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@ -113,7 +113,7 @@ align 16
    movaps   [src1q+csizeq], m5
    add  csizeq, mmsize
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -153,7 +153,7 @@ align 16
    mova   [src0q+csizeq], m2
    add  csizeq, mmsize
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@ -229,7 +229,7 @@ cglobal add_bytes, 3,4,2, dst, src, w, size
    inc     wq
    jl .3
 .end:
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/lossless_videoencdsp.asm
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@ -110,7 +110,7 @@ cglobal diff_bytes, 4,5,2, dst, src1, src2, w
    inc               wq
        jl .loop_gpr_%1%2
 .end_%1%2:
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@ -458,7 +458,7 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8;
-    REP_RET                 ; return eax;
+    RET                 ; return eax;
 %endmacro

 INIT_MMX mmx
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@ -75,7 +75,7 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i
 .end_s:
    cmp                 iq, wq
    jl .loop_s
-    REP_RET
+    RET

 %macro ADD_PAETH_PRED_FN 1
 cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@ -81,7 +81,7 @@ cglobal %1_pixels4_l2, 6,6
    add          r2, 16
    sub         r5d, 4
    jne       .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -125,7 +125,7 @@ cglobal %1_pixels8_l2, 6,6
    add          r2, 32
    sub         r5d, 4
    jne       .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -171,7 +171,7 @@ cglobal %1_pixels16_l2, 6,6
    add          r2, 32
    sub         r5d, 2
    jne       .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@ -92,7 +92,7 @@ cglobal put_no_rnd_pixels8_l2, 6,6
    add          r2, 32
    sub         r5d, 4
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -161,7 +161,7 @@ cglobal put_no_rnd_pixels16_l2, 6,6
    add          r2, 32
    sub         r5d, 2
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -274,7 +274,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    add          r0, r2
    dec r4d
    jne .loop
-    REP_RET
+    RET
 %endmacro

 %macro PUT_OP 2-3
@ -357,7 +357,7 @@ cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    add          r0, r2
    dec r4d
    jne .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -466,7 +466,7 @@ cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    add    r0, r1
    dec r4d
    jne .loopv
-    REP_RET
+    RET
 %endmacro

 %macro PUT_OPH 2-3
@ -543,7 +543,7 @@ cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    add    r0, r1
    dec r4d
    jne .loopv
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@ -54,7 +54,7 @@ cglobal rv34_idct_dc_noround, 1, 2, 0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
-    REP_RET
+    RET

 ; Load coeffs and perform row transform
 ; Output: coeffs in mm[0467], rounder in mm5
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@ -170,7 +170,7 @@ cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height,
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 %macro FILTER_H  1
@ -227,7 +227,7 @@ cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, heigh
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM  sse2
@ -280,7 +280,7 @@ cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height,
    add     srcq, srcstrideq
    dec       heightd                          ; next row
    jg       .nextrow
-    REP_RET
+    RET

 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
 %ifdef PIC
@ -313,7 +313,7 @@ cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height,
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
@ -464,7 +464,7 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8
 .loop:
    MAIN_LOOP  %2, RND
    jnz        .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@ -208,7 +208,7 @@ cglobal sbr_sum64x5, 1,2,4,z
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
-    REP_RET
+    RET

 INIT_XMM sse
 cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
@ -227,7 +227,7 @@ cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    add               zq, 16
    cmp               zq, r2q
    jl             .loop
-    REP_RET
+    RET

 INIT_XMM sse
 cglobal sbr_neg_odd_64, 1,2,4,z
@ -248,7 +248,7 @@ cglobal sbr_neg_odd_64, 1,2,4,z
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
-    REP_RET
+    RET

 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
 INIT_XMM sse2
@ -276,7 +276,7 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge            .loop
-    REP_RET
+    RET

 INIT_XMM sse2
 cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@ -306,7 +306,7 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
    jge      .loop
    movq       m2, [zq]
    movq    [r2q], m2
-    REP_RET
+    RET

 %ifdef PIC
 %define NREGS 1
@ -432,7 +432,7 @@ cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
    sub        vq, mmsize
    add        cq, mmsize
    jl      .loop
-    REP_RET
+    RET

 %macro SBR_AUTOCORRELATE 0
 cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@ -43,7 +43,7 @@ cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    mova     [p2q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2
    jl .loop
-    REP_RET
+    RET

 cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    shl                     lengthd, 2
@ -60,7 +60,7 @@ cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    mova     [p1q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2
    jl .loop
-    REP_RET
+    RET

 cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    shl                     lengthd, 2
@ -87,7 +87,7 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    mova       [p2q+lengthq+mmsize], m4
    add                     lengthq, mmsize*2
    jl .loop
-    REP_RET
+    RET

 INIT_XMM sse4
 cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
@ -113,4 +113,4 @@ cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    mova      [p1q+lengthq], m1
    add             lengthq, mmsize
    jl .loop
-    REP_RET
+    RET
--- a/libavcodec/x86/utvideodsp.asm
+++ b/libavcodec/x86/utvideodsp.asm
@ -69,7 +69,7 @@ DEFINE_ARGS src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, x
    add        src_bq, linesize_bq
    sub        hd, 1
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -125,7 +125,7 @@ DEFINE_ARGS src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, x
    add        src_bq, linesize_bq
    sub        hd, 1
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@ -116,7 +116,7 @@ cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), src, y, u, v, w
    add wq, (mmsize*3)/8
    jl  .loop

-    REP_RET
+    RET
 %endmacro

 INIT_XMM ssse3
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@ -139,7 +139,7 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    add              dstq, 8
    dec                 i
        jnz         .loop
-    REP_RET
+    RET
 %undef rnd
 %undef shift
 %undef stride_neg2
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@ -433,4 +433,4 @@ cglobal prefetch, 3, 3, 0, buf, stride, h
    add      bufq, strideq
    dec        hd
    jg .loop
-    REP_RET
+    RET
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@ -200,7 +200,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET

 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
@ -230,7 +230,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET

 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
@ -268,7 +268,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
-    REP_RET
+    RET

 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
@ -314,7 +314,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_MMX ssse3
@ -368,7 +368,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
-    REP_RET
+    RET

 ; 4x4 block, H-only 6-tap filter
 INIT_MMX mmxext
@ -426,7 +426,7 @@ cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
-    REP_RET
+    RET

 INIT_XMM sse2
 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
@ -474,7 +474,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET

 INIT_XMM sse2
 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
@ -537,7 +537,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
-    REP_RET
+    RET

 %macro FILTER_V 1
 ; 4x4 block, V-only 4-tap filter
@ -590,7 +590,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
-    REP_RET
+    RET


 ; 4x4 block, V-only 6-tap filter
@ -655,7 +655,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -738,7 +738,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
-    REP_RET
+    RET

 %if cpuflag(ssse3)
 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
@ -815,7 +815,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
@ -838,7 +838,7 @@ cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
-    REP_RET
+    RET

 INIT_XMM sse
 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
@ -851,7 +851,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
--- a/libavfilter/x86/af_volume.asm
+++ b/libavfilter/x86/af_volume.asm
@ -56,7 +56,7 @@ cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
    mova  [dstq+lenq], m3
    sub       lenq, mmsize
    jge .loop
-    REP_RET
+    RET

 ;------------------------------------------------------------------------------
 ; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
@ -93,7 +93,7 @@ cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
 %endif
    sub            lenq, mmsize
    jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -137,4 +137,4 @@ cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
    mova  [dstq+lenq], m0
    sub       lenq, mmsize
    jge .loop
-    REP_RET
+    RET
--- a/libavfilter/x86/avf_showcqt.asm
+++ b/libavfilter/x86/avf_showcqt.asm
@ -127,7 +127,7 @@ cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_v
        lea     dstq, [dstq + 16]
        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
        jnz     .loop_k
-        REP_RET
+        RET
        align   16
        .check_loop_a:
        cmp     xd, [coeffsq + Coeffs.len]
@ -170,7 +170,7 @@ cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
        lea     dstq, [dstq + 8]
        lea     coeffsq, [coeffsq + Coeffs.sizeof]
        jnz     .loop_k
-        REP_RET
+        RET
 %endif ; ARCH_X86_64
 %endmacro ; DECLARE_CQT_CALC

--- a/libavfilter/x86/scene_sad.asm
+++ b/libavfilter/x86/scene_sad.asm
@ -53,7 +53,7 @@ cglobal scene_sad, 6, 7, 2, src1, stride1, src2, stride2, width, end, x

    mov         r0q, r6mp
    movu      [r0q], m1      ; sum
-REP_RET
+RET
 %endmacro


--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@ -63,7 +63,7 @@ cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end
    add          dstq, dst_linesizeq
    sub          endd, 1
    jg .nextrow
-REP_RET
+RET
 %endmacro

 %macro BLEND_SIMPLE 2-3 0
--- a/libavfilter/x86/vf_framerate.asm
+++ b/libavfilter/x86/vf_framerate.asm
@ -84,7 +84,7 @@ cglobal blend_frames%1, 5, 7, 5, src1, src1_linesize, src2, src2_linesize, dst,
    add      dstq, dst_linesizeq
    sub      endd, 1
    jg .nextrow
-REP_RET
+RET
 %endmacro


--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@ -64,7 +64,7 @@ cglobal gradfun_filter_line, 6, 6
    add       r0, 4
    jl .loop
 .end:
-    REP_RET
+    RET

 INIT_XMM ssse3
 cglobal gradfun_filter_line, 6, 6, 8
@ -78,7 +78,7 @@ cglobal gradfun_filter_line, 6, 6, 8
    FILTER_LINE m4
    add        r0, 8
    jl .loop
-    REP_RET
+    RET

 %macro BLUR_LINE 1
 cglobal gradfun_blur_line_%1, 6, 6, 8
@ -102,7 +102,7 @@ cglobal gradfun_blur_line_%1, 6, 6, 8
    mova   [r3+r0], m0
    add         r0, 16
    jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libavfilter/x86/vf_hqdn3d.asm
+++ b/libavfilter/x86/vf_hqdn3d.asm
@ -97,7 +97,7 @@ ALIGN 16
    inc    xq
    jl .loop
    je .loop2
-    REP_RET
+    RET
 %endmacro ; HQDN3D_ROW

 HQDN3D_ROW 8
--- a/libavfilter/x86/vf_interlace.asm
+++ b/libavfilter/x86/vf_interlace.asm
@ -73,7 +73,7 @@ SECTION .text
    jl .loop

 .end:
-    REP_RET
+    RET
 %endmacro

 %macro LOWPASS_LINE 0
@ -146,7 +146,7 @@ cglobal lowpass_line_complex, 5, 5, 8, dst, h, src, mref, pref
    add srcq, mmsize
    sub hd, mmsize
    jg .loop
-REP_RET
+RET

 cglobal lowpass_line_complex_12, 5, 5, 8, 16, dst, h, src, mref, pref, clip_max
    movd m7, DWORD clip_maxm
@ -208,7 +208,7 @@ cglobal lowpass_line_complex_12, 5, 5, 8, 16, dst, h, src, mref, pref, clip_max
    add srcq, 2*mmsize
    sub hd, mmsize
    jg .loop
-REP_RET
+RET
 %endmacro

 INIT_XMM sse2
--- a/libavfilter/x86/vf_maskedmerge.asm
+++ b/libavfilter/x86/vf_maskedmerge.asm
@ -81,4 +81,4 @@ cglobal maskedmerge8, 5, 7, 8, bsrc, osrc, msrc, dst, blinesize, w, x
    add          dstq, dlinesizeq
    sub         hd, 1
    jg .nextrow
-REP_RET
+RET
--- a/libavfilter/x86/vf_stereo3d.asm
+++ b/libavfilter/x86/vf_stereo3d.asm
@ -213,4 +213,4 @@ cglobal anaglyph, 3, 6, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, o, cnt
    add         rsrcq, r_linesizeq
    sub       heightd, 1
    jg .nextrow
-REP_RET
+RET
--- a/libavfilter/x86/vf_w3fdif.asm
+++ b/libavfilter/x86/vf_w3fdif.asm
@ -38,7 +38,7 @@ cglobal w3fdif_scale, 3, 3, 2, 0, out_pixel, work_pixel, linesize
    add                 work_pixelq, mmsize*2
    sub                   linesized, mmsize/2
    jg .loop
-REP_RET
+RET

 cglobal w3fdif_simple_low, 4, 5, 6, 0, work_line, in_lines_cur0, coef, linesize, offset
    movd                  m1, [coefq]
@ -63,7 +63,7 @@ cglobal w3fdif_simple_low, 4, 5, 6, 0, work_line, in_lines_cur0, coef, linesize,
    add                               offsetq, mmsize/2
    sub                             linesized, mmsize/2
    jg .loop
-REP_RET
+RET

 cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
    movq                  m0, [coefq]
@ -99,7 +99,7 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
    add                               offsetq, mmsize/2
    sub                             linesized, mmsize/2
    jg .loop
-REP_RET
+RET

 %if ARCH_X86_64
 cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
@ -179,7 +179,7 @@ cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
    add                               offsetq, mmsize/2
    sub                             linesized, mmsize/2
    jg .loop
-REP_RET
+RET

 %if ARCH_X86_64

@ -254,6 +254,6 @@ cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_ad
    add                               offsetq, mmsize/2
    sub                             linesized, mmsize/2
    jg .loop
-REP_RET
+RET

 %endif
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@ -48,7 +48,7 @@ ALIGN 16

    sub       lenq, 64
    jge       .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -141,7 +141,7 @@ cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
 %endif ; mmsize
    sub    lenq, 64
    jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -178,7 +178,7 @@ cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -233,7 +233,7 @@ cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -280,7 +280,7 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@ -323,7 +323,7 @@ cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; vector_fmul_add(float *dst, const float *src0, const float *src1,
@ -352,7 +352,7 @@ ALIGN 16

    sub     lenq,   2*mmsize
    jge     .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -401,7 +401,7 @@ ALIGN 16
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse
@ -585,4 +585,4 @@ cglobal butterflies_float, 3,3,3, src0, src1, len
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
-    REP_RET
+    RET
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@ -123,7 +123,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
    test    id, id
    jle .loop2x1
 .ret:
-    REP_RET
+    RET

 %macro UPDATE_LLS 0
 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
@ -240,7 +240,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
    cmp     id, countd
    jle .loop2x1
 .ret:
-    REP_RET
+    RET
 %endmacro ; UPDATE_LLS

 %if HAVE_AVX_EXTERNAL
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@ -85,7 +85,7 @@ pack_2ch_%2_to_%1_u_int %+ SUFFIX:
    add lenq, 2*mmsize/(2<<%4)
 %endif
        jl .next
-    REP_RET
+    RET
 %endmacro

 %macro UNPACK_2CH 5-7
@ -157,7 +157,7 @@ unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
    add lenq, mmsize/(1<<%4)
 %endif
        jl .next
-    REP_RET
+    RET
 %endmacro

 %macro CONV 5-7
@ -198,7 +198,7 @@ cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    emms
    RET
 %else
-    REP_RET
+    RET
 %endif
 %endmacro

@ -301,7 +301,7 @@ pack_6ch_%2_to_%1_u_int %+ SUFFIX:
    emms
    RET
 %else
-    REP_RET
+    RET
 %endif
 %endmacro

@ -375,7 +375,7 @@ unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
    add      dstq, mmsize
    sub      lend, mmsize/4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 %define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)
@ -525,7 +525,7 @@ pack_8ch_%2_to_%1_u_int %+ SUFFIX:
 %endif
    sub      lend, mmsize/4
    jg .loop
-    REP_RET
+    RET
 %endmacro

 %macro INT16_TO_INT32_N 6
--- a/libswresample/x86/rematrix.asm
+++ b/libswresample/x86/rematrix.asm
@ -68,7 +68,7 @@ mix_2_1_float_u_int %+ SUFFIX:
    mov%1  [outq + lenq + mmsize], m2
    add        lenq, mmsize*2
        jl .next
-    REP_RET
+    RET
 %endmacro

 %macro MIX1_FLT 1
@ -100,7 +100,7 @@ mix_1_1_float_u_int %+ SUFFIX:
    mov%1  [outq + lenq + mmsize], m1
    add        lenq, mmsize*2
        jl .next
-    REP_RET
+    RET
 %endmacro

 %macro MIX1_INT16 1
@ -152,7 +152,7 @@ mix_1_1_int16_u_int %+ SUFFIX:
    emms
    RET
 %else
-    REP_RET
+    RET
 %endif
 %endmacro

@ -218,7 +218,7 @@ mix_2_1_int16_u_int %+ SUFFIX:
    emms
    RET
 %else
-    REP_RET
+    RET
 %endif
 %endmacro

--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@ -207,7 +207,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
    mova    [dstq+wq], m0
    add            wq, mmsize
    jl .loop
-    REP_RET
+    RET
 %endif ; ARCH_X86_64 && %0 == 3
 %endmacro

@ -313,7 +313,7 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
    mova   [dstVq+wq], m2
    add            wq, mmsize
    jl .loop
-    REP_RET
+    RET
 %endif ; ARCH_X86_64 && %0 == 3
 %endmacro

@ -394,7 +394,7 @@ cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
    add            wq, 2
    jl .loop2
 .end:
-    REP_RET
+    RET
 %endif ; %0 == 3
 %endmacro

@ -491,7 +491,7 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
    add            wq, 2
    jl .loop2
 .end:
-    REP_RET
+    RET
 %endif ; ARCH_X86_64 && %0 == 3
 %endmacro

@ -543,7 +543,7 @@ RGB32_FUNCS 8, 12
    mova    [dstq+wq], m0
    add            wq, mmsize
    jl .loop_%1
-    REP_RET
+    RET
 %endmacro

 ; %1 = nr. of XMM registers
@ -599,7 +599,7 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
    movhps [dstVq+wq], m1
    add            wq, mmsize / 2
    jl .loop_%1
-    REP_RET
+    RET
 %endmacro

 ; %1 = nr. of XMM registers
@ -657,7 +657,7 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %endif ; nv12/21
    add            wq, mmsize
    jl .loop_%1
-    REP_RET
+    RET
 %endmacro

 ; %1 = nr. of XMM registers
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@ -297,7 +297,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
    test          dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
-    REP_RET
+    RET
 .unaligned:
    yuv2planeX_mainloop %1, u
 %endif ; mmsize == 8/16
@ -307,10 +307,10 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
    ADD             rsp, pad
    RET
 %else ; x86-64
-    REP_RET
+    RET
 %endif ; x86-32/64
 %else ; %1 == 9/10/16
-    REP_RET
+    RET
 %endif ; %1 == 8/9/10/16
 %endmacro

@ -433,10 +433,10 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
-    REP_RET
+    RET
 .unaligned:
    yuv2plane1_mainloop %1, u
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@ -357,7 +357,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
    add           wq, 2
 %endif ; %3 ==/!= X
    jl .loop
-    REP_RET
+    RET
 %endmacro

 ; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@ -144,7 +144,7 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
    cmp countq, wq
    jl .tail_loop
 .end:
-REP_RET
+RET
 %endmacro

 %if ARCH_X86_64
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@ -121,7 +121,7 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
    mov                  filterSizeq, filterq
    cmp                  offsetq, dstWq
    jb                  .outerloop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmxext
--- a/libswscale/x86/yuv_2_rgb.asm
+++ b/libswscale/x86/yuv_2_rgb.asm
@ -354,7 +354,7 @@ add imageq, 8 * depth * time_num
 add indexq, 4 * time_num
 js .loop0

-REP_RET
+RET

 %endmacro

--- a/tests/checkasm/x86/checkasm.asm
+++ b/tests/checkasm/x86/checkasm.asm
@ -234,7 +234,7 @@ cglobal checked_call%1, 1,7
 .emms_ok:
 %endif
    add  esp, max_args*4
-    REP_RET
+    RET
 %endmacro

 %endif ; ARCH_X86_64