diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 5ebad92604..b6a4b0ec24 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -2406,7 +2406,12 @@ typedef struct AVCodecContext {
      *
      * Decoders cannot use the buffer after returning from
      * avcodec_decode_audio4(), so they will not call release_buffer(), as it
-     * is assumed to be released immediately upon return.
+     * is assumed to be released immediately upon return. In some rare cases,
+     * a decoder may need to call get_buffer() more than once in a single
+     * call to avcodec_decode_audio4(). In that case, when get_buffer() is
+     * called again after it has already been called once, the previously
+     * acquired buffer is assumed to be released at that time and may not be
+     * reused by the decoder.
      *
      * As a convenience, av_samples_get_buffer_size() and
      * av_samples_fill_arrays() in libavutil may be used by custom get_buffer()
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index dcd2f159b7..e33c643b17 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -95,7 +95,8 @@ typedef struct MPADecodeContext {
 #   define MULH3(x, y, s) ((s)*(y)*(x))
 #   define MULLx(x, y, s) ((y)*(x))
 #   define RENAME(a) a ## _float
-#   define OUT_FMT   AV_SAMPLE_FMT_FLT
+#   define OUT_FMT   AV_SAMPLE_FMT_FLT
+#   define OUT_FMT_P AV_SAMPLE_FMT_FLTP
 #else
 #   define SHR(a,b)       ((a)>>(b))
 /* WARNING: only correct for positive numbers */
@@ -105,7 +106,8 @@ typedef struct MPADecodeContext {
 #   define MULH3(x, y, s) MULH((s)*(x), y)
 #   define MULLx(x, y, s) MULL(x,y,s)
 #   define RENAME(a) a ## _fixed
-#   define OUT_FMT   AV_SAMPLE_FMT_S16
+#   define OUT_FMT   AV_SAMPLE_FMT_S16
+#   define OUT_FMT_P AV_SAMPLE_FMT_S16P
 #endif
 
 /****************/
@@ -441,7 +443,11 @@ static av_cold int decode_init(AVCodecContext * avctx)
     ff_mpadsp_init(&s->mpadsp);
     ff_dsputil_init(&s->dsp, avctx);
 
-    avctx->sample_fmt= OUT_FMT;
+    if (avctx->request_sample_fmt == OUT_FMT &&
+        avctx->codec_id != AV_CODEC_ID_MP3ON4)
+        avctx->sample_fmt = OUT_FMT;
+    else
+        avctx->sample_fmt = OUT_FMT_P;
     s->err_recognition = avctx->err_recognition;
 
     if (avctx->codec_id == AV_CODEC_ID_MP3ADU)
@@ -1564,7 +1570,7 @@ static int mp_decode_layer3(MPADecodeContext *s)
     return nb_granules * 18;
 }
 
-static int mp_decode_frame(MPADecodeContext *s, OUT_INT *samples,
+static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
                            const uint8_t *buf, int buf_size)
 {
     int i, nb_frames, ch, ret;
@@ -1627,20 +1633,26 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT *samples,
             av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
             return ret;
         }
-        samples = (OUT_INT *)s->frame.data[0];
+        samples = (OUT_INT **)s->frame.extended_data;
     }
 
     /* apply the synthesis filter */
     for (ch = 0; ch < s->nb_channels; ch++) {
-        samples_ptr = samples + ch;
+        int sample_stride;
+        if (s->avctx->sample_fmt == OUT_FMT_P) {
+            samples_ptr   = samples[ch];
+            sample_stride = 1;
+        } else {
+            samples_ptr   = samples[0] + ch;
+            sample_stride = s->nb_channels;
+        }
         for (i = 0; i < nb_frames; i++) {
-            RENAME(ff_mpa_synth_filter)(
-                &s->mpadsp,
-                s->synth_buf[ch], &(s->synth_buf_offset[ch]),
-                RENAME(ff_mpa_synth_window), &s->dither_state,
-                samples_ptr, s->nb_channels,
-                s->sb_samples[ch][i]);
-            samples_ptr += 32 * s->nb_channels;
+            RENAME(ff_mpa_synth_filter)(&s->mpadsp, s->synth_buf[ch],
+                                        &(s->synth_buf_offset[ch]),
+                                        RENAME(ff_mpa_synth_window),
+                                        &s->dither_state, samples_ptr,
+                                        sample_stride, s->sb_samples[ch][i]);
+            samples_ptr += 32 * sample_stride;
         }
     }
 
@@ -1789,7 +1801,6 @@ typedef struct MP3On4DecodeContext {
     int syncword;                   ///< syncword patch
     const uint8_t *coff;            ///< channel offsets in output buffer
     MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance
-    OUT_INT *decoded_buf;           ///< output buffer for decoded samples
 } MP3On4DecodeContext;
 
 #include "mpeg4audio.h"
@@ -1831,8 +1842,6 @@ static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
     for (i = 0; i < s->frames; i++)
         av_free(s->mp3decctx[i]);
 
-    av_freep(&s->decoded_buf);
-
     return 0;
 }
 
@@ -1893,14 +1902,6 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
     }
 
-    /* Allocate buffer for multi-channel output if needed */
-    if (s->frames > 1) {
-        s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS *
-                                   sizeof(*s->decoded_buf));
-        if (!s->decoded_buf)
-            goto alloc_fail;
-    }
-
     return 0;
 alloc_fail:
     decode_close_mp3on4(avctx);
@@ -1927,9 +1928,9 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
     MPADecodeContext *m;
     int fsize, len = buf_size, out_size = 0;
     uint32_t header;
-    OUT_INT *out_samples;
-    OUT_INT *outptr, *bp;
-    int fr, j, n, ch, ret;
+    OUT_INT **out_samples;
+    OUT_INT *outptr[2];
+    int fr, ch, ret;
 
     /* get output buffer */
     s->frame->nb_samples = s->frames * MPA_FRAME_SIZE;
@@ -1937,15 +1938,12 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return ret;
     }
-    out_samples = (OUT_INT *)s->frame->data[0];
+    out_samples = (OUT_INT **)s->frame->extended_data;
 
     // Discard too short frames
     if (buf_size < HEADER_SIZE)
         return AVERROR_INVALIDDATA;
 
-    // If only one decoder interleave is not needed
-    outptr = s->frames == 1 ? out_samples : s->decoded_buf;
-
     avctx->bit_rate = 0;
 
     ch = 0;
@@ -1973,6 +1971,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         }
         ch += m->nb_channels;
 
+        outptr[0] = out_samples[s->coff[fr]];
+        if (m->nb_channels > 1)
+            outptr[1] = out_samples[s->coff[fr] + 1];
+
         if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0)
             return ret;
 
@@ -1980,23 +1982,6 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         out_size += ret;
         buf += fsize;
         len -= fsize;
 
-        if (s->frames > 1) {
-            n = m->avctx->frame_size*m->nb_channels;
-            /* interleave output data */
-            bp = out_samples + s->coff[fr];
-            if (m->nb_channels == 1) {
-                for (j = 0; j < n; j++) {
-                    *bp = s->decoded_buf[j];
-                    bp += avctx->channels;
-                }
-            } else {
-                for (j = 0; j < n; j++) {
-                    bp[0] = s->decoded_buf[j++];
-                    bp[1] = s->decoded_buf[j];
-                    bp += avctx->channels;
-                }
-            }
-        }
-
         avctx->bit_rate += m->bit_rate;
     }
@@ -2023,6 +2008,9 @@ AVCodec ff_mp1_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP1 (MPEG audio layer 1)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                   AV_SAMPLE_FMT_S16,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP2_DECODER
@@ -2036,6 +2024,9 @@ AVCodec ff_mp2_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                   AV_SAMPLE_FMT_S16,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3_DECODER
@@ -2049,6 +2040,9 @@ AVCodec ff_mp3_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP3 (MPEG audio layer 3)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                   AV_SAMPLE_FMT_S16,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3ADU_DECODER
@@ -2062,6 +2056,9 @@ AVCodec ff_mp3adu_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("ADU (Application Data Unit) MP3 (MPEG audio layer 3)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                   AV_SAMPLE_FMT_S16,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3ON4_DECODER
@@ -2076,6 +2073,8 @@ AVCodec ff_mp3on4_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush_mp3on4,
     .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #endif
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index e26a0a9ad9..17a7757f95 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -33,6 +33,9 @@ AVCodec ff_mp1float_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP1 (MPEG audio layer 1)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                   AV_SAMPLE_FMT_FLT,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP2FLOAT_DECODER
@@ -46,6 +49,9 @@ AVCodec ff_mp2float_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                   AV_SAMPLE_FMT_FLT,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3FLOAT_DECODER
@@ -59,6 +65,9 @@ AVCodec ff_mp3float_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("MP3 (MPEG audio layer 3)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                   AV_SAMPLE_FMT_FLT,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3ADUFLOAT_DECODER
@@ -72,6 +81,9 @@ AVCodec ff_mp3adufloat_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush,
     .long_name = NULL_IF_CONFIG_SMALL("ADU (Application Data Unit) MP3 (MPEG audio layer 3)"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                   AV_SAMPLE_FMT_FLT,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
 #if CONFIG_MP3ON4FLOAT_DECODER
@@ -86,5 +98,7 @@ AVCodec ff_mp3on4float_decoder = {
     .capabilities = CODEC_CAP_DR1,
     .flush = flush_mp3on4,
     .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
+    .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                   AV_SAMPLE_FMT_NONE },
 };
 #endif
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 6c78adf053..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -157,7 +157,7 @@ QPEL8_H_LOWPASS_OP put
 QPEL8_H_LOWPASS_OP avg
 
 %macro QPEL8_H_LOWPASS_OP_XMM 1
-cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride
+cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
     movsxdifnidn r2, r2d
     movsxdifnidn r3, r3d
     mov r4d, 8
@@ -312,7 +312,7 @@
 QPEL8_H_LOWPASS_L2_OP avg
 
 %macro QPEL8_H_LOWPASS_L2_OP_XMM 1
-cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,7 ; dst, src, src2, dstStride, src2Stride
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
     movsxdifnidn r3, r3d
     movsxdifnidn r4, r4d
     mov r5d, 8
@@ -415,13 +415,13 @@ QPEL4_V_LOWPASS_OP avg
 
 %macro QPEL8OR16_V_LOWPASS_OP 1
 %if cpuflag(sse2)
-cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, h
+cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
     movsxdifnidn r2, r2d
     movsxdifnidn r3, r3d
     sub r1, r3
     sub r1, r3
 %else
-cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,7 ; dst, src, dstStride, srcStride, h
+cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
     movsxdifnidn r2, r2d
     movsxdifnidn r3, r3d
 %endif
@@ -543,7 +543,7 @@ QPEL4_HV1_LOWPASS_OP put
 QPEL4_HV1_LOWPASS_OP avg
 
 %macro QPEL8OR16_HV1_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,7 ; src, tmp, srcStride, size
+cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
     movsxdifnidn r2, r2d
     pxor m7, m7
     movh m0, [r0]
@@ -635,7 +635,7 @@ QPEL8OR16_HV2_LOWPASS_OP put
 QPEL8OR16_HV2_LOWPASS_OP avg
 
 %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
-cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,7 ; dst, tmp, dstStride, tmpStride, size
+cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp r4d, 16
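
Note for callers (not part of the patch itself): with this change the mp1/mp2/mp3 decoders
default to planar output, i.e. frame.extended_data[ch] holds a separate plane per channel,
and they only keep producing packed samples when avctx->request_sample_fmt is set to the
packed format before avcodec_open2() (MP3onMP4 is always planar). A caller that still needs
interleaved samples therefore has to repack the frame itself. The sketch below shows one way
to do that; the helper name interleave_frame and its buffer handling are illustrative
assumptions, while av_sample_fmt_is_planar(), av_get_bytes_per_sample() and
AVFrame.extended_data are existing libavutil/libavcodec API.

/*
 * Illustrative caller-side helper (assumption, not part of this patch):
 * repack one decoded AVFrame into an interleaved buffer, regardless of
 * whether the decoder produced packed (S16/FLT) or planar (S16P/FLTP)
 * samples.
 */
#include <stdint.h>
#include <string.h>

#include "libavcodec/avcodec.h"
#include "libavutil/error.h"
#include "libavutil/samplefmt.h"

static int interleave_frame(const AVCodecContext *avctx, const AVFrame *frame,
                            uint8_t *dst, int dst_size)
{
    int bps      = av_get_bytes_per_sample(avctx->sample_fmt);
    int channels = avctx->channels;
    int needed   = frame->nb_samples * channels * bps;
    int i, ch;

    if (bps <= 0 || needed > dst_size)
        return AVERROR(EINVAL);

    if (!av_sample_fmt_is_planar(avctx->sample_fmt)) {
        /* packed: all channels are already interleaved in extended_data[0] */
        memcpy(dst, frame->extended_data[0], needed);
    } else {
        /* planar: extended_data[ch] is a separate plane per channel,
         * so copy sample by sample into interleaved order */
        for (i = 0; i < frame->nb_samples; i++)
            for (ch = 0; ch < channels; ch++)
                memcpy(dst + (i * channels + ch) * bps,
                       frame->extended_data[ch] + i * bps, bps);
    }
    return needed;
}

Callers that prefer to avoid the extra copy can instead request the packed layout up front,
e.g. avctx->request_sample_fmt = AV_SAMPLE_FMT_S16 (or AV_SAMPLE_FMT_FLT for the float
decoders) before avcodec_open2(), which with this patch keeps the old interleaved output
path for everything except MP3onMP4.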