diff --git a/audio/decode/ad.h b/audio/decode/ad.h
index 30e739d135..6c76e8dfd0 100644
--- a/audio/decode/ad.h
+++ b/audio/decode/ad.h
@@ -24,6 +24,7 @@
 #include "demux/demux.h"
 
 #include "audio/format.h"
+#include "audio/audio.h"
 
 struct mp_decoder_list;
 
@@ -35,8 +36,7 @@ struct ad_functions {
     int (*init)(sh_audio_t *sh, const char *decoder);
     void (*uninit)(sh_audio_t *sh);
     int (*control)(sh_audio_t *sh, int cmd, void *arg);
-    int (*decode_audio)(sh_audio_t *sh, unsigned char *buffer, int minlen,
-                        int maxlen);
+    int (*decode_audio)(sh_audio_t *sh, struct mp_audio *buffer, int maxlen);
 };
 
 enum ad_ctrl {
diff --git a/audio/decode/ad_lavc.c b/audio/decode/ad_lavc.c
index 1e63f0c3f2..c42c430850 100644
--- a/audio/decode/ad_lavc.c
+++ b/audio/decode/ad_lavc.c
@@ -36,25 +36,20 @@
 #include "mpvcore/av_opts.h"
 
 #include "ad.h"
-#include "audio/reorder_ch.h"
 #include "audio/fmt-conversion.h"
 
-#include "compat/mpbswap.h"
 #include "compat/libav.h"
 
 struct priv {
     AVCodecContext *avctx;
     AVFrame *avframe;
-    uint8_t *output;
-    uint8_t *output_packed; // used by deplanarize to store packed audio samples
-    int output_left;
-    int unitsize;
+    struct mp_audio frame;
     bool force_channel_map;
     struct demux_packet *packet;
 };
 
 static void uninit(sh_audio_t *sh);
-static int decode_audio(sh_audio_t *sh,unsigned char *buffer,int minlen,int maxlen);
+static int decode_new_packet(struct sh_audio *sh);
 
 #define OPT_BASE_STRUCT struct MPOpts
 
@@ -150,22 +145,21 @@ static int preinit(sh_audio_t *sh)
     return 1;
 }
 
-/* Prefer playing audio with the samplerate given in container data
- * if available, but take number the number of channels and sample format
- * from the codec, since if the codec isn't using the correct values for
- * those everything breaks anyway.
- */
-static int setup_format(sh_audio_t *sh_audio,
-                        const AVCodecContext *lavc_context)
+static int setup_format(sh_audio_t *sh_audio)
 {
     struct priv *priv = sh_audio->context;
-    int sample_format        =
-        af_from_avformat(av_get_packed_sample_fmt(lavc_context->sample_fmt));
-    int samplerate           = lavc_context->sample_rate;
-    // If not set, try container samplerate
+    AVCodecContext *lavc_context = priv->avctx;
+
+    int sample_format = af_from_avformat(lavc_context->sample_fmt);
+    if (!sample_format)
+        return -1;
+
+    int samplerate = lavc_context->sample_rate;
     if (!samplerate && sh_audio->wf) {
+        // If not set, try container samplerate.
+        // (Maybe this can't happen, and it's an artifact from the past.)
         samplerate = sh_audio->wf->nSamplesPerSec;
-        mp_tmsg(MSGT_DECAUDIO, MSGL_V, "ad_lavc: using container rate.\n");
+        mp_tmsg(MSGT_DECAUDIO, MSGL_WARN, "ad_lavc: using container rate.\n");
     }
 
     struct mp_chmap lavc_chmap;
@@ -178,14 +172,9 @@ static int setup_format(sh_audio_t *sh_audio,
             lavc_chmap = sh_audio->channels;
     }
 
-    if (!mp_chmap_equals(&lavc_chmap, &sh_audio->channels) ||
-        samplerate != sh_audio->samplerate ||
-        sample_format != sh_audio->sample_format) {
-        sh_audio->channels = lavc_chmap;
-        sh_audio->samplerate = samplerate;
-        sh_audio->sample_format = sample_format;
-        return 1;
-    }
+    sh_audio->channels = lavc_chmap;
+    sh_audio->samplerate = samplerate;
+    sh_audio->sample_format = sample_format;
     return 0;
 }
 
@@ -285,15 +274,12 @@ static int init(sh_audio_t *sh_audio, const char *decoder)
     mp_msg(MSGT_DECAUDIO, MSGL_V, "INFO: libavcodec \"%s\" init OK!\n",
            lavc_codec->name);
 
-    // Decode at least 1 byte:  (to get header filled)
-    for (int tries = 0;;) {
-        int x = decode_audio(sh_audio, sh_audio->a_buffer, 1,
-                             sh_audio->a_buffer_size);
-        if (x > 0) {
-            sh_audio->a_buffer_len = x;
+    // Decode at least 1 sample:  (to get header filled)
+    for (int tries = 1; ; tries++) {
+        int x = decode_new_packet(sh_audio);
+        if (x >= 0 && ctx->frame.samples > 0)
             break;
-        }
-        if (++tries >= 5) {
+        if (tries >= 5) {
             mp_msg(MSGT_DECAUDIO, MSGL_ERR,
                    "ad_lavc: initial decode failed\n");
             uninit(sh_audio);
@@ -305,12 +291,6 @@ static int init(sh_audio_t *sh_audio, const char *decoder)
     if (sh_audio->wf && sh_audio->wf->nAvgBytesPerSec)
         sh_audio->i_bps = sh_audio->wf->nAvgBytesPerSec;
 
-    int af_sample_fmt =
-        af_from_avformat(av_get_packed_sample_fmt(lavc_context->sample_fmt));
-    if (af_sample_fmt == AF_FORMAT_UNKNOWN) {
-        uninit(sh_audio);
-        return 0;
-    }
     return 1;
 }
 
@@ -338,7 +318,7 @@ static int control(sh_audio_t *sh, int cmd, void *arg)
     switch (cmd) {
     case ADCTRL_RESYNC_STREAM:
         avcodec_flush_buffers(ctx->avctx);
-        ctx->output_left = 0;
+        ctx->frame.samples = 0;
         talloc_free(ctx->packet);
         ctx->packet = NULL;
         return CONTROL_TRUE;
@@ -346,29 +326,13 @@ static int control(sh_audio_t *sh, int cmd, void *arg)
     return CONTROL_UNKNOWN;
 }
 
-static av_always_inline void deplanarize(struct sh_audio *sh)
-{
-    struct priv *priv = sh->context;
-
-    uint8_t **planes  = priv->avframe->extended_data;
-    size_t bps        = av_get_bytes_per_sample(priv->avctx->sample_fmt);
-    size_t nb_samples = priv->avframe->nb_samples;
-    size_t channels   = priv->avctx->channels;
-    size_t size       = bps * nb_samples * channels;
-
-    if (talloc_get_size(priv->output_packed) != size)
-        priv->output_packed =
-            talloc_realloc_size(priv, priv->output_packed, size);
-
-    reorder_to_packed(priv->output_packed, planes, bps, channels, nb_samples);
-
-    priv->output = priv->output_packed;
-}
-
 static int decode_new_packet(struct sh_audio *sh)
 {
     struct priv *priv = sh->context;
     AVCodecContext *avctx = priv->avctx;
+
+    priv->frame.samples = 0;
+
     struct demux_packet *mpkt = priv->packet;
     if (!mpkt)
         mpkt = demux_read_packet(sh->gsh);
@@ -384,7 +348,7 @@ static int decode_new_packet(struct sh_audio *sh)
 
     if (mpkt->pts != MP_NOPTS_VALUE) {
         sh->pts = mpkt->pts;
-        sh->pts_bytes = 0;
+        sh->pts_offset = 0;
     }
     int got_frame = 0;
     int ret = avcodec_decode_audio4(avctx, priv->avframe, &got_frame, &pkt);
@@ -409,58 +373,39 @@ static int decode_new_packet(struct sh_audio *sh)
     }
     if (!got_frame)
         return 0;
-    uint64_t unitsize = (uint64_t)av_get_bytes_per_sample(avctx->sample_fmt) *
-                        avctx->channels;
-    if (unitsize > 100000)
-        abort();
-    priv->unitsize = unitsize;
-    uint64_t output_left = unitsize * priv->avframe->nb_samples;
-    if (output_left > 500000000)
-        abort();
-    priv->output_left = output_left;
-    if (av_sample_fmt_is_planar(avctx->sample_fmt) && avctx->channels > 1) {
-        deplanarize(sh);
-    } else {
-        priv->output = priv->avframe->data[0];
-    }
-    mp_dbg(MSGT_DECAUDIO, MSGL_DBG2, "Decoded %d -> %d  \n", in_len,
-           priv->output_left);
+
+    if (setup_format(sh) < 0)
+        return -1;
+
+    priv->frame.samples = priv->avframe->nb_samples;
+    mp_audio_set_format(&priv->frame, sh->sample_format);
+    mp_audio_set_channels(&priv->frame, &sh->channels);
+    priv->frame.rate = sh->samplerate;
+    // extended_data is valid for any plane count; data[] is a fixed array.
+    for (int n = 0; n < priv->frame.num_planes; n++)
+        priv->frame.planes[n] = priv->avframe->extended_data[n];
+    mp_dbg(MSGT_DECAUDIO, MSGL_DBG2, "Decoded %d -> %d samples\n", in_len,
+           priv->frame.samples);
     return 0;
 }
 
-
-static int decode_audio(sh_audio_t *sh_audio, unsigned char *buf, int minlen,
-                        int maxlen)
+static int decode_audio(sh_audio_t *sh, struct mp_audio *buffer, int maxlen)
 {
-    struct priv *priv = sh_audio->context;
-    AVCodecContext *avctx = priv->avctx;
+    struct priv *priv = sh->context;
 
-    int len = -1;
-    while (len < minlen) {
-        if (!priv->output_left) {
-            if (decode_new_packet(sh_audio) < 0)
-                break;
-            continue;
-        }
-        if (setup_format(sh_audio, avctx))
-            return len;
-        int size = (minlen - len + priv->unitsize - 1);
-        size -= size % priv->unitsize;
-        size = FFMIN(size, priv->output_left);
-        if (size > maxlen)
-            abort();
-        memcpy(buf, priv->output, size);
-        priv->output += size;
-        priv->output_left -= size;
-        if (len < 0)
-            len = size;
-        else
-            len += size;
-        buf += size;
-        maxlen -= size;
-        sh_audio->pts_bytes += size;
+    if (!priv->frame.samples) { // nothing left over from the last frame
+        if (decode_new_packet(sh) < 0)
+            return -1;
     }
-    return len;
+
+    if (!mp_audio_config_equals(buffer, &priv->frame))
+        return 0; // NOTE(review): presumably a format change — confirm
+
+    buffer->samples = MPMIN(priv->frame.samples, maxlen); // both in samples
+    mp_audio_copy(buffer, 0, &priv->frame, 0, buffer->samples);
+    mp_audio_skip_samples(&priv->frame, buffer->samples);
+    sh->pts_offset += buffer->samples; // offset is counted in samples
+    return 0;
 }
 
 static void add_decoders(struct mp_decoder_list *list)
diff --git a/audio/decode/ad_mpg123.c b/audio/decode/ad_mpg123.c
index 609e68f1c8..322f45826f 100644
--- a/audio/decode/ad_mpg123.c
+++ b/audio/decode/ad_mpg123.c
@@ -35,7 +35,9 @@
 
 struct ad_mpg123_context {
     mpg123_handle *handle;
-    char new_format;
+    bool new_format;
+    int sample_size;
+    bool need_data;
     /* Running mean for bit rate, stream length estimation. */
     float mean_rate;
     unsigned int mean_count;
@@ -45,6 +47,17 @@ struct ad_mpg123_context {
     char vbr;
 };
 
+static void uninit(sh_audio_t *sh)
+{
+    struct ad_mpg123_context *con = (struct ad_mpg123_context*) sh->context;
+    /* NOTE(review): handle may be NULL when preinit() fails — confirm. */
+    mpg123_close(con->handle);
+    mpg123_delete(con->handle);
+    talloc_free(sh->context);
+    sh->context = NULL;
+    mpg123_exit(); /* pairs with mpg123_init() in preinit() */
+}
+
 /* This initializes libmpg123 and prepares the handle, including funky
  * parameters. */
 static int preinit(sh_audio_t *sh)
@@ -58,7 +71,7 @@ static int preinit(sh_audio_t *sh)
     if (mpg123_init() != MPG123_OK)
         return 0;
 
-    sh->context = malloc(sizeof(struct ad_mpg123_context));
+    sh->context = talloc_zero(NULL, struct ad_mpg123_context);
     con = sh->context;
     /* Auto-choice of optimized decoder (first argument NULL). */
     con->handle = mpg123_new(NULL, &err);
@@ -92,7 +105,12 @@ static int preinit(sh_audio_t *sh)
 
     /* Prevent funky automatic resampling.
      * This way, we can be sure that one frame will never produce
-     * more than 1152 stereo samples. */
+     * more than 1152 stereo samples.
+     * Background:
+     * Going to decode directly to the output buffer. It is important to have
+     * MPG123_AUTO_RESAMPLE disabled for the buffer size being an all-time
+     * limit.
+     * We need at least 1152 samples. dec_audio.c normally guarantees this. */
     mpg123_param(con->handle, MPG123_REMOVE_FLAGS, MPG123_AUTO_RESAMPLE, 0.);
 
     return 1;
@@ -105,11 +123,125 @@ static int preinit(sh_audio_t *sh)
         mp_msg(MSGT_DECAUDIO, MSGL_ERR, "mpg123 preinit error: %s\n",
                mpg123_strerror(con->handle));
 
-    if (con->handle)
-        mpg123_delete(con->handle);
-    mpg123_exit();
-    free(sh->context);
-    sh->context = NULL;
+    uninit(sh);
+    return 0;
+}
+
+/* libmpg123 has a new format ready; query and store it. Returns the result of
+   mpg123_getformat(), or MPG123_ERR if the reported encoding is unknown. */
+static int set_format(sh_audio_t *sh)
+{
+    struct ad_mpg123_context *con = sh->context;
+    int ret;
+    long rate;
+    int channels;
+    int encoding;
+    ret = mpg123_getformat(con->handle, &rate, &channels, &encoding);
+    if (ret == MPG123_OK) {
+        mp_chmap_from_channels(&sh->channels, channels);
+        sh->samplerate = rate;
+        /* Without external force, mpg123 will always choose signed encoding,
+         * and non-16-bit only on builds that don't support it.
+         * Be reminded that it doesn't matter to the MPEG file what encoding
+         * is produced from it. */
+        switch (encoding) {
+        case MPG123_ENC_SIGNED_8:
+            sh->sample_format = AF_FORMAT_S8;
+            break;
+        case MPG123_ENC_SIGNED_16:
+            sh->sample_format = AF_FORMAT_S16_NE;
+            break;
+        case MPG123_ENC_SIGNED_32:
+            sh->sample_format = AF_FORMAT_S32_NE;
+            break;
+        case MPG123_ENC_FLOAT_32:
+            sh->sample_format = AF_FORMAT_FLOAT_NE;
+            break;
+        default:
+            /* This means we got a funny custom build of libmpg123 that only supports an unknown format. */
+            mp_msg(MSGT_DECAUDIO, MSGL_ERR,
+                   "Bad encoding from mpg123: %i.\n", encoding);
+            return MPG123_ERR;
+        }
+        con->sample_size = channels * (af_fmt2bits(sh->sample_format) / 8);
+        con->new_format = 0;
+    }
+    return ret;
+}
+
+static int feed_new_packet(sh_audio_t *sh)
+{
+    struct ad_mpg123_context *con = sh->context;
+    int ret;
+    /* Feed one demuxer packet; returns 0, or -1 on EOF/feed error. */
+    struct demux_packet *pkt = demux_read_packet(sh->gsh);
+    if (!pkt)
+        return -1; /* EOF. */
+
+    /* Next bytes from that presentation time. */
+    if (pkt->pts != MP_NOPTS_VALUE) {
+        sh->pts        = pkt->pts;
+        sh->pts_offset = 0;
+    }
+
+    /* Have to use mpg123_feed() to avoid decoding here. */
+    ret = mpg123_feed(con->handle, pkt->buffer, pkt->len);
+    talloc_free(pkt);
+
+    if (ret == MPG123_ERR)
+        return -1;
+
+    if (ret == MPG123_NEW_FORMAT)
+        con->new_format = 1; /* picked up by decode_audio() */
+
+    return 0;
+}
+
+/* Now we really start accessing some data and determining file format.
+ * Format now is allowed to change on-the-fly. Here is the only point
+ * that has MPlayer react to errors. We have to pray that exceptional
+ * errors in other places simply cannot occur. */
+static int init(sh_audio_t *sh, const char *decoder)
+{
+    struct ad_mpg123_context *con = sh->context;
+    int ret;
+
+    ret = mpg123_open_feed(con->handle);
+    if (ret != MPG123_OK)
+        goto fail;
+
+    for (int n = 0; ; n++) {
+        if (feed_new_packet(sh) < 0) {
+            ret = MPG123_NEED_MORE; // reported as "no mp3 stream" below
+            goto fail;
+        }
+        size_t got_now = 0;
+        ret = mpg123_decode_frame(con->handle, NULL, NULL, &got_now);
+        if (ret == MPG123_OK || ret == MPG123_NEW_FORMAT) {
+            ret = set_format(sh);
+            if (ret == MPG123_OK)
+                break;
+        }
+        if (ret != MPG123_NEED_MORE)
+            goto fail;
+        // max. 16 retries (randomly chosen number)
+        if (n > 16) {
+            ret = MPG123_NEED_MORE;
+            goto fail;
+        }
+    }
+
+    return 1;
+
+fail:
+    if (ret == MPG123_NEED_MORE) {
+        mp_msg(MSGT_DECAUDIO, MSGL_ERR, "Could not find mp3 stream.\n");
+    } else {
+        mp_msg(MSGT_DECAUDIO, MSGL_ERR, "mpg123 init error: %s\n",
+               mpg123_strerror(con->handle));
+    }
+
+    uninit(sh); // frees the handle and sh->context
     return 0;
 }
 
@@ -126,274 +258,17 @@ static int compute_bitrate(struct mpg123_frameinfo *i)
                   samples_per_frame[i->version][i->layer] + 0.5);
 }
 
-/* Opted against the header printout from old mp3lib, too much
- * irrelevant info. This is modelled after the mpg123 app's
- * standard output line.
- * If more verbosity is demanded, one can add more detail and
- * also throw in ID3v2 info which libmpg123 collects anyway. */
-static void print_header_compact(struct mpg123_frameinfo *i)
-{
-    static const char *smodes[5] = {
-        "stereo", "joint-stereo", "dual-channel", "mono", "invalid"
-    };
-    static const char *layers[4] = {
-        "Unknown", "I", "II", "III"
-    };
-    static const char *versions[4] = {
-        "1.0", "2.0", "2.5", "x.x"
-    };
-
-    mp_msg(MSGT_DECAUDIO, MSGL_V, "MPEG %s layer %s, ",
-           versions[i->version], layers[i->layer]);
-    switch (i->vbr) {
-    case MPG123_CBR:
-        if (i->bitrate)
-            mp_msg(MSGT_DECAUDIO, MSGL_V, "%d kbit/s", i->bitrate);
-        else
-            mp_msg(MSGT_DECAUDIO, MSGL_V, "%d kbit/s (free format)",
-                   compute_bitrate(i));
-        break;
-    case MPG123_VBR:
-        mp_msg(MSGT_DECAUDIO, MSGL_V, "VBR");
-        break;
-    case MPG123_ABR:
-        mp_msg(MSGT_DECAUDIO, MSGL_V, "%d kbit/s ABR", i->abr_rate);
-        break;
-    default:
-        mp_msg(MSGT_DECAUDIO, MSGL_V, "???");
-    }
-    mp_msg(MSGT_DECAUDIO, MSGL_V, ", %ld Hz %s\n", i->rate,
-           smodes[i->mode]);
-}
-
-/* libmpg123 has a new format ready; query and store, return return value
-   of mpg123_getformat() */
-static int set_format(sh_audio_t *sh, struct ad_mpg123_context *con)
-{
-    int ret;
-    long rate;
-    int channels;
-    int encoding;
-    ret = mpg123_getformat(con->handle, &rate, &channels, &encoding);
-    if(ret == MPG123_OK) {
-        mp_chmap_from_channels(&sh->channels, channels);
-        sh->samplerate = rate;
-        /* Without external force, mpg123 will always choose signed encoding,
-         * and non-16-bit only on builds that don't support it.
-         * Be reminded that it doesn't matter to the MPEG file what encoding
-         * is produced from it. */
-        switch (encoding) {
-        case MPG123_ENC_SIGNED_8:
-            sh->sample_format = AF_FORMAT_S8;
-            break;
-        case MPG123_ENC_SIGNED_16:
-            sh->sample_format = AF_FORMAT_S16_NE;
-            break;
-        /* To stay compatible with the oldest libmpg123 headers, do not rely
-         * on float and 32 bit encoding symbols being defined.
-         * Those formats came later */
-        case 0x1180: /* MPG123_ENC_SIGNED_32 */
-            sh->sample_format = AF_FORMAT_S32_NE;
-            break;
-        case 0x200: /* MPG123_ENC_FLOAT_32 */
-            sh->sample_format = AF_FORMAT_FLOAT_NE;
-            break;
-        default:
-            /* This means we got a funny custom build of libmpg123 that only supports an unknown format. */
-            mp_msg(MSGT_DECAUDIO, MSGL_ERR,
-                   "Bad encoding from mpg123: %i.\n", encoding);
-            return MPG123_ERR;
-        }
-        /* Going to decode directly to MPlayer's memory. It is important
-         * to have MPG123_AUTO_RESAMPLE disabled for the buffer size
-         * being an all-time limit. */
-        sh->audio_out_minsize = 1152 * 2 * (af_fmt2bits(sh->sample_format) / 8);
-        con->new_format = 0;
-    }
-    return ret;
-}
-
-/* This tries to extract a requested amount of decoded data.
- * Even when you request 0 bytes, it will feed enough input so that
- * the decoder _could_ have delivered something.
- * Returns byte count >= 0, -1 on error.
- *
- * Thoughts on exact pts keeping:
- * We have to assume that MPEG frames are cut in pieces by packet boundaries.
- * Also, it might be possible that the first packet does not contain enough
- * data to ensure initial stream sync... or re-sync on erroneous streams.
- * So we need something robust to relate the decoded byte count to the correct
- * time stamp. This is tricky, though. From the outside, you cannot tell if,
- * after having fed two packets until the first output arrives, one should
- * start counting from the first packet's pts or the second packet's.
- * So, let's just count from the last fed package's pts. If the packets are
- * exactly cut to MPEG frames, this will cause one frame mismatch in the
- * beginning (when mpg123 peeks ahead for the following header), but will
- * be corrected with the third frame already. One might add special code to
- * not increment the base pts past the first packet's after a resync before
- * the first decoded bytes arrived. */
-static int decode_a_bit(sh_audio_t *sh, unsigned char *buf, int count)
-{
-    int ret = MPG123_OK;
-    int got = 0;
-    struct ad_mpg123_context *con = sh->context;
-
-    /* There will be one MPG123_NEW_FORMAT message on first open.
-     * This will be handled in init(). */
-    do {
-        size_t got_now = 0;
-        /* Fetch new format now, after old data has been used. */
-        if(con->new_format)
-            ret = set_format(sh, con);
-
-        /* Feed the decoder. This will only fire from the second round on. */
-        if (ret == MPG123_NEED_MORE) {
-            /* Feed more input data. */
-            struct demux_packet *pkt = demux_read_packet(sh->gsh);
-            if (!pkt)
-                break;          /* Apparently that's it. EOF. */
-
-            /* Next bytes from that presentation time. */
-            if (pkt->pts != MP_NOPTS_VALUE) {
-                sh->pts       = pkt->pts;
-                sh->pts_bytes = 0;
-            }
-
-            /* Have to use mpg123_feed() to avoid decoding here. */
-            ret = mpg123_feed(con->handle, pkt->buffer, pkt->len);
-            talloc_free(pkt);
-            if (ret == MPG123_ERR)
-                break;
-
-            /* Indication of format change is possible here (from mpg123_decode()). */
-            if(ret == MPG123_NEW_FORMAT) {
-                con->new_format = 1;
-                if(got)
-                    break; /* Do not switch format during a chunk. */
-
-                ret = set_format(sh, con);
-            }
-        }
-        /* Theoretically, mpg123 could return MPG123_DONE, so be prepared.
-         * Should not happen in our usage, but it is a valid return code. */
-        else if (ret == MPG123_ERR || ret == MPG123_DONE)
-            break;
-
-        /* Try to decode a bit. This is the return value that counts
-         * for the loop condition. */
-        if (!buf) { /* fake call just for feeding to get format */
-            ret = set_format(sh, con);
-        } else { /* This is the decoding. One frame at a time. */
-            ret = mpg123_replace_buffer(con->handle, buf, count);
-            if (ret == MPG123_OK)
-                ret = mpg123_decode_frame(con->handle, NULL, NULL, &got_now);
-        }
-
-        got += got_now;
-        sh->pts_bytes += got_now;
-
-        /* Indication of format change should happen here. */
-        if(ret == MPG123_NEW_FORMAT) {
-            con->new_format = 1;
-            if(got)
-                break; /* Do not switch format during a chunk. */
-
-            ret = set_format(sh, con);
-        }
-
-    } while (ret == MPG123_NEED_MORE || (got == 0 && count != 0));
-
-    if (ret == MPG123_ERR) {
-        mp_msg(MSGT_DECAUDIO, MSGL_ERR, "mpg123 decoding failed: %s\n",
-               mpg123_strerror(con->handle));
-    }
-
-    return got;
-}
-
-/* Close, reopen stream. Feed data until we know the format of the stream.
- * 1 on success, 0 on error */
-static int reopen_stream(sh_audio_t *sh)
-{
-    struct ad_mpg123_context *con = (struct ad_mpg123_context*) sh->context;
-
-    mpg123_close(con->handle);
-    /* No resetting of the context:
-     * We do not want to loose the mean bitrate data. */
-
-    /* Open and make sure we have fed enough data to get stream properties. */
-    if (MPG123_OK == mpg123_open_feed(con->handle) &&
-        /* Feed data until mpg123 is ready (has found stream beginning). */
-        !decode_a_bit(sh, NULL, 0) &&
-        set_format(sh, con) == MPG123_OK) { /* format setting again just for return value */
-        return 1;
-    } else {
-        mp_msg(MSGT_DECAUDIO, MSGL_ERR,
-               "mpg123 failed to reopen stream: %s\n",
-               mpg123_strerror(con->handle));
-        return 0;
-    }
-}
-
-/* Now we really start accessing some data and determining file format.
- * Format now is allowed to change on-the-fly. Here is the only point
- * that has MPlayer react to errors. We have to pray that exceptional
- * erros in other places simply cannot occur. */
-static int init(sh_audio_t *sh, const char *decoder)
-{
-    mpg123_id3v2 *v2;
-    struct mpg123_frameinfo finfo;
-    struct ad_mpg123_context *con = sh->context;
-
-    con->new_format = 0;
-    if (reopen_stream(sh) &&
-        /* Get MPEG header info. */
-        MPG123_OK == mpg123_info(con->handle, &finfo) &&
-        /* Since we queried format, mpg123 should have read past ID3v2 tags.
-         * We need to decide if printing of UTF-8 encoded text info is wanted. */
-        MPG123_OK == mpg123_id3(con->handle, NULL, &v2)) {
-        /* If we are here, we passed all hurdles. Yay! Extract the info. */
-        print_header_compact(&finfo);
-        /* Do we want to print out the UTF-8 Id3v2 info?
-        if (v2)
-            print_id3v2(v2); */
-
-        /* Have kb/s, want B/s
-         * For VBR, the first frame will be a bad estimate. */
-        sh->i_bps = (finfo.bitrate ? finfo.bitrate : compute_bitrate(&finfo))
-                    * 1000 / 8;
-        con->delay      = 1;
-        con->mean_rate  = 0.;
-        con->mean_count = 0;
-        con->vbr = (finfo.vbr != MPG123_CBR);
-
-        return 1;
-    } else {
-        mp_msg(MSGT_DECAUDIO, MSGL_ERR, "mpg123 init error: %s\n",
-               mpg123_strerror(con->handle));
-        return 0;
-    }
-}
-
-static void uninit(sh_audio_t *sh)
-{
-    struct ad_mpg123_context *con = (struct ad_mpg123_context*) sh->context;
-
-    mpg123_close(con->handle);
-    mpg123_delete(con->handle);
-    free(sh->context);
-    sh->context = NULL;
-    mpg123_exit();
-}
-
 /* Update mean bitrate. This could be dropped if accurate time display
  * on audio file playback is not desired. */
 static void update_info(sh_audio_t *sh)
 {
     struct ad_mpg123_context *con = sh->context;
-    if (con->vbr && --con->delay < 1) {
-        struct mpg123_frameinfo finfo;
-        if (MPG123_OK == mpg123_info(con->handle, &finfo)) {
+    struct mpg123_frameinfo finfo;
+    if (mpg123_info(con->handle, &finfo) != MPG123_OK)
+        return;
+
+    if (finfo.vbr != MPG123_CBR) {
+        if (--con->delay < 1) {
             if (++con->mean_count > ((unsigned int) -1) / 2)
                 con->mean_count = ((unsigned int) -1) / 4;
 
@@ -404,42 +279,80 @@ static void update_info(sh_audio_t *sh)
 
             con->delay = 10;
         }
+    } else {
+        sh->i_bps = (finfo.bitrate ? finfo.bitrate : compute_bitrate(&finfo))
+                    * 1000 / 8;
+        con->delay      = 1;
+        con->mean_rate  = 0.;
+        con->mean_count = 0;
     }
 }
 
-static int decode_audio(sh_audio_t *sh, unsigned char *buf, int minlen,
-                        int maxlen)
+static int decode_audio(sh_audio_t *sh, struct mp_audio *buffer, int maxlen)
 {
-    int bytes;
+    struct ad_mpg123_context *con = sh->context;
+    void *buf = buffer->planes[0];
+    int ret;
 
-    bytes = decode_a_bit(sh, buf, maxlen);
-    /* This EOF is ignored, apparently, until input data is exhausted. */
-    if (bytes == 0)
-        return -1;              /* EOF */
+    if (con->new_format) {
+        ret = set_format(sh);
+        if (ret == MPG123_OK) {
+            return 0; // let caller handle format change
+        } else if (ret == MPG123_NEED_MORE) {
+            con->need_data = true;
+        } else {
+            goto mpg123_fail;
+        }
+    }
+
+    if (con->need_data && feed_new_packet(sh) < 0)
+        return -1; // EOF or feed error
+    // One request, one packet: without this every later call keeps feeding.
+    con->need_data = false;
+
+    size_t got_now = 0;
+    ret = mpg123_replace_buffer(con->handle, buf, maxlen * con->sample_size);
+    if (ret != MPG123_OK)
+        goto mpg123_fail;
+
+    ret = mpg123_decode_frame(con->handle, NULL, NULL, &got_now);
+
+    int got_samples = got_now / con->sample_size; // bytes -> samples
+    buffer->samples += got_samples;
+    sh->pts_offset += got_samples;
+
+    if (ret == MPG123_NEW_FORMAT) {
+        con->new_format = true;
+    } else if (ret == MPG123_NEED_MORE) {
+        con->need_data = true; // served by the next call
+    } else if (ret != MPG123_OK && ret != MPG123_DONE) {
+        goto mpg123_fail;
+    }
 
     update_info(sh);
-    return bytes;
+    return 0;
+
+mpg123_fail:
+    mp_msg(MSGT_DECAUDIO, MSGL_ERR, "mpg123 decoding error: %s\n",
+           mpg123_strerror(con->handle));
+    return -1;
 }
 
 static int control(sh_audio_t *sh, int cmd, void *arg)
 {
+    struct ad_mpg123_context *con = sh->context;
+
     switch (cmd) {
     case ADCTRL_RESYNC_STREAM:
-        /* Close/reopen the stream for mpg123 to make sure it doesn't
-         * think that it still knows the exact stream position.
-         * Otherwise, we would have funny effects from the gapless code.
-         * Oh, and it helps to minimize artifacts from jumping in the stream. */
-        if (reopen_stream(sh)) {
-            update_info(sh);
-            return CONTROL_TRUE;
-        } else {
-            /* MPlayer ignores this case! It just keeps on decoding.
-             * So we have to make sure resync never fails ... */
+        mpg123_close(con->handle); /* make mpg123 forget its stream position */
+
+        if (mpg123_open_feed(con->handle) != MPG123_OK) {
             mp_msg(MSGT_DECAUDIO, MSGL_ERR,
-                   "mpg123 cannot reopen stream for resync.\n");
+                   "mpg123 failed to reopen stream: %s\n",
+                   mpg123_strerror(con->handle));
             return CONTROL_FALSE;
         }
-        break;
+        return CONTROL_TRUE;
     }
     return CONTROL_UNKNOWN;
 }
diff --git a/audio/decode/ad_spdif.c b/audio/decode/ad_spdif.c
index f03041d6a6..a233286c19 100644
--- a/audio/decode/ad_spdif.c
+++ b/audio/decode/ad_spdif.c
@@ -19,6 +19,7 @@
  */
 
 #include <string.h>
+#include <assert.h>
 
 #include <libavformat/avformat.h>
 #include <libavcodec/avcodec.h>
@@ -184,37 +185,43 @@ fail:
     return 0;
 }
 
-static int decode_audio(sh_audio_t *sh, unsigned char *buf,
-                        int minlen, int maxlen)
+static int decode_audio(sh_audio_t *sh, struct mp_audio *buffer, int maxlen)
 {
     struct spdifContext *spdif_ctx = sh->context;
     AVFormatContext     *lavf_ctx  = spdif_ctx->lavf_ctx;
 
+    int sstride = 2 * sh->channels.num; // bytes per multi-channel sample
+    assert(sstride == buffer->sstride);
+
+    if (maxlen * sstride < spdif_ctx->iec61937_packet_size)
+        return 0; // maxlen is in samples; the packet size is in bytes
+
     spdif_ctx->out_buffer_len  = 0;
-    spdif_ctx->out_buffer_size = maxlen;
-    spdif_ctx->out_buffer      = buf;
-    while (spdif_ctx->out_buffer_len + spdif_ctx->iec61937_packet_size < maxlen
-           && spdif_ctx->out_buffer_len < minlen) {
-        struct demux_packet *mpkt = demux_read_packet(sh->gsh);
-        if (!mpkt)
-            break;
-        AVPacket pkt;
-        mp_set_av_packet(&pkt, mpkt);
-        pkt.pts = pkt.dts = 0;
-        mp_msg(MSGT_DECAUDIO, MSGL_V, "spdif packet, size=%d\n", pkt.size);
-        if (mpkt->pts != MP_NOPTS_VALUE) {
-            sh->pts       = mpkt->pts;
-            sh->pts_bytes = 0;
-        }
-        int out_len = spdif_ctx->out_buffer_len;
-        int ret = av_write_frame(lavf_ctx, &pkt);
-        avio_flush(lavf_ctx->pb);
-        sh->pts_bytes += spdif_ctx->out_buffer_len - out_len;
-        talloc_free(mpkt);
-        if (ret < 0)
-            break;
+    spdif_ctx->out_buffer_size = maxlen * sstride; // byte capacity
+    spdif_ctx->out_buffer      = buffer->planes[0];
+
+    struct demux_packet *mpkt = demux_read_packet(sh->gsh);
+    if (!mpkt)
+        return 0; // NOTE(review): EOF; ad_mpg123 returns -1 here — confirm
+
+    AVPacket pkt;
+    mp_set_av_packet(&pkt, mpkt);
+    pkt.pts = pkt.dts = 0;
+    mp_msg(MSGT_DECAUDIO, MSGL_V, "spdif packet, size=%d\n", pkt.size);
+    if (mpkt->pts != MP_NOPTS_VALUE) {
+        sh->pts        = mpkt->pts;
+        sh->pts_offset = 0;
     }
-    return spdif_ctx->out_buffer_len;
+    int out_len = spdif_ctx->out_buffer_len;
+    int ret = av_write_frame(lavf_ctx, &pkt);
+    avio_flush(lavf_ctx->pb);
+    sh->pts_offset += (spdif_ctx->out_buffer_len - out_len) / sstride;
+    talloc_free(mpkt);
+    if (ret < 0)
+        return -1;
+
+    buffer->samples = spdif_ctx->out_buffer_len / sstride; // bytes -> samples
+    return 0;
 }
 
 static int control(sh_audio_t *sh, int cmd, void *arg)
diff --git a/audio/decode/dec_audio.c b/audio/decode/dec_audio.c
index 6c4af89ef9..19b5d8bdeb 100644
--- a/audio/decode/dec_audio.c
+++ b/audio/decode/dec_audio.c
@@ -38,6 +38,7 @@
 #include "dec_audio.h"
 #include "ad.h"
 #include "audio/format.h"
+#include "audio/audio.h"
 #include "audio/audio_buffer.h"
 
 #include "audio/filter/af.h"
@@ -55,31 +56,29 @@ static const struct ad_functions * const ad_drivers[] = {
     NULL
 };
 
+// At least ad_mpg123 needs to be able to decode this many samples at once
+#define DECODE_MAX_UNIT 1152
+
+// At least 8192 samples, plus DECODE_MAX_UNIT of headroom (ad_mpg123 needs it)
+#define DECODE_BUFFER_SAMPLES (8192 + DECODE_MAX_UNIT)
+
+// Drop audio buffer and reinit it (after format change)
+static void reinit_audio_buffer(sh_audio_t *sh)
+{
+    mp_audio_buffer_reinit_fmt(sh->decode_buffer, sh->sample_format,
+                               &sh->channels, sh->samplerate);
+    mp_audio_buffer_preallocate_min(sh->decode_buffer, DECODE_BUFFER_SAMPLES);
+}
+
 static int init_audio_codec(sh_audio_t *sh_audio, const char *decoder)
 {
     assert(!sh_audio->initialized);
     resync_audio_stream(sh_audio);
-    sh_audio->sample_format = AF_FORMAT_FLOAT_NE;
-    sh_audio->audio_out_minsize = 8192; // default, preinit() may change it
     if (!sh_audio->ad_driver->preinit(sh_audio)) {
         mp_tmsg(MSGT_DECAUDIO, MSGL_ERR, "Audio decoder preinit failed.\n");
         return 0;
     }
 
-    const int base_size = 65536;
-    // At least 64 KiB plus rounding up to next decodable unit size
-    sh_audio->a_buffer_size = base_size + sh_audio->audio_out_minsize;
-
-    mp_tmsg(MSGT_DECAUDIO, MSGL_V,
-            "dec_audio: Allocating %d + %d = %d bytes for output buffer.\n",
-            sh_audio->audio_out_minsize, base_size,
-            sh_audio->a_buffer_size);
-
-    sh_audio->a_buffer = av_mallocz(sh_audio->a_buffer_size);
-    if (!sh_audio->a_buffer)
-        abort();
-    sh_audio->a_buffer_len = 0;
-
     if (!sh_audio->ad_driver->init(sh_audio, decoder)) {
         mp_tmsg(MSGT_DECAUDIO, MSGL_V, "Audio decoder init failed.\n");
         uninit_audio(sh_audio); // free buffers
@@ -88,13 +87,18 @@ static int init_audio_codec(sh_audio_t *sh_audio, const char *decoder)
 
     sh_audio->initialized = 1;
 
-    if (mp_chmap_is_empty(&sh_audio->channels) || !sh_audio->samplerate) {
+    if (mp_chmap_is_empty(&sh_audio->channels) || !sh_audio->samplerate ||
+        !sh_audio->sample_format)
+    {
         mp_tmsg(MSGT_DECAUDIO, MSGL_ERR, "Audio decoder did not specify "
                 "audio format!\n");
         uninit_audio(sh_audio); // free buffers
         return 0;
     }
 
+    sh_audio->decode_buffer = mp_audio_buffer_create(NULL);
+    reinit_audio_buffer(sh_audio);
+
     return 1;
 }
 
@@ -188,7 +192,8 @@ void uninit_audio(sh_audio_t *sh_audio)
     }
     talloc_free(sh_audio->gsh->decoder_desc);
     sh_audio->gsh->decoder_desc = NULL;
-    av_freep(&sh_audio->a_buffer);
+    talloc_free(sh_audio->decode_buffer);
+    sh_audio->decode_buffer = NULL;
 }
 
 
@@ -235,37 +240,44 @@ int init_audio_filters(sh_audio_t *sh_audio, int in_samplerate,
 static int filter_n_bytes(sh_audio_t *sh, struct mp_audio_buffer *outbuf,
                           int len)
 {
-    assert(len - 1 + sh->audio_out_minsize <= sh->a_buffer_size);
-
     int error = 0;
 
-    // Decode more bytes if needed
-    int old_samplerate = sh->samplerate;
-    struct mp_chmap old_channels = sh->channels;
-    int old_sample_format = sh->sample_format;
-    while (sh->a_buffer_len < len) {
-        unsigned char *buf = sh->a_buffer + sh->a_buffer_len;
-        int minlen = len - sh->a_buffer_len;
-        int maxlen = sh->a_buffer_size - sh->a_buffer_len;
-        int ret = sh->ad_driver->decode_audio(sh, buf, minlen, maxlen);
-        int format_change = sh->samplerate != old_samplerate
-                            || !mp_chmap_equals(&sh->channels, &old_channels)
-                            || sh->sample_format != old_sample_format;
-        if (ret <= 0 || format_change) {
-            error = format_change ? -2 : -1;
-            // samples from format-changing call get discarded too
-            len = sh->a_buffer_len;
+    struct mp_audio config;
+    mp_audio_buffer_get_format(sh->decode_buffer, &config);
+
+    while (mp_audio_buffer_samples(sh->decode_buffer) < len) {
+        int maxlen = mp_audio_buffer_get_write_available(sh->decode_buffer);
+        if (maxlen < DECODE_MAX_UNIT)
+            break;
+        struct mp_audio buffer;
+        mp_audio_buffer_get_write_buffer(sh->decode_buffer, maxlen, &buffer);
+        buffer.samples = 0;
+        error = sh->ad_driver->decode_audio(sh, &buffer, maxlen);
+        if (error < 0)
+            break;
+        // Commit the data just read as valid data
+        mp_audio_buffer_finish_write(sh->decode_buffer, buffer.samples);
+        // Format change
+        if (sh->samplerate != config.rate ||
+            !mp_chmap_equals(&sh->channels, &config.channels) ||
+            sh->sample_format != config.format)
+        {
+            // If there are still samples left in the buffer, let them drain
+            // first, and don't signal a format change to the caller yet.
+            if (mp_audio_buffer_samples(sh->decode_buffer) > 0)
+                break;
+            reinit_audio_buffer(sh);
+            error = -2;
             break;
         }
-        sh->a_buffer_len += ret;
     }
 
     // Filter
-    struct mp_audio filter_input = {
-        .planes = {sh->a_buffer},
-    };
-    mp_audio_copy_config(&filter_input, &sh->afilter->input);
-    filter_input.samples = len / filter_input.sstride;
+    struct mp_audio filter_input;
+    mp_audio_buffer_peek(sh->decode_buffer, &filter_input);
+    filter_input.rate = sh->afilter->input.rate; // due to playback speed change
+    len = MPMIN(filter_input.samples, len);
+    filter_input.samples = len;
 
     struct mp_audio *filter_output = af_play(sh->afilter, &filter_input);
     if (!filter_output)
@@ -273,8 +285,7 @@ static int filter_n_bytes(sh_audio_t *sh, struct mp_audio_buffer *outbuf,
     mp_audio_buffer_append(outbuf, filter_output);
 
     // remove processed data from decoder buffer:
-    sh->a_buffer_len -= len;
-    memmove(sh->a_buffer, sh->a_buffer + len, sh->a_buffer_len);
+    mp_audio_buffer_skip(sh->decode_buffer, len);
 
     return error;
 }
@@ -289,33 +300,27 @@ int decode_audio(sh_audio_t *sh_audio, struct mp_audio_buffer *outbuf,
 {
     // Indicates that a filter seems to be buffering large amounts of data
     int huge_filter_buffer = 0;
-    int sstride =
-        af_fmt2bits(sh_audio->sample_format) / 8 * sh_audio->channels.num;
-    // Decoded audio must be cut at boundaries of this many bytes
-    int unitsize = sstride * 16;
+    // Decoded audio must be cut at boundaries of this many samples
+    // (Note: the reason for this is unknown, possibly a refactoring artifact)
+    int unitsize = 16;
 
     /* Filter output size will be about filter_multiplier times input size.
      * If some filter buffers audio in big blocks this might only hold
      * as average over time. */
     double filter_multiplier = af_calc_filter_multiplier(sh_audio->afilter);
 
-    /* If the decoder set audio_out_minsize then it can do the equivalent of
-     * "while (output_len < target_len) output_len += audio_out_minsize;",
-     * so we must guarantee there is at least audio_out_minsize-1 bytes
-     * more space in the output buffer than the minimum length we try to
-     * decode. */
-    int max_decode_len = sh_audio->a_buffer_size - sh_audio->audio_out_minsize;
-    if (!unitsize)
-        return -1;
-    max_decode_len -= max_decode_len % unitsize;
+    int prev_buffered = -1;
+    while (minsamples >= 0) {
+        int buffered = mp_audio_buffer_samples(outbuf);
+        if (minsamples < buffered || buffered == prev_buffered)
+            break;
+        prev_buffered = buffered;
 
-    while (minsamples >= 0 && mp_audio_buffer_samples(outbuf) < minsamples) {
-        int decsamples = (minsamples - mp_audio_buffer_samples(outbuf))
-                         / filter_multiplier;
-        int declen = decsamples * sstride;
+        int decsamples = (minsamples - buffered) / filter_multiplier;
         // + some extra for possible filter buffering
-        declen += unitsize << 5;
-        if (huge_filter_buffer)
+        decsamples += unitsize << 5; // 32 units (512 samples), same as before
+
+        if (huge_filter_buffer) {
             /* Some filter must be doing significant buffering if the estimated
              * input length didn't produce enough output from filters.
              * Feed the filters 2k bytes at a time until we have enough output.
@@ -324,15 +329,14 @@ int decode_audio(sh_audio_t *sh_audio, struct mp_audio_buffer *outbuf,
              * to get audio data and buffer video frames in memory while doing
              * so. However the performance impact of either is probably not too
              * significant as long as the value is not completely insane. */
-            declen = 2000;
-        declen -= declen % unitsize;
-        if (declen > max_decode_len)
-            declen = max_decode_len;
-        else
-            /* if this iteration does not fill buffer, we must have lots
-             * of buffering in filters */
-            huge_filter_buffer = 1;
-        int res = filter_n_bytes(sh_audio, outbuf, declen);
+            decsamples = 2000;
+        }
+
+        /* if this iteration does not fill buffer, we must have lots
+         * of buffering in filters */
+        huge_filter_buffer = 1;
+
+        int res = filter_n_bytes(sh_audio, outbuf, decsamples);
         if (res < 0)
             return res;
     }
@@ -342,6 +346,7 @@ int decode_audio(sh_audio_t *sh_audio, struct mp_audio_buffer *outbuf,
 void resync_audio_stream(sh_audio_t *sh_audio)
 {
     sh_audio->pts = MP_NOPTS_VALUE;
+    sh_audio->pts_offset = 0;
     if (!sh_audio->initialized)
         return;
     sh_audio->ad_driver->control(sh_audio, ADCTRL_RESYNC_STREAM, NULL);
diff --git a/audio/reorder_ch.c b/audio/reorder_ch.c
index 57cb664a6f..b99731e6bf 100644
--- a/audio/reorder_ch.c
+++ b/audio/reorder_ch.c
@@ -63,38 +63,6 @@ void reorder_to_planar(void *restrict out, const void *restrict in,
         reorder_to_planar_(out, in, size, nchan, nmemb);
 }
 
-static inline void reorder_to_packed_(uint8_t *out, uint8_t **in,
-                                      size_t size, size_t nchan, size_t nmemb)
-{
-    size_t outstep = nchan * size;
-
-    for (size_t c = 0; c < nchan; ++c) {
-        char *outptr = out + c * size;
-        char *inptr = in[c];
-        for (size_t i = 0; i < nmemb; ++i, outptr += outstep, inptr += size) {
-            memcpy(outptr, inptr, size);
-        }
-    }
-}
-
-// out = destination array of packed samples of given size, nmemb frames
-// in[channel] = source array of samples for the given channel
-void reorder_to_packed(uint8_t *out, uint8_t **in,
-                       size_t size, size_t nchan, size_t nmemb)
-{
-    if (nchan == 1)
-        memcpy(out, in, size * nchan * nmemb);
-    // See reorder_to_planar() why this is done this way
-    else if (size == 1)
-        reorder_to_packed_(out, in, 1, nchan, nmemb);
-    else if (size == 2)
-        reorder_to_packed_(out, in, 2, nchan, nmemb);
-    else if (size == 4)
-        reorder_to_packed_(out, in, 4, nchan, nmemb);
-    else
-        reorder_to_packed_(out, in, size, nchan, nmemb);
-}
-
 #define MAX_SAMPLESIZE 8
 
 static void reorder_channels_(uint8_t *restrict data, int *restrict ch_order,
diff --git a/audio/reorder_ch.h b/audio/reorder_ch.h
index 6b5902c1b6..d81aab7dfa 100644
--- a/audio/reorder_ch.h
+++ b/audio/reorder_ch.h
@@ -27,8 +27,6 @@
 
 void reorder_to_planar(void *restrict out, const void *restrict in,
                        size_t size, size_t nchan, size_t nmemb);
-void reorder_to_packed(uint8_t *out, uint8_t **in,
-                       size_t size, size_t nchan, size_t nmemb);
 
 void reorder_channels(void *restrict data, int *restrict ch_order,
                       size_t sample_size, size_t num_ch, size_t num_frames);
diff --git a/demux/demux.c b/demux/demux.c
index b5934824dc..ded0dd895e 100644
--- a/demux/demux.c
+++ b/demux/demux.c
@@ -259,7 +259,6 @@ struct sh_stream *new_sh_stream(demuxer_t *demuxer, enum stream_type type)
             struct sh_audio *sht = talloc_zero(demuxer, struct sh_audio);
             sht->gsh = sh;
             sht->opts = sh->opts;
-            sht->sample_format = AF_FORMAT_S16_NE;
             sh->audio = sht;
             break;
         }
diff --git a/demux/stheader.h b/demux/stheader.h
index 5aa77ba693..c88ed0b0f7 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -92,11 +92,8 @@ typedef struct sh_audio {
     int samplerate;
     struct mp_chmap channels;
     int i_bps; // == bitrate  (compressed bytes/sec)
-    // decoder buffers:
-    int audio_out_minsize;  // minimal output from decoder may be this much
-    char *a_buffer;         // buffer for decoder output
-    int a_buffer_len;
-    int a_buffer_size;
+    // decoder state:
+    struct mp_audio_buffer *decode_buffer;
     struct af_stream *afilter;          // the audio filter stream
     const struct ad_functions *ad_driver;
     // win32-compatible codec parameters:
@@ -104,7 +101,7 @@ typedef struct sh_audio {
     // note codec extradata may be either under "wf" or "codecdata"
     unsigned char *codecdata;
     int codecdata_len;
-    int pts_bytes;   // bytes output by decoder after last known pts
+    int pts_offset; // number of samples output by decoder after last known pts
 } sh_audio_t;
 
 typedef struct sh_video {
diff --git a/mpvcore/player/audio.c b/mpvcore/player/audio.c
index 9bad5140ef..a13e8d9c07 100644
--- a/mpvcore/player/audio.c
+++ b/mpvcore/player/audio.c
@@ -180,12 +180,9 @@ no_audio:
 double written_audio_pts(struct MPContext *mpctx)
 {
     sh_audio_t *sh_audio = mpctx->sh_audio;
-    if (!sh_audio)
+    if (!sh_audio || !sh_audio->initialized)
         return MP_NOPTS_VALUE;
 
-    double bps = sh_audio->channels.num * sh_audio->samplerate *
-                 (af_fmt2bits(sh_audio->sample_format) / 8);
-
     // first calculate the end pts of audio that has been output by decoder
     double a_pts = sh_audio->pts;
     if (a_pts == MP_NOPTS_VALUE)
@@ -194,13 +191,13 @@ double written_audio_pts(struct MPContext *mpctx)
     // sh_audio->pts is the timestamp of the latest input packet with
     // known pts that the decoder has decoded. sh_audio->pts_bytes is
     // the amount of bytes the decoder has written after that timestamp.
-    a_pts += sh_audio->pts_bytes / bps;
+    a_pts += sh_audio->pts_offset / (double)sh_audio->samplerate;
 
     // Now a_pts hopefully holds the pts for end of audio from decoder.
     // Subtract data in buffers between decoder and audio out.
 
     // Decoded but not filtered
-    a_pts -= sh_audio->a_buffer_len / bps;
+    a_pts -= mp_audio_buffer_seconds(sh_audio->decode_buffer);
 
     // Data buffered in audio filters, measured in seconds of "missing" output
     double buffered_output = af_calc_delay(sh_audio->afilter);
@@ -446,6 +443,6 @@ void clear_audio_output_buffers(struct MPContext *mpctx)
 // Drop decoded data queued for filtering.
 void clear_audio_decode_buffers(struct MPContext *mpctx)
 {
-    if (mpctx->sh_audio)
-        mpctx->sh_audio->a_buffer_len = 0;
+    if (mpctx->sh_audio && mpctx->sh_audio->decode_buffer)
+        mp_audio_buffer_clear(mpctx->sh_audio->decode_buffer);
 }