diff --git a/audio/filter/af_scaletempo2.c b/audio/filter/af_scaletempo2.c
index 0dc0a974d4..5258fe3204 100644
--- a/audio/filter/af_scaletempo2.c
+++ b/audio/filter/af_scaletempo2.c
@@ -65,10 +65,13 @@ static void process(struct mp_filter *f)
             int frame_size = mp_aframe_get_size(p->pending);
             uint8_t **planes = mp_aframe_get_data_ro(p->pending);
             int read = mp_scaletempo2_fill_input_buffer(&p->data,
-                planes, frame_size, final, p->speed);
+                planes, frame_size, p->speed);
             mp_aframe_skip_samples(p->pending, read);
         }
-        p->sent_final |= final;
+        if (final && p->pending && !p->sent_final) {
+            mp_scaletempo2_set_final(&p->data);
+            p->sent_final = true;
+        }
 
         if (mp_scaletempo2_frames_available(&p->data, p->speed)) {
             if (eof) {
@@ -80,11 +83,8 @@ static void process(struct mp_filter *f)
             if (eof) {
                 mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
                 return;
-            } else if (format_change) {
-                // go on with proper reinit on the next iteration
-                p->initialized = false;
-                p->sent_final = false;
             }
+            // for format change go on with proper reinit on the next iteration
         }
     }
 
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index a9d0fba136..4b68fae6c4 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -421,6 +421,9 @@ static void seek_buffer(struct mp_scaletempo2 *p, int frames)
 {
     assert(p->input_buffer_frames >= frames);
     p->input_buffer_frames -= frames;
+    if (p->input_buffer_final_frames > 0) {
+        p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
+    }
     for (int i = 0; i < p->channels; ++i) {
         memmove(p->input_buffer[i], p->input_buffer[i] + frames,
             p->input_buffer_frames * sizeof(float));
@@ -483,27 +486,53 @@ static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
     p->input_buffer = realloc_2d(p->input_buffer, p->channels, size);
 }
 
+// pad end with silence until a wsola iteration can be performed
+static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
+{
+    int needed = frames_needed(p, playback_rate);
+    if (needed <= 0)
+        return; // no silence needed for iteration
+
+    int required_size = needed + p->input_buffer_frames;
+    if (required_size > p->input_buffer_size)
+        resize_input_buffer(p, required_size);
+
+    for (int i = 0; i < p->channels; ++i) {
+        float *ch_input = p->input_buffer[i];
+        for (int j = 0; j < needed; ++j) {
+            ch_input[p->input_buffer_frames + j] = 0.0f;
+        }
+    }
+
+    p->input_buffer_added_silence += needed;
+    p->input_buffer_frames += needed;
+}
+
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
+{
+    if (p->input_buffer_final_frames <= 0) {
+        p->input_buffer_final_frames = p->input_buffer_frames;
+    }
+}
+
 int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
-    uint8_t **planes, int frame_size, bool final, double playback_rate)
+    uint8_t **planes, int frame_size, double playback_rate)
 {
     int needed = frames_needed(p, playback_rate);
     int read = MPMIN(needed, frame_size);
-    int total_fill = final ? needed : read;
-    if (total_fill == 0) return 0;
+    if (read == 0)
+        return 0;
 
-    int required_size = total_fill + p->input_buffer_frames;
+    int required_size = read + p->input_buffer_frames;
     if (required_size > p->input_buffer_size)
         resize_input_buffer(p, required_size);
 
     for (int i = 0; i < p->channels; ++i) {
         memcpy(p->input_buffer[i] + p->input_buffer_frames,
             planes[i], read * sizeof(float));
-        for (int j = read; j < total_fill; ++j) {
-            p->input_buffer[i][p->input_buffer_frames + j] = 0.0f;
-        }
     }
 
-    p->input_buffer_frames += total_fill;
+    p->input_buffer_frames += read;
     return read;
 }
 
@@ -669,6 +698,10 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
 {
     if (playback_rate == 0) return 0;
 
+    if (p->input_buffer_final_frames > 0) {
+        add_input_buffer_final_silence(p, playback_rate);
+    }
+
     // Optimize the muted case to issue a single clear instead of performing
     // the full crossfade and clearing each crossfaded frame.
     if (playback_rate < p->opts->min_playback_rate
@@ -726,12 +759,15 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
 double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
 {
     return p->input_buffer_frames - p->output_time
+        - p->input_buffer_added_silence
         + p->num_complete_frames * playback_rate;
 }
 
 bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
 {
-    return can_perform_wsola(p, playback_rate) || p->num_complete_frames > 0;
+    return p->input_buffer_final_frames > p->target_block_index
+        || can_perform_wsola(p, playback_rate)
+        || p->num_complete_frames > 0;
 }
 
 void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
@@ -749,6 +785,8 @@ void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
 void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
 {
     p->input_buffer_frames = 0;
+    p->input_buffer_final_frames = 0;
+    p->input_buffer_added_silence = 0;
     p->output_time = 0.0;
     p->search_block_index = 0;
     p->target_block_index = 0;
@@ -827,6 +865,8 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
 
     resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size));
     p->input_buffer_frames = 0;
+    p->input_buffer_final_frames = 0;
+    p->input_buffer_added_silence = 0;
 
     p->energy_candidate_blocks = realloc(p->energy_candidate_blocks,
         sizeof(float) * p->channels * p->num_candidate_blocks);
diff --git a/audio/filter/af_scaletempo2_internals.h b/audio/filter/af_scaletempo2_internals.h
index 622d6e20da..6c3c94c0a9 100644
--- a/audio/filter/af_scaletempo2_internals.h
+++ b/audio/filter/af_scaletempo2_internals.h
@@ -112,6 +112,13 @@ struct mp_scaletempo2 {
     float **input_buffer;
     int input_buffer_size;
     int input_buffer_frames;
+    // How many frames in |input_buffer| need to be flushed by padding with
+    // silence to process the final packet. While this is nonzero, the filter
+    // appends silence to |input_buffer| until these frames are processed.
+    int input_buffer_final_frames;
+    // How many additional frames of silence have been added to |input_buffer|
+    // for padding after the final packet.
+    int input_buffer_added_silence;
     float *energy_candidate_blocks;
 };
 
@@ -120,7 +127,8 @@ void mp_scaletempo2_reset(struct mp_scaletempo2 *p);
 void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate);
 double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate);
 int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
-    uint8_t **planes, int frame_size, bool final, double playback_rate);
+    uint8_t **planes, int frame_size, double playback_rate);
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p);
 int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
     float **dest, int dest_size, double playback_rate);
 bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate);