mpv/audio/filter/af_scaletempo2_internals.c

#include <float.h>
#include <math.h>

#include "audio/chmap.h"
#include "audio/filter/af_scaletempo2_internals.h"

#include "config.h"

// Algorithm overview (from chromium):
// Waveform Similarity Overlap-and-add (WSOLA).
//
// One WSOLA iteration
//
// 1) Extract |target_block| as input frames at indices
//    [|target_block_index|, |target_block_index| + |ola_window_size|).
//    Note that |target_block| is the "natural" continuation of the output.
//
// 2) Extract |search_block| as input frames at indices
//    [|search_block_index|,
//     |search_block_index| + |num_candidate_blocks| + |ola_window_size|).
//
// 3) Find a block within the |search_block| that is most similar
//    to |target_block|. Let |optimal_index| be the index of such block and
//    write it to |optimal_block|.
//
// 4) Update:
//    |optimal_block| = |transition_window| * |target_block| +
//    (1 - |transition_window|) * |optimal_block|.
//
// 5) Overlap-and-add |optimal_block| to the |wsola_output|.
//
// 6) Update:write

struct interval {
    int lo;
    int hi;
};

static bool in_interval(int n, struct interval q)
{
    return n >= q.lo && n <= q.hi;
}

static void alloc_sample_buffer(struct mp_scaletempo2 *p, float ***ptr, size_t size)
{
    talloc_free(*ptr);

    float **buff = talloc_array(p, float*, p->channels);
    for (int i = 0; i < p->channels; ++i) {
        buff[i] = talloc_array(buff, float, size);
    }
    *ptr = buff;
}

static void zero_2d_partial(float **a, int x, int y)
{
    for (int i = 0; i < x; ++i) {
        memset(a[i], 0, sizeof(float) * y);
    }
}

// Energies of sliding windows of channels are interleaved.
// The number windows is |input_frames| - (|frames_per_window| - 1), hence,
// the method assumes |energy| must be, at least, of size
// (|input_frames| - (|frames_per_window| - 1)) * |channels|.
static void multi_channel_moving_block_energies(
    float **input, int input_frames, int channels,
    int frames_per_block, float *energy)
{
    int num_blocks = input_frames - (frames_per_block - 1);

    for (int k = 0; k < channels; ++k) {
        const float* input_channel = input[k];

        energy[k] = 0;

        // First block of channel |k|.
        for (int m = 0; m < frames_per_block; ++m) {
            energy[k] += input_channel[m] * input_channel[m];
        }

        const float* slide_out = input_channel;
        const float* slide_in = input_channel + frames_per_block;
        for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) {
            energy[k + n * channels] = energy[k + (n - 1) * channels]
                - *slide_out * *slide_out + *slide_in * *slide_in;
        }
    }
}

static float multi_channel_similarity_measure(
    const float* dot_prod,
    const float* energy_target, const float* energy_candidate,
    int channels)
{
    const float epsilon = 1e-12f;
    float similarity_measure = 0.0f;
    for (int n = 0; n < channels; ++n) {
        similarity_measure += dot_prod[n] * energy_target[n]
            / sqrtf(energy_target[n] * energy_candidate[n] + epsilon);
    }
    return similarity_measure;
}

#if HAVE_VECTOR

typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));

// Dot-product of channels of two AudioBus. For each AudioBus an offset is
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
// allocate sufficient space for |dot_product|.
static void multi_channel_dot_product(
    float **a, int frame_offset_a,
    float **b, int frame_offset_b,
    int channels,
    int num_frames, float *dot_product)
{
    assert(frame_offset_a >= 0);
    assert(frame_offset_b >= 0);

    for (int k = 0; k < channels; ++k) {
        const float* ch_a = a[k] + frame_offset_a;
        const float* ch_b = b[k] + frame_offset_b;
        float sum = 0.0;
        if (num_frames < 32)
            goto rest;

        const v8sf *va = (const v8sf *) ch_a;
        const v8sf *vb = (const v8sf *) ch_b;
        v8sf vsum[4] = {
            // Initialize to product of first 32 floats
            va[0] * vb[0],
            va[1] * vb[1],
            va[2] * vb[2],
            va[3] * vb[3],
        };
        va += 4;
        vb += 4;

        // Process `va` and `vb` across four vertical stripes
        for (int n = 1; n < num_frames / 32; n++) {
            vsum[0] += va[0] * vb[0];
            vsum[1] += va[1] * vb[1];
            vsum[2] += va[2] * vb[2];
            vsum[3] += va[3] * vb[3];
            va += 4;
            vb += 4;
        }

        // Vertical sum across `vsum` entries
        vsum[0] += vsum[1];
        vsum[2] += vsum[3];
        vsum[0] += vsum[2];

        // Horizontal sum across `vsum[0]`, could probably be done better but
        // this section is not super performance critical
        float *vf = (float *) &vsum[0];
        sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
        ch_a = (const float *) va;
        ch_b = (const float *) vb;

rest:
        // Process the remainder
        for (int n = 0; n < num_frames % 32; n++)
            sum += *ch_a++ * *ch_b++;

        dot_product[k] = sum;
    }
}

#else // !HAVE_VECTOR

static void multi_channel_dot_product(
    float **a, int frame_offset_a,
    float **b, int frame_offset_b,
    int channels,
    int num_frames, float *dot_product)
{
    assert(frame_offset_a >= 0);
    assert(frame_offset_b >= 0);

    for (int k = 0; k < channels; ++k) {
        const float* ch_a = a[k] + frame_offset_a;
        const float* ch_b = b[k] + frame_offset_b;
        float sum = 0.0;
        for (int n = 0; n < num_frames; n++)
            sum += *ch_a++ * *ch_b++;
        dot_product[k] = sum;
    }
}

#endif // HAVE_VECTOR

// Fit the curve f(x) = a * x^2 + b * x + c such that
//   f(-1) = y[0]
//   f(0) = y[1]
//   f(1) = y[2]
// and return the maximum, assuming that y[0] <= y[1] >= y[2].
static void quadratic_interpolation(
    const float* y_values, float* extremum, float* extremum_value)
{
    float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1];
    float b = 0.5f * (y_values[2] - y_values[0]);
    float c = y_values[1];

    if (a == 0.f) {
        // The coordinates are colinear (within floating-point error).
        *extremum = 0;
        *extremum_value = y_values[1];
    } else {
        *extremum = -b / (2.f * a);
        *extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c;
    }
}

// Search a subset of all candid blocks. The search is performed every
// |decimation| frames. This reduces complexity by a factor of about
// 1 / |decimation|. A cubic interpolation is used to have a better estimate of
// the best match.
static int decimated_search(
    int decimation, struct interval exclude_interval,
    float **target_block, int target_block_frames,
    float **search_segment, int search_segment_frames,
    int channels,
    const float *energy_target_block, const float *energy_candidate_blocks)
{
    int num_candidate_blocks = search_segment_frames - (target_block_frames - 1);
    float dot_prod [MP_NUM_CHANNELS];
    float similarity[3];  // Three elements for cubic interpolation.

    int n = 0;
    multi_channel_dot_product(
        target_block, 0,
        search_segment, n,
        channels,
        target_block_frames, dot_prod);
    similarity[0] = multi_channel_similarity_measure(
        dot_prod, energy_target_block,
        &energy_candidate_blocks[n * channels], channels);

    // Set the starting point as optimal point.
    float best_similarity = similarity[0];
    int optimal_index = 0;

    n += decimation;
    if (n >= num_candidate_blocks) {
        return 0;
    }

    multi_channel_dot_product(
        target_block, 0,
        search_segment, n,
        channels,
        target_block_frames, dot_prod);
    similarity[1] = multi_channel_similarity_measure(
        dot_prod, energy_target_block,
        &energy_candidate_blocks[n * channels], channels);

    n += decimation;
    if (n >= num_candidate_blocks) {
        // We cannot do any more sampling. Compare these two values and return the
        // optimal index.
        return similarity[1] > similarity[0] ? decimation : 0;
    }

    for (; n < num_candidate_blocks; n += decimation) {
        multi_channel_dot_product(
            target_block, 0,
            search_segment, n,
            channels,
            target_block_frames, dot_prod);

        similarity[2] = multi_channel_similarity_measure(
            dot_prod, energy_target_block,
            &energy_candidate_blocks[n * channels], channels);

        if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) ||
            (similarity[1] >= similarity[0] && similarity[1] > similarity[2]))
        {
            // A local maximum is found. Do a cubic interpolation for a better
            // estimate of candidate maximum.
            float normalized_candidate_index;
            float candidate_similarity;
            quadratic_interpolation(similarity, &normalized_candidate_index,
                                    &candidate_similarity);

            int candidate_index = n - decimation
                 + (int)(normalized_candidate_index * decimation +  0.5f);
            if (candidate_similarity > best_similarity
                && !in_interval(candidate_index, exclude_interval)) {
                optimal_index = candidate_index;
                best_similarity = candidate_similarity;
            }
        } else if (n + decimation >= num_candidate_blocks &&
                   similarity[2] > best_similarity &&
                   !in_interval(n, exclude_interval))
        {
            // If this is the end-point and has a better similarity-measure than
            // optimal, then we accept it as optimal point.
            optimal_index = n;
            best_similarity = similarity[2];
        }
        memmove(similarity, &similarity[1], 2 * sizeof(*similarity));
    }
    return optimal_index;
}

// Search [|low_limit|, |high_limit|] of |search_segment| to find a block that
// is most similar to |target_block|. |energy_target_block| is the energy of the
// |target_block|. |energy_candidate_blocks| is the energy of all blocks within
// |search_block|.
static int full_search(
    int low_limit, int high_limit,
    struct interval exclude_interval,
    float **target_block, int target_block_frames,
    float **search_block, int search_block_frames,
    int channels,
    const float* energy_target_block,
    const float* energy_candidate_blocks)
{
    // int block_size = target_block->frames;
    float dot_prod [sizeof(float) * MP_NUM_CHANNELS];

    float best_similarity = -FLT_MAX;//FLT_MIN;
    int optimal_index = 0;

    for (int n = low_limit; n <= high_limit; ++n) {
        if (in_interval(n, exclude_interval)) {
            continue;
        }
        multi_channel_dot_product(target_block, 0, search_block, n, channels,
            target_block_frames, dot_prod);

        float similarity = multi_channel_similarity_measure(
            dot_prod, energy_target_block,
            &energy_candidate_blocks[n * channels], channels);

        if (similarity > best_similarity) {
            best_similarity = similarity;
            optimal_index = n;
        }
    }

    return optimal_index;
}

// Find the index of the block, within |search_block|, that is most similar
// to |target_block|. Obviously, the returned index is w.r.t. |search_block|.
// |exclude_interval| is an interval that is excluded from the search.
static int compute_optimal_index(
    float **search_block, int search_block_frames,
    float **target_block, int target_block_frames,
    float *energy_candidate_blocks,
    int channels,
    struct interval exclude_interval)
{
    int num_candidate_blocks = search_block_frames - (target_block_frames - 1);

    // This is a compromise between complexity reduction and search accuracy. I
    // don't have a proof that down sample of order 5 is optimal.
    // One can compute a decimation factor that minimizes complexity given
    // the size of |search_block| and |target_block|. However, my experiments
    // show the rate of missing the optimal index is significant.
    // This value is chosen heuristically based on experiments.
    const int search_decimation = 5;

    float energy_target_block [MP_NUM_CHANNELS];
    // energy_candidate_blocks must have at least size
    // sizeof(float) * channels * num_candidate_blocks

    // Energy of all candid frames.
    multi_channel_moving_block_energies(
        search_block,
        search_block_frames,
        channels,
        target_block_frames,
        energy_candidate_blocks);

    // Energy of target frame.
    multi_channel_dot_product(
        target_block, 0,
        target_block, 0,
        channels,
        target_block_frames, energy_target_block);

    int optimal_index = decimated_search(
        search_decimation, exclude_interval,
        target_block, target_block_frames,
        search_block, search_block_frames,
        channels,
        energy_target_block,
        energy_candidate_blocks);

    int lim_low = MPMAX(0, optimal_index - search_decimation);
    int lim_high = MPMIN(num_candidate_blocks - 1,
                            optimal_index + search_decimation);
    return full_search(
        lim_low, lim_high, exclude_interval,
        target_block, target_block_frames,
        search_block, search_block_frames,
        channels,
        energy_target_block, energy_candidate_blocks);
}

static void peek_buffer(struct mp_scaletempo2 *p,
    int frames, int read_offset, int write_offset, float **dest)
{
    assert(p->input_buffer_frames >= frames);
    for (int i = 0; i < p->channels; ++i) {
        memcpy(dest[i] + write_offset,
            p->input_buffer[i] + read_offset,
            frames * sizeof(float));
    }
}

static void seek_buffer(struct mp_scaletempo2 *p, int frames)
{
    assert(p->input_buffer_frames >= frames);
    p->input_buffer_frames -= frames;
    if (p->input_buffer_final_frames > 0) {
        p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
    }
    for (int i = 0; i < p->channels; ++i) {
        memmove(p->input_buffer[i], p->input_buffer[i] + frames,
            p->input_buffer_frames * sizeof(float));
    }
}

static int write_completed_frames_to(struct mp_scaletempo2 *p,
    int requested_frames, int dest_offset, float **dest)
{
    int rendered_frames = MPMIN(p->num_complete_frames, requested_frames);

    if (rendered_frames == 0)
        return 0;  // There is nothing to read from |wsola_output|, return.

    for (int i = 0; i < p->channels; ++i) {
        memcpy(dest[i] + dest_offset, p->wsola_output[i],
            rendered_frames * sizeof(float));
    }

    // Remove the frames which are read.
    int frames_to_move = p->wsola_output_size - rendered_frames;
    for (int k = 0; k < p->channels; ++k) {
        float *ch = p->wsola_output[k];
        memmove(ch, &ch[rendered_frames], sizeof(*ch) * frames_to_move);
    }
    p->num_complete_frames -= rendered_frames;
    return rendered_frames;
}

// next output_time for the given playback_rate
static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
{
    return p->output_time + p->ola_hop_size * playback_rate;
}

// search_block_index for the given output_time
static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
{
    return (int)(output_time - p->search_block_center_offset + 0.5);
}

// number of frames needed until a wsola iteration can be performed
static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
{
    int search_block_index =
        get_search_block_index(p, get_updated_time(p, playback_rate));
    return MPMAX(0, MPMAX(
        p->target_block_index + p->ola_window_size - p->input_buffer_frames,
        search_block_index + p->search_block_size - p->input_buffer_frames));
}

static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
{
    return frames_needed(p, playback_rate) <= 0;
}

// pad end with silence until a wsola iteration can be performed
static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
{
    int needed = frames_needed(p, playback_rate);
    if (needed <= 0)
        return; // no silence needed for iteration

    int last_index = needed + p->input_buffer_frames - 1;
    for (int i = 0; i < p->channels; ++i) {
        MP_TARRAY_GROW(p, p->input_buffer[i], last_index);
        float *ch_input = p->input_buffer[i];
        for (int j = 0; j < needed; ++j) {
            ch_input[p->input_buffer_frames + j] = 0.0f;
        }
    }

    p->input_buffer_added_silence += needed;
    p->input_buffer_frames += needed;
}

void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
{
    if (p->input_buffer_final_frames <= 0) {
        p->input_buffer_final_frames = p->input_buffer_frames;
    }
}

int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
    uint8_t **planes, int frame_size, double playback_rate)
{
    int needed = frames_needed(p, playback_rate);
    int read = MPMIN(needed, frame_size);
    if (read == 0)
        return 0;

    int last_index = read + p->input_buffer_frames - 1;
    for (int i = 0; i < p->channels; ++i) {
        MP_TARRAY_GROW(p, p->input_buffer[i], last_index);
        memcpy(p->input_buffer[i] + p->input_buffer_frames,
            planes[i], read * sizeof(float));
    }

    p->input_buffer_frames += read;
    return read;
}

static bool target_is_within_search_region(struct mp_scaletempo2 *p)
{
    return p->target_block_index >= p->search_block_index
        && p->target_block_index + p->ola_window_size
            <= p->search_block_index + p->search_block_size;
}


static void peek_audio_with_zero_prepend(struct mp_scaletempo2 *p,
    int read_offset_frames, float **dest, int dest_frames)
{
    assert(read_offset_frames + dest_frames <= p->input_buffer_frames);

    int write_offset = 0;
    int num_frames_to_read = dest_frames;
    if (read_offset_frames < 0) {
        int num_zero_frames_appended = MPMIN(
            -read_offset_frames, num_frames_to_read);
        read_offset_frames = 0;
        num_frames_to_read -= num_zero_frames_appended;
        write_offset = num_zero_frames_appended;
        zero_2d_partial(dest, p->channels, num_zero_frames_appended);
    }
    peek_buffer(p, num_frames_to_read, read_offset_frames, write_offset, dest);
}

static void get_optimal_block(struct mp_scaletempo2 *p)
{
    int optimal_index = 0;

    // An interval around last optimal block which is excluded from the search.
    // This is to reduce the buzzy sound. The number 160 is rather arbitrary and
    // derived heuristically.
    const int exclude_interval_length_frames = 160;
    if (target_is_within_search_region(p)) {
        optimal_index = p->target_block_index;
        peek_audio_with_zero_prepend(p,
            optimal_index, p->optimal_block, p->ola_window_size);
    } else {
        peek_audio_with_zero_prepend(p,
            p->target_block_index, p->target_block, p->ola_window_size);
        peek_audio_with_zero_prepend(p,
            p->search_block_index, p->search_block, p->search_block_size);
        int last_optimal = p->target_block_index
            - p->ola_hop_size - p->search_block_index;
        struct interval exclude_iterval = {
            .lo = last_optimal - exclude_interval_length_frames / 2,
            .hi = last_optimal + exclude_interval_length_frames / 2
        };

        // |optimal_index| is in frames and it is relative to the beginning of the
        // |search_block|.
        optimal_index = compute_optimal_index(
            p->search_block, p->search_block_size,
            p->target_block, p->ola_window_size,
            p->energy_candidate_blocks,
            p->channels,
            exclude_iterval);

        // Translate |index| w.r.t. the beginning of |audio_buffer| and extract the
        // optimal block.
        optimal_index += p->search_block_index;
        peek_audio_with_zero_prepend(p,
            optimal_index, p->optimal_block, p->ola_window_size);

        // Make a transition from target block to the optimal block if different.
        // Target block has the best continuation to the current output.
        // Optimal block is the most similar block to the target, however, it might
        // introduce some discontinuity when over-lap-added. Therefore, we combine
        // them for a smoother transition. The length of transition window is twice
        // as that of the optimal-block which makes it like a weighting function
        // where target-block has higher weight close to zero (weight of 1 at index
        // 0) and lower weight close the end.
        for (int k = 0; k < p->channels; ++k) {
            float* ch_opt = p->optimal_block[k];
            float* ch_target = p->target_block[k];
            for (int n = 0; n < p->ola_window_size; ++n) {
                ch_opt[n] = ch_opt[n] * p->transition_window[n]
                    + ch_target[n] * p->transition_window[p->ola_window_size + n];
            }
        }
    }

    // Next target is one hop ahead of the current optimal.
    p->target_block_index = optimal_index + p->ola_hop_size;
}

static void set_output_time(struct mp_scaletempo2 *p, double output_time)
{
    p->output_time = output_time;
    p->search_block_index = get_search_block_index(p, output_time);
}

static void remove_old_input_frames(struct mp_scaletempo2 *p)
{
    const int earliest_used_index = MPMIN(
        p->target_block_index, p->search_block_index);
    if (earliest_used_index <= 0)
        return;  // Nothing to remove.

    // Remove frames from input and adjust indices accordingly.
    seek_buffer(p, earliest_used_index);
    p->target_block_index -= earliest_used_index;
    p->output_time -= earliest_used_index;
    p->search_block_index -= earliest_used_index;
}

static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
{
    if (!can_perform_wsola(p, playback_rate)) {
        return false;
    }

    set_output_time(p, get_updated_time(p, playback_rate));
    remove_old_input_frames(p);

    assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);

    get_optimal_block(p);

    // Overlap-and-add.
    for (int k = 0; k < p->channels; ++k) {
        float* ch_opt_frame = p->optimal_block[k];
        float* ch_output = p->wsola_output[k] + p->num_complete_frames;
        if (p->wsola_output_started) {
            for (int n = 0; n < p->ola_hop_size; ++n) {
                ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
                    ch_opt_frame[n] * p->ola_window[n];
            }

            // Copy the second half to the output.
            memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
                   sizeof(*ch_opt_frame) * p->ola_hop_size);
        } else {
            // No overlap for the first iteration.
            memcpy(ch_output, ch_opt_frame,
                   sizeof(*ch_opt_frame) * p->ola_window_size);
        }
    }

    p->num_complete_frames += p->ola_hop_size;
    p->wsola_output_started = true;
    return true;
}

static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest)
{
    int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index);

    if (frames_to_copy <= 0)
        return 0; // There is nothing to read from input buffer; return.

    peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest);
    seek_buffer(p, frames_to_copy);
    return frames_to_copy;
}

int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
    float **dest, int dest_size, double playback_rate)
{
    if (playback_rate == 0) return 0;

    if (p->input_buffer_final_frames > 0) {
        add_input_buffer_final_silence(p, playback_rate);
    }

    // Optimize the muted case to issue a single clear instead of performing
    // the full crossfade and clearing each crossfaded frame.
    if (playback_rate < p->opts->min_playback_rate
        || (playback_rate > p->opts->max_playback_rate
            && p->opts->max_playback_rate > 0))
    {
        int frames_to_render = MPMIN(dest_size,
            (int) (p->input_buffer_frames / playback_rate));

        // Compute accurate number of frames to actually skip in the source data.
        // Includes the leftover partial frame from last request. However, we can
        // only skip over complete frames, so a partial frame may remain for next
        // time.
        p->muted_partial_frame += frames_to_render * playback_rate;
        int seek_frames = (int) (p->muted_partial_frame);
        zero_2d_partial(dest, p->channels, frames_to_render);
        seek_buffer(p, seek_frames);

        // Determine the partial frame that remains to be skipped for next call. If
        // the user switches back to playing, it may be off time by this partial
        // frame, which would be undetectable. If they subsequently switch to
        // another playback rate that mutes, the code will attempt to line up the
        // frames again.
        p->muted_partial_frame -= seek_frames;
        return frames_to_render;
    }

    int slower_step = (int) ceilf(p->ola_window_size * playback_rate);
    int faster_step = (int) ceilf(p->ola_window_size / playback_rate);

    // Optimize the most common |playback_rate| ~= 1 case to use a single copy
    // instead of copying frame by frame.
    if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) {

        if (p->wsola_output_started) {
            p->wsola_output_started = false;

            // sync audio precisely again
            set_output_time(p, p->target_block_index);
            remove_old_input_frames(p);
        }

        return read_input_buffer(p, dest_size, dest);
    }

    int rendered_frames = 0;
    do {
        rendered_frames += write_completed_frames_to(p,
            dest_size - rendered_frames, rendered_frames, dest);
    } while (rendered_frames < dest_size
             && run_one_wsola_iteration(p, playback_rate));
    return rendered_frames;
}

double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
{
    return p->input_buffer_frames - p->output_time
        - p->input_buffer_added_silence
        + p->num_complete_frames * playback_rate;
}

bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
{
    return (p->input_buffer_final_frames > p->target_block_index &&
            p->input_buffer_final_frames > 0)
        || can_perform_wsola(p, playback_rate)
        || p->num_complete_frames > 0;
}

void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
{
    p->input_buffer_frames = 0;
    p->input_buffer_final_frames = 0;
    p->input_buffer_added_silence = 0;
    p->output_time = 0.0;
    p->search_block_index = 0;
    p->target_block_index = 0;
    p->num_complete_frames = 0;
    p->wsola_output_started = false;
}

// Return a "periodic" Hann window. This is the first L samples of an L+1
// Hann window. It is perfect reconstruction for overlap-and-add.
static void get_symmetric_hanning_window(int window_length, float* window)
{
    const float scale = 2.0f * M_PI / window_length;
    for (int n = 0; n < window_length; ++n)
        window[n] = 0.5f * (1.0f - cosf(n * scale));
}


void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
{
    p->muted_partial_frame = 0;
    p->output_time = 0;
    p->search_block_index = 0;
    p->target_block_index = 0;
    p->num_complete_frames = 0;
    p->wsola_output_started = false;
    p->channels = channels;

    p->samples_per_second = rate;
    p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
        * p->samples_per_second / 1000);
    p->ola_window_size = (int)(p->opts->ola_window_size_ms
        * p->samples_per_second / 1000);
    // Make sure window size in an even number.
    p->ola_window_size += p->ola_window_size & 1;
    p->ola_hop_size = p->ola_window_size / 2;
    // |num_candidate_blocks| / 2 is the offset of the center of the search
    // block to the center of the first (left most) candidate block. The offset
    // of the center of a candidate block to its left most point is
    // |ola_window_size| / 2 - 1. Note that |ola_window_size| is even and in
    // our convention the center belongs to the left half, so we need to subtract
    // one frame to get the correct offset.
    //
    //                             Search Block
    //              <------------------------------------------->
    //
    //   |ola_window_size| / 2 - 1
    //              <----
    //
    //             |num_candidate_blocks| / 2
    //                   <----------------
    //                                 center
    //              X----X----------------X---------------X-----X
    //              <---------->                     <---------->
    //                Candidate      ...               Candidate
    //                   1,          ...         |num_candidate_blocks|
    p->search_block_center_offset = p->num_candidate_blocks / 2
        + (p->ola_window_size / 2 - 1);
    MP_RESIZE_ARRAY(p, p->ola_window, p->ola_window_size);
    get_symmetric_hanning_window(p->ola_window_size, p->ola_window);
    MP_RESIZE_ARRAY(p, p->transition_window, p->ola_window_size * 2);
    get_symmetric_hanning_window(2 * p->ola_window_size, p->transition_window);

    p->wsola_output_size = p->ola_window_size + p->ola_hop_size;
    alloc_sample_buffer(p, &p->wsola_output, p->wsola_output_size);

    // Auxiliary containers.
    alloc_sample_buffer(p, &p->optimal_block, p->ola_window_size);
    p->search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1);
    alloc_sample_buffer(p, &p->search_block, p->search_block_size);
    alloc_sample_buffer(p, &p->target_block, p->ola_window_size);

    p->input_buffer_frames = 0;
    p->input_buffer_final_frames = 0;
    p->input_buffer_added_silence = 0;
    size_t initial_size = 4 * MPMAX(p->ola_window_size, p->search_block_size);
    alloc_sample_buffer(p, &p->input_buffer, initial_size);

    MP_RESIZE_ARRAY(p, p->energy_candidate_blocks,
        p->channels * p->num_candidate_blocks);
}