mirror of https://github.com/mpv-player/mpv
845 lines
30 KiB
C
845 lines
30 KiB
C
#include <float.h>
|
|
#include <math.h>
|
|
|
|
#include "audio/chmap.h"
|
|
#include "audio/filter/af_scaletempo2_internals.h"
|
|
|
|
#include "config.h"
|
|
|
|
// Algorithm overview (from chromium):
|
|
// Waveform Similarity Overlap-and-add (WSOLA).
|
|
//
|
|
// One WSOLA iteration
|
|
//
|
|
// 1) Extract |target_block| as input frames at indices
|
|
// [|target_block_index|, |target_block_index| + |ola_window_size|).
|
|
// Note that |target_block| is the "natural" continuation of the output.
|
|
//
|
|
// 2) Extract |search_block| as input frames at indices
|
|
// [|search_block_index|,
|
|
// |search_block_index| + |num_candidate_blocks| + |ola_window_size|).
|
|
//
|
|
// 3) Find a block within the |search_block| that is most similar
|
|
// to |target_block|. Let |optimal_index| be the index of such block and
|
|
// write it to |optimal_block|.
|
|
//
|
|
// 4) Update:
|
|
// |optimal_block| = |transition_window| * |target_block| +
|
|
// (1 - |transition_window|) * |optimal_block|.
|
|
//
|
|
// 5) Overlap-and-add |optimal_block| to the |wsola_output|.
|
|
//
|
|
// 6) Update:write
|
|
|
|
struct interval {
|
|
int lo;
|
|
int hi;
|
|
};
|
|
|
|
static bool in_interval(int n, struct interval q)
|
|
{
|
|
return n >= q.lo && n <= q.hi;
|
|
}
|
|
|
|
static void alloc_sample_buffer(struct mp_scaletempo2 *p, float ***ptr, size_t size)
|
|
{
|
|
talloc_free(*ptr);
|
|
|
|
float **buff = talloc_array(p, float*, p->channels);
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
buff[i] = talloc_array(buff, float, size);
|
|
}
|
|
*ptr = buff;
|
|
}
|
|
|
|
static void zero_2d_partial(float **a, int x, int y)
|
|
{
|
|
for (int i = 0; i < x; ++i) {
|
|
memset(a[i], 0, sizeof(float) * y);
|
|
}
|
|
}
|
|
|
|
// Energies of sliding windows of channels are interleaved.
|
|
// The number windows is |input_frames| - (|frames_per_window| - 1), hence,
|
|
// the method assumes |energy| must be, at least, of size
|
|
// (|input_frames| - (|frames_per_window| - 1)) * |channels|.
|
|
static void multi_channel_moving_block_energies(
|
|
float **input, int input_frames, int channels,
|
|
int frames_per_block, float *energy)
|
|
{
|
|
int num_blocks = input_frames - (frames_per_block - 1);
|
|
|
|
for (int k = 0; k < channels; ++k) {
|
|
const float* input_channel = input[k];
|
|
|
|
energy[k] = 0;
|
|
|
|
// First block of channel |k|.
|
|
for (int m = 0; m < frames_per_block; ++m) {
|
|
energy[k] += input_channel[m] * input_channel[m];
|
|
}
|
|
|
|
const float* slide_out = input_channel;
|
|
const float* slide_in = input_channel + frames_per_block;
|
|
for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) {
|
|
energy[k + n * channels] = energy[k + (n - 1) * channels]
|
|
- *slide_out * *slide_out + *slide_in * *slide_in;
|
|
}
|
|
}
|
|
}
|
|
|
|
static float multi_channel_similarity_measure(
|
|
const float* dot_prod,
|
|
const float* energy_target, const float* energy_candidate,
|
|
int channels)
|
|
{
|
|
const float epsilon = 1e-12f;
|
|
float similarity_measure = 0.0f;
|
|
for (int n = 0; n < channels; ++n) {
|
|
similarity_measure += dot_prod[n] * energy_target[n]
|
|
/ sqrtf(energy_target[n] * energy_candidate[n] + epsilon);
|
|
}
|
|
return similarity_measure;
|
|
}
|
|
|
|
#if HAVE_VECTOR
|
|
|
|
typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
|
|
|
|
// Dot-product of channels of two AudioBus. For each AudioBus an offset is
|
|
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
|
|
// allocate sufficient space for |dot_product|.
|
|
static void multi_channel_dot_product(
|
|
float **a, int frame_offset_a,
|
|
float **b, int frame_offset_b,
|
|
int channels,
|
|
int num_frames, float *dot_product)
|
|
{
|
|
assert(frame_offset_a >= 0);
|
|
assert(frame_offset_b >= 0);
|
|
|
|
for (int k = 0; k < channels; ++k) {
|
|
const float* ch_a = a[k] + frame_offset_a;
|
|
const float* ch_b = b[k] + frame_offset_b;
|
|
float sum = 0.0;
|
|
if (num_frames < 32)
|
|
goto rest;
|
|
|
|
const v8sf *va = (const v8sf *) ch_a;
|
|
const v8sf *vb = (const v8sf *) ch_b;
|
|
v8sf vsum[4] = {
|
|
// Initialize to product of first 32 floats
|
|
va[0] * vb[0],
|
|
va[1] * vb[1],
|
|
va[2] * vb[2],
|
|
va[3] * vb[3],
|
|
};
|
|
va += 4;
|
|
vb += 4;
|
|
|
|
// Process `va` and `vb` across four vertical stripes
|
|
for (int n = 1; n < num_frames / 32; n++) {
|
|
vsum[0] += va[0] * vb[0];
|
|
vsum[1] += va[1] * vb[1];
|
|
vsum[2] += va[2] * vb[2];
|
|
vsum[3] += va[3] * vb[3];
|
|
va += 4;
|
|
vb += 4;
|
|
}
|
|
|
|
// Vertical sum across `vsum` entries
|
|
vsum[0] += vsum[1];
|
|
vsum[2] += vsum[3];
|
|
vsum[0] += vsum[2];
|
|
|
|
// Horizontal sum across `vsum[0]`, could probably be done better but
|
|
// this section is not super performance critical
|
|
float *vf = (float *) &vsum[0];
|
|
sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
|
|
ch_a = (const float *) va;
|
|
ch_b = (const float *) vb;
|
|
|
|
rest:
|
|
// Process the remainder
|
|
for (int n = 0; n < num_frames % 32; n++)
|
|
sum += *ch_a++ * *ch_b++;
|
|
|
|
dot_product[k] = sum;
|
|
}
|
|
}
|
|
|
|
#else // !HAVE_VECTOR
|
|
|
|
static void multi_channel_dot_product(
|
|
float **a, int frame_offset_a,
|
|
float **b, int frame_offset_b,
|
|
int channels,
|
|
int num_frames, float *dot_product)
|
|
{
|
|
assert(frame_offset_a >= 0);
|
|
assert(frame_offset_b >= 0);
|
|
|
|
for (int k = 0; k < channels; ++k) {
|
|
const float* ch_a = a[k] + frame_offset_a;
|
|
const float* ch_b = b[k] + frame_offset_b;
|
|
float sum = 0.0;
|
|
for (int n = 0; n < num_frames; n++)
|
|
sum += *ch_a++ * *ch_b++;
|
|
dot_product[k] = sum;
|
|
}
|
|
}
|
|
|
|
#endif // HAVE_VECTOR
|
|
|
|
// Fit the curve f(x) = a * x^2 + b * x + c such that
|
|
// f(-1) = y[0]
|
|
// f(0) = y[1]
|
|
// f(1) = y[2]
|
|
// and return the maximum, assuming that y[0] <= y[1] >= y[2].
|
|
static void quadratic_interpolation(
|
|
const float* y_values, float* extremum, float* extremum_value)
|
|
{
|
|
float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1];
|
|
float b = 0.5f * (y_values[2] - y_values[0]);
|
|
float c = y_values[1];
|
|
|
|
if (a == 0.f) {
|
|
// The coordinates are colinear (within floating-point error).
|
|
*extremum = 0;
|
|
*extremum_value = y_values[1];
|
|
} else {
|
|
*extremum = -b / (2.f * a);
|
|
*extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c;
|
|
}
|
|
}
|
|
|
|
// Search a subset of all candid blocks. The search is performed every
|
|
// |decimation| frames. This reduces complexity by a factor of about
|
|
// 1 / |decimation|. A cubic interpolation is used to have a better estimate of
|
|
// the best match.
|
|
static int decimated_search(
|
|
int decimation, struct interval exclude_interval,
|
|
float **target_block, int target_block_frames,
|
|
float **search_segment, int search_segment_frames,
|
|
int channels,
|
|
const float *energy_target_block, const float *energy_candidate_blocks)
|
|
{
|
|
int num_candidate_blocks = search_segment_frames - (target_block_frames - 1);
|
|
float dot_prod [MP_NUM_CHANNELS];
|
|
float similarity[3]; // Three elements for cubic interpolation.
|
|
|
|
int n = 0;
|
|
multi_channel_dot_product(
|
|
target_block, 0,
|
|
search_segment, n,
|
|
channels,
|
|
target_block_frames, dot_prod);
|
|
similarity[0] = multi_channel_similarity_measure(
|
|
dot_prod, energy_target_block,
|
|
&energy_candidate_blocks[n * channels], channels);
|
|
|
|
// Set the starting point as optimal point.
|
|
float best_similarity = similarity[0];
|
|
int optimal_index = 0;
|
|
|
|
n += decimation;
|
|
if (n >= num_candidate_blocks) {
|
|
return 0;
|
|
}
|
|
|
|
multi_channel_dot_product(
|
|
target_block, 0,
|
|
search_segment, n,
|
|
channels,
|
|
target_block_frames, dot_prod);
|
|
similarity[1] = multi_channel_similarity_measure(
|
|
dot_prod, energy_target_block,
|
|
&energy_candidate_blocks[n * channels], channels);
|
|
|
|
n += decimation;
|
|
if (n >= num_candidate_blocks) {
|
|
// We cannot do any more sampling. Compare these two values and return the
|
|
// optimal index.
|
|
return similarity[1] > similarity[0] ? decimation : 0;
|
|
}
|
|
|
|
for (; n < num_candidate_blocks; n += decimation) {
|
|
multi_channel_dot_product(
|
|
target_block, 0,
|
|
search_segment, n,
|
|
channels,
|
|
target_block_frames, dot_prod);
|
|
|
|
similarity[2] = multi_channel_similarity_measure(
|
|
dot_prod, energy_target_block,
|
|
&energy_candidate_blocks[n * channels], channels);
|
|
|
|
if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) ||
|
|
(similarity[1] >= similarity[0] && similarity[1] > similarity[2]))
|
|
{
|
|
// A local maximum is found. Do a cubic interpolation for a better
|
|
// estimate of candidate maximum.
|
|
float normalized_candidate_index;
|
|
float candidate_similarity;
|
|
quadratic_interpolation(similarity, &normalized_candidate_index,
|
|
&candidate_similarity);
|
|
|
|
int candidate_index = n - decimation
|
|
+ (int)(normalized_candidate_index * decimation + 0.5f);
|
|
if (candidate_similarity > best_similarity
|
|
&& !in_interval(candidate_index, exclude_interval)) {
|
|
optimal_index = candidate_index;
|
|
best_similarity = candidate_similarity;
|
|
}
|
|
} else if (n + decimation >= num_candidate_blocks &&
|
|
similarity[2] > best_similarity &&
|
|
!in_interval(n, exclude_interval))
|
|
{
|
|
// If this is the end-point and has a better similarity-measure than
|
|
// optimal, then we accept it as optimal point.
|
|
optimal_index = n;
|
|
best_similarity = similarity[2];
|
|
}
|
|
memmove(similarity, &similarity[1], 2 * sizeof(*similarity));
|
|
}
|
|
return optimal_index;
|
|
}
|
|
|
|
// Search [|low_limit|, |high_limit|] of |search_segment| to find a block that
|
|
// is most similar to |target_block|. |energy_target_block| is the energy of the
|
|
// |target_block|. |energy_candidate_blocks| is the energy of all blocks within
|
|
// |search_block|.
|
|
static int full_search(
|
|
int low_limit, int high_limit,
|
|
struct interval exclude_interval,
|
|
float **target_block, int target_block_frames,
|
|
float **search_block, int search_block_frames,
|
|
int channels,
|
|
const float* energy_target_block,
|
|
const float* energy_candidate_blocks)
|
|
{
|
|
// int block_size = target_block->frames;
|
|
float dot_prod [sizeof(float) * MP_NUM_CHANNELS];
|
|
|
|
float best_similarity = -FLT_MAX;//FLT_MIN;
|
|
int optimal_index = 0;
|
|
|
|
for (int n = low_limit; n <= high_limit; ++n) {
|
|
if (in_interval(n, exclude_interval)) {
|
|
continue;
|
|
}
|
|
multi_channel_dot_product(target_block, 0, search_block, n, channels,
|
|
target_block_frames, dot_prod);
|
|
|
|
float similarity = multi_channel_similarity_measure(
|
|
dot_prod, energy_target_block,
|
|
&energy_candidate_blocks[n * channels], channels);
|
|
|
|
if (similarity > best_similarity) {
|
|
best_similarity = similarity;
|
|
optimal_index = n;
|
|
}
|
|
}
|
|
|
|
return optimal_index;
|
|
}
|
|
|
|
// Find the index of the block, within |search_block|, that is most similar
|
|
// to |target_block|. Obviously, the returned index is w.r.t. |search_block|.
|
|
// |exclude_interval| is an interval that is excluded from the search.
|
|
static int compute_optimal_index(
|
|
float **search_block, int search_block_frames,
|
|
float **target_block, int target_block_frames,
|
|
float *energy_candidate_blocks,
|
|
int channels,
|
|
struct interval exclude_interval)
|
|
{
|
|
int num_candidate_blocks = search_block_frames - (target_block_frames - 1);
|
|
|
|
// This is a compromise between complexity reduction and search accuracy. I
|
|
// don't have a proof that down sample of order 5 is optimal.
|
|
// One can compute a decimation factor that minimizes complexity given
|
|
// the size of |search_block| and |target_block|. However, my experiments
|
|
// show the rate of missing the optimal index is significant.
|
|
// This value is chosen heuristically based on experiments.
|
|
const int search_decimation = 5;
|
|
|
|
float energy_target_block [MP_NUM_CHANNELS];
|
|
// energy_candidate_blocks must have at least size
|
|
// sizeof(float) * channels * num_candidate_blocks
|
|
|
|
// Energy of all candid frames.
|
|
multi_channel_moving_block_energies(
|
|
search_block,
|
|
search_block_frames,
|
|
channels,
|
|
target_block_frames,
|
|
energy_candidate_blocks);
|
|
|
|
// Energy of target frame.
|
|
multi_channel_dot_product(
|
|
target_block, 0,
|
|
target_block, 0,
|
|
channels,
|
|
target_block_frames, energy_target_block);
|
|
|
|
int optimal_index = decimated_search(
|
|
search_decimation, exclude_interval,
|
|
target_block, target_block_frames,
|
|
search_block, search_block_frames,
|
|
channels,
|
|
energy_target_block,
|
|
energy_candidate_blocks);
|
|
|
|
int lim_low = MPMAX(0, optimal_index - search_decimation);
|
|
int lim_high = MPMIN(num_candidate_blocks - 1,
|
|
optimal_index + search_decimation);
|
|
return full_search(
|
|
lim_low, lim_high, exclude_interval,
|
|
target_block, target_block_frames,
|
|
search_block, search_block_frames,
|
|
channels,
|
|
energy_target_block, energy_candidate_blocks);
|
|
}
|
|
|
|
static void peek_buffer(struct mp_scaletempo2 *p,
|
|
int frames, int read_offset, int write_offset, float **dest)
|
|
{
|
|
assert(p->input_buffer_frames >= frames);
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
memcpy(dest[i] + write_offset,
|
|
p->input_buffer[i] + read_offset,
|
|
frames * sizeof(float));
|
|
}
|
|
}
|
|
|
|
static void seek_buffer(struct mp_scaletempo2 *p, int frames)
|
|
{
|
|
assert(p->input_buffer_frames >= frames);
|
|
p->input_buffer_frames -= frames;
|
|
if (p->input_buffer_final_frames > 0) {
|
|
p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
|
|
}
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
memmove(p->input_buffer[i], p->input_buffer[i] + frames,
|
|
p->input_buffer_frames * sizeof(float));
|
|
}
|
|
}
|
|
|
|
static int write_completed_frames_to(struct mp_scaletempo2 *p,
|
|
int requested_frames, int dest_offset, float **dest)
|
|
{
|
|
int rendered_frames = MPMIN(p->num_complete_frames, requested_frames);
|
|
|
|
if (rendered_frames == 0)
|
|
return 0; // There is nothing to read from |wsola_output|, return.
|
|
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
memcpy(dest[i] + dest_offset, p->wsola_output[i],
|
|
rendered_frames * sizeof(float));
|
|
}
|
|
|
|
// Remove the frames which are read.
|
|
int frames_to_move = p->wsola_output_size - rendered_frames;
|
|
for (int k = 0; k < p->channels; ++k) {
|
|
float *ch = p->wsola_output[k];
|
|
memmove(ch, &ch[rendered_frames], sizeof(*ch) * frames_to_move);
|
|
}
|
|
p->num_complete_frames -= rendered_frames;
|
|
return rendered_frames;
|
|
}
|
|
|
|
// next output_time for the given playback_rate
|
|
static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
return p->output_time + p->ola_hop_size * playback_rate;
|
|
}
|
|
|
|
// search_block_index for the given output_time
|
|
static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
|
|
{
|
|
return (int)(output_time - p->search_block_center_offset + 0.5);
|
|
}
|
|
|
|
// number of frames needed until a wsola iteration can be performed
|
|
static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
int search_block_index =
|
|
get_search_block_index(p, get_updated_time(p, playback_rate));
|
|
return MPMAX(0, MPMAX(
|
|
p->target_block_index + p->ola_window_size - p->input_buffer_frames,
|
|
search_block_index + p->search_block_size - p->input_buffer_frames));
|
|
}
|
|
|
|
static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
return frames_needed(p, playback_rate) <= 0;
|
|
}
|
|
|
|
// pad end with silence until a wsola iteration can be performed
|
|
static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
int needed = frames_needed(p, playback_rate);
|
|
if (needed <= 0)
|
|
return; // no silence needed for iteration
|
|
|
|
int last_index = needed + p->input_buffer_frames - 1;
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
MP_TARRAY_GROW(p, p->input_buffer[i], last_index);
|
|
float *ch_input = p->input_buffer[i];
|
|
for (int j = 0; j < needed; ++j) {
|
|
ch_input[p->input_buffer_frames + j] = 0.0f;
|
|
}
|
|
}
|
|
|
|
p->input_buffer_added_silence += needed;
|
|
p->input_buffer_frames += needed;
|
|
}
|
|
|
|
void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
|
|
{
|
|
if (p->input_buffer_final_frames <= 0) {
|
|
p->input_buffer_final_frames = p->input_buffer_frames;
|
|
}
|
|
}
|
|
|
|
int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
|
|
uint8_t **planes, int frame_size, double playback_rate)
|
|
{
|
|
int needed = frames_needed(p, playback_rate);
|
|
int read = MPMIN(needed, frame_size);
|
|
if (read == 0)
|
|
return 0;
|
|
|
|
int last_index = read + p->input_buffer_frames - 1;
|
|
for (int i = 0; i < p->channels; ++i) {
|
|
MP_TARRAY_GROW(p, p->input_buffer[i], last_index);
|
|
memcpy(p->input_buffer[i] + p->input_buffer_frames,
|
|
planes[i], read * sizeof(float));
|
|
}
|
|
|
|
p->input_buffer_frames += read;
|
|
return read;
|
|
}
|
|
|
|
static bool target_is_within_search_region(struct mp_scaletempo2 *p)
|
|
{
|
|
return p->target_block_index >= p->search_block_index
|
|
&& p->target_block_index + p->ola_window_size
|
|
<= p->search_block_index + p->search_block_size;
|
|
}
|
|
|
|
|
|
static void peek_audio_with_zero_prepend(struct mp_scaletempo2 *p,
|
|
int read_offset_frames, float **dest, int dest_frames)
|
|
{
|
|
assert(read_offset_frames + dest_frames <= p->input_buffer_frames);
|
|
|
|
int write_offset = 0;
|
|
int num_frames_to_read = dest_frames;
|
|
if (read_offset_frames < 0) {
|
|
int num_zero_frames_appended = MPMIN(
|
|
-read_offset_frames, num_frames_to_read);
|
|
read_offset_frames = 0;
|
|
num_frames_to_read -= num_zero_frames_appended;
|
|
write_offset = num_zero_frames_appended;
|
|
zero_2d_partial(dest, p->channels, num_zero_frames_appended);
|
|
}
|
|
peek_buffer(p, num_frames_to_read, read_offset_frames, write_offset, dest);
|
|
}
|
|
|
|
static void get_optimal_block(struct mp_scaletempo2 *p)
|
|
{
|
|
int optimal_index = 0;
|
|
|
|
// An interval around last optimal block which is excluded from the search.
|
|
// This is to reduce the buzzy sound. The number 160 is rather arbitrary and
|
|
// derived heuristically.
|
|
const int exclude_interval_length_frames = 160;
|
|
if (target_is_within_search_region(p)) {
|
|
optimal_index = p->target_block_index;
|
|
peek_audio_with_zero_prepend(p,
|
|
optimal_index, p->optimal_block, p->ola_window_size);
|
|
} else {
|
|
peek_audio_with_zero_prepend(p,
|
|
p->target_block_index, p->target_block, p->ola_window_size);
|
|
peek_audio_with_zero_prepend(p,
|
|
p->search_block_index, p->search_block, p->search_block_size);
|
|
int last_optimal = p->target_block_index
|
|
- p->ola_hop_size - p->search_block_index;
|
|
struct interval exclude_iterval = {
|
|
.lo = last_optimal - exclude_interval_length_frames / 2,
|
|
.hi = last_optimal + exclude_interval_length_frames / 2
|
|
};
|
|
|
|
// |optimal_index| is in frames and it is relative to the beginning of the
|
|
// |search_block|.
|
|
optimal_index = compute_optimal_index(
|
|
p->search_block, p->search_block_size,
|
|
p->target_block, p->ola_window_size,
|
|
p->energy_candidate_blocks,
|
|
p->channels,
|
|
exclude_iterval);
|
|
|
|
// Translate |index| w.r.t. the beginning of |audio_buffer| and extract the
|
|
// optimal block.
|
|
optimal_index += p->search_block_index;
|
|
peek_audio_with_zero_prepend(p,
|
|
optimal_index, p->optimal_block, p->ola_window_size);
|
|
|
|
// Make a transition from target block to the optimal block if different.
|
|
// Target block has the best continuation to the current output.
|
|
// Optimal block is the most similar block to the target, however, it might
|
|
// introduce some discontinuity when over-lap-added. Therefore, we combine
|
|
// them for a smoother transition. The length of transition window is twice
|
|
// as that of the optimal-block which makes it like a weighting function
|
|
// where target-block has higher weight close to zero (weight of 1 at index
|
|
// 0) and lower weight close the end.
|
|
for (int k = 0; k < p->channels; ++k) {
|
|
float* ch_opt = p->optimal_block[k];
|
|
float* ch_target = p->target_block[k];
|
|
for (int n = 0; n < p->ola_window_size; ++n) {
|
|
ch_opt[n] = ch_opt[n] * p->transition_window[n]
|
|
+ ch_target[n] * p->transition_window[p->ola_window_size + n];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Next target is one hop ahead of the current optimal.
|
|
p->target_block_index = optimal_index + p->ola_hop_size;
|
|
}
|
|
|
|
static void set_output_time(struct mp_scaletempo2 *p, double output_time)
|
|
{
|
|
p->output_time = output_time;
|
|
p->search_block_index = get_search_block_index(p, output_time);
|
|
}
|
|
|
|
static void remove_old_input_frames(struct mp_scaletempo2 *p)
|
|
{
|
|
const int earliest_used_index = MPMIN(
|
|
p->target_block_index, p->search_block_index);
|
|
if (earliest_used_index <= 0)
|
|
return; // Nothing to remove.
|
|
|
|
// Remove frames from input and adjust indices accordingly.
|
|
seek_buffer(p, earliest_used_index);
|
|
p->target_block_index -= earliest_used_index;
|
|
p->output_time -= earliest_used_index;
|
|
p->search_block_index -= earliest_used_index;
|
|
}
|
|
|
|
static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
if (!can_perform_wsola(p, playback_rate)) {
|
|
return false;
|
|
}
|
|
|
|
set_output_time(p, get_updated_time(p, playback_rate));
|
|
remove_old_input_frames(p);
|
|
|
|
assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);
|
|
|
|
get_optimal_block(p);
|
|
|
|
// Overlap-and-add.
|
|
for (int k = 0; k < p->channels; ++k) {
|
|
float* ch_opt_frame = p->optimal_block[k];
|
|
float* ch_output = p->wsola_output[k] + p->num_complete_frames;
|
|
if (p->wsola_output_started) {
|
|
for (int n = 0; n < p->ola_hop_size; ++n) {
|
|
ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
|
|
ch_opt_frame[n] * p->ola_window[n];
|
|
}
|
|
|
|
// Copy the second half to the output.
|
|
memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
|
|
sizeof(*ch_opt_frame) * p->ola_hop_size);
|
|
} else {
|
|
// No overlap for the first iteration.
|
|
memcpy(ch_output, ch_opt_frame,
|
|
sizeof(*ch_opt_frame) * p->ola_window_size);
|
|
}
|
|
}
|
|
|
|
p->num_complete_frames += p->ola_hop_size;
|
|
p->wsola_output_started = true;
|
|
return true;
|
|
}
|
|
|
|
static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest)
|
|
{
|
|
int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index);
|
|
|
|
if (frames_to_copy <= 0)
|
|
return 0; // There is nothing to read from input buffer; return.
|
|
|
|
peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest);
|
|
seek_buffer(p, frames_to_copy);
|
|
return frames_to_copy;
|
|
}
|
|
|
|
int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
|
|
float **dest, int dest_size, double playback_rate)
|
|
{
|
|
if (playback_rate == 0) return 0;
|
|
|
|
if (p->input_buffer_final_frames > 0) {
|
|
add_input_buffer_final_silence(p, playback_rate);
|
|
}
|
|
|
|
// Optimize the muted case to issue a single clear instead of performing
|
|
// the full crossfade and clearing each crossfaded frame.
|
|
if (playback_rate < p->opts->min_playback_rate
|
|
|| (playback_rate > p->opts->max_playback_rate
|
|
&& p->opts->max_playback_rate > 0))
|
|
{
|
|
int frames_to_render = MPMIN(dest_size,
|
|
(int) (p->input_buffer_frames / playback_rate));
|
|
|
|
// Compute accurate number of frames to actually skip in the source data.
|
|
// Includes the leftover partial frame from last request. However, we can
|
|
// only skip over complete frames, so a partial frame may remain for next
|
|
// time.
|
|
p->muted_partial_frame += frames_to_render * playback_rate;
|
|
int seek_frames = (int) (p->muted_partial_frame);
|
|
zero_2d_partial(dest, p->channels, frames_to_render);
|
|
seek_buffer(p, seek_frames);
|
|
|
|
// Determine the partial frame that remains to be skipped for next call. If
|
|
// the user switches back to playing, it may be off time by this partial
|
|
// frame, which would be undetectable. If they subsequently switch to
|
|
// another playback rate that mutes, the code will attempt to line up the
|
|
// frames again.
|
|
p->muted_partial_frame -= seek_frames;
|
|
return frames_to_render;
|
|
}
|
|
|
|
int slower_step = (int) ceilf(p->ola_window_size * playback_rate);
|
|
int faster_step = (int) ceilf(p->ola_window_size / playback_rate);
|
|
|
|
// Optimize the most common |playback_rate| ~= 1 case to use a single copy
|
|
// instead of copying frame by frame.
|
|
if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) {
|
|
|
|
if (p->wsola_output_started) {
|
|
p->wsola_output_started = false;
|
|
|
|
// sync audio precisely again
|
|
set_output_time(p, p->target_block_index);
|
|
remove_old_input_frames(p);
|
|
}
|
|
|
|
return read_input_buffer(p, dest_size, dest);
|
|
}
|
|
|
|
int rendered_frames = 0;
|
|
do {
|
|
rendered_frames += write_completed_frames_to(p,
|
|
dest_size - rendered_frames, rendered_frames, dest);
|
|
} while (rendered_frames < dest_size
|
|
&& run_one_wsola_iteration(p, playback_rate));
|
|
return rendered_frames;
|
|
}
|
|
|
|
double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
return p->input_buffer_frames - p->output_time
|
|
- p->input_buffer_added_silence
|
|
+ p->num_complete_frames * playback_rate;
|
|
}
|
|
|
|
bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
|
|
{
|
|
return (p->input_buffer_final_frames > p->target_block_index &&
|
|
p->input_buffer_final_frames > 0)
|
|
|| can_perform_wsola(p, playback_rate)
|
|
|| p->num_complete_frames > 0;
|
|
}
|
|
|
|
void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
|
|
{
|
|
p->input_buffer_frames = 0;
|
|
p->input_buffer_final_frames = 0;
|
|
p->input_buffer_added_silence = 0;
|
|
p->output_time = 0.0;
|
|
p->search_block_index = 0;
|
|
p->target_block_index = 0;
|
|
p->num_complete_frames = 0;
|
|
p->wsola_output_started = false;
|
|
}
|
|
|
|
// Return a "periodic" Hann window. This is the first L samples of an L+1
|
|
// Hann window. It is perfect reconstruction for overlap-and-add.
|
|
static void get_symmetric_hanning_window(int window_length, float* window)
|
|
{
|
|
const float scale = 2.0f * M_PI / window_length;
|
|
for (int n = 0; n < window_length; ++n)
|
|
window[n] = 0.5f * (1.0f - cosf(n * scale));
|
|
}
|
|
|
|
|
|
void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
|
|
{
|
|
p->muted_partial_frame = 0;
|
|
p->output_time = 0;
|
|
p->search_block_index = 0;
|
|
p->target_block_index = 0;
|
|
p->num_complete_frames = 0;
|
|
p->wsola_output_started = false;
|
|
p->channels = channels;
|
|
|
|
p->samples_per_second = rate;
|
|
p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
|
|
* p->samples_per_second / 1000);
|
|
p->ola_window_size = (int)(p->opts->ola_window_size_ms
|
|
* p->samples_per_second / 1000);
|
|
// Make sure window size in an even number.
|
|
p->ola_window_size += p->ola_window_size & 1;
|
|
p->ola_hop_size = p->ola_window_size / 2;
|
|
// |num_candidate_blocks| / 2 is the offset of the center of the search
|
|
// block to the center of the first (left most) candidate block. The offset
|
|
// of the center of a candidate block to its left most point is
|
|
// |ola_window_size| / 2 - 1. Note that |ola_window_size| is even and in
|
|
// our convention the center belongs to the left half, so we need to subtract
|
|
// one frame to get the correct offset.
|
|
//
|
|
// Search Block
|
|
// <------------------------------------------->
|
|
//
|
|
// |ola_window_size| / 2 - 1
|
|
// <----
|
|
//
|
|
// |num_candidate_blocks| / 2
|
|
// <----------------
|
|
// center
|
|
// X----X----------------X---------------X-----X
|
|
// <----------> <---------->
|
|
// Candidate ... Candidate
|
|
// 1, ... |num_candidate_blocks|
|
|
p->search_block_center_offset = p->num_candidate_blocks / 2
|
|
+ (p->ola_window_size / 2 - 1);
|
|
MP_RESIZE_ARRAY(p, p->ola_window, p->ola_window_size);
|
|
get_symmetric_hanning_window(p->ola_window_size, p->ola_window);
|
|
MP_RESIZE_ARRAY(p, p->transition_window, p->ola_window_size * 2);
|
|
get_symmetric_hanning_window(2 * p->ola_window_size, p->transition_window);
|
|
|
|
p->wsola_output_size = p->ola_window_size + p->ola_hop_size;
|
|
alloc_sample_buffer(p, &p->wsola_output, p->wsola_output_size);
|
|
|
|
// Auxiliary containers.
|
|
alloc_sample_buffer(p, &p->optimal_block, p->ola_window_size);
|
|
p->search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1);
|
|
alloc_sample_buffer(p, &p->search_block, p->search_block_size);
|
|
alloc_sample_buffer(p, &p->target_block, p->ola_window_size);
|
|
|
|
p->input_buffer_frames = 0;
|
|
p->input_buffer_final_frames = 0;
|
|
p->input_buffer_added_silence = 0;
|
|
size_t initial_size = 4 * MPMAX(p->ola_window_size, p->search_block_size);
|
|
alloc_sample_buffer(p, &p->input_buffer, initial_size);
|
|
|
|
MP_RESIZE_ARRAY(p, p->energy_candidate_blocks,
|
|
p->channels * p->num_candidate_blocks);
|
|
}
|