libabigail/include/abg-diff-utils.h
Dodji Seketeli fbb6b1bc73 Initial support for diffing ABI corpus files
* include/abg-comparison.h (string_function_ptr_map)
	(changed_function_ptr, string_changed_function_ptr_map)
	(corpus_diff_sptr): New convenience typedefs.
	(translation_unit_diff): Add comments.
	(class corpus_diff): New type.
	(compute_diff): New overload for corpus_diff.
	* include/abg-corpus.h (corpus::{functions, variables}): New
	typedefs.
	(corpus::{operator==, get_functions, get_variables}): New members.
	* include/abg-diff-utils.h (struct deep_ptr_eq_functor): New
	functor.
	* include/abg-ir.h (translation_unit::operator==): New member
	equality operator.
	* src/abg-comparison.cc (struct corpus_diff::priv): New private
	struct holding the private members of corpus_diff.
	(corpus_diff::priv::{lookup_tables_empty, clear_lookup_tables,
	ensure_lookup_tables_populated}): Define new private member functions.
	(corpus_diff::{corpus_diff, first_corpus, second_corpus,
	function_changes, variable_changes, length, report}): New public members.
	(struct noop_deleter): New struct.
	(compute_diff): New implementation for corpus_diff.
	* src/abg-corpus.cc (struct corpus::priv): Renamed corpus::impl
	into this.  Add new fns, vars and is_symbol_table_built data
	members.
	(corpus::priv::build_symbol_table): New member function.
	(class symtab_build_visitor_type): New visitor type to build the
	symbol table.
	(struct func_comp, struct var_comp): New comparison functors.
	(corpus::priv::build_symbol_table): Define new member function.
	(corpus::{corpus, add, get_translation_units, operator==,
	get_functions, get_variables}): Define new members.
	* src/abg-ir.cc (translation_unit::operator==): Define new member
	equality operator.
	(operator==(translation_unit_sptr l, translation_unit_sptr r)):
	Define new equality operator.
	* tools/abg-tools-utils.h (enum file_type): New enum.
	(guess_file_type): Declare new function.
	* tools/abg-tools-utils.cc (guess_file_type): define new function.
	* tools/bidiff.cc (main): Guess the type of the files given in
	input and support elf files reading and diffing.

Signed-off-by: Dodji Seketeli <dodji@redhat.com>
2013-12-23 14:05:19 +01:00

2007 lines
59 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// -*- Mode: C++ -*-
//
// Copyright (C) 2013 Red Hat, Inc.
//
// This file is part of the GNU Application Binary Interface Generic
// Analysis and Instrumentation Library (libabigail). This library is
// free software; you can redistribute it and/or modify it under the
// terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3, or (at your option) any
// later version.
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Lesser Public License for more details.
// You should have received a copy of the GNU Lesser General Public
// License along with this program; see the file COPYING-LGPLV3. If
// not, see <http://www.gnu.org/licenses/>.
/// @file
///
/// This file declares types and operations implementing the "O(ND)
/// Difference Algorithm" (aka diff2) from Eugene W. Myers, to compute
/// the difference between two sequences.
///
/// To understand what is going on here, one must read the paper at
/// http://www.xmailserver.org/diff2.pdf. Throughout this file, that
/// paper is referred to as "the paper".
///
/// The implementation goes as far as calculating the shortest edit
/// script (the set of insertions and deletions) for transforming a
/// sequence into another. The main entry point for that is the
/// compute_diff() function.
#include <stdexcept>
#include <cassert>
#include <cstdlib>
#include <ostream>
#include <string>
#include <vector>
#include <sstream>
namespace abigail
{
namespace diff_utils
{
// Inject the names from std:: below into this namespace
using std::string;
using std::ostream;
using std::vector;
using std::abs;
using std::ostringstream;
/// A class representing a vertex in an edit graph, as explained in
/// the paper.  A vertex is basically a pair of coordinates (abscissa
/// and ordinate).
///
/// A point also carries an "empty" flag.  A default-constructed or
/// cleared point is empty and has coordinates (-1,-1).  Setting a
/// coordinate through x(int), y(int) or set(int, int) makes the point
/// non-empty.
class point
{
  int x_;
  int y_;
  bool empty_;

public:
  /// Build an empty point, with coordinates (-1,-1).
  point()
    : x_(-1), y_(-1), empty_(true)
  {}

  /// Build a non-empty point from its coordinates.
  ///
  /// @param x the abscissa.
  ///
  /// @param y the ordinate.
  point(int x, int y)
    : x_(x), y_(y), empty_(false)
  {}

  /// Copy constructor.  The emptiness flag is copied as well.
  point(const point& p)
    : x_(p.x()), y_(p.y()), empty_(p.is_empty())
  {}

  /// @return the abscissa of the point.
  int
  x() const
  {return x_;}

  /// Set the abscissa; the point becomes non-empty.
  void
  x(int x)
  {
    x_ = x;
    empty_ = false;
  }

  /// @return the ordinate of the point.
  int
  y() const
  {return y_;}

  /// Set the ordinate; the point becomes non-empty.
  void
  y(int y)
  {
    y_ = y;
    empty_ = false;
  }

  /// Set both coordinates; the point becomes non-empty.
  void
  set(int x, int y)
  {
    x_ = x;
    y_ = y;
    empty_ = false;
  }

  /// Set both coordinates and the emptiness flag explicitly.
  void
  set(int x, int y, bool empty)
  {
    x_ = x;
    y_ = y;
    empty_ = empty;
  }

  /// Translate the point by (ax, ay).  As this goes through
  /// set(int, int), the point becomes non-empty.
  void
  add(int ax, int ay)
  {set (x() + ax, y() + ay);}

  /// Two points differ if any coordinate or the emptiness flag
  /// differs.
  bool
  operator!=(const point& o) const
  {return (x() != o.x() || y() != o.y() || is_empty() != o.is_empty());}

  bool
  operator==(const point& o) const
  {return !(operator!=(o));}

  // The four ordering operators below require the relation to hold
  // on *both* coordinates.  Two points can thus be unordered with
  // respect to each other.

  bool
  operator<(const point& o) const
  {return (x() < o.x() && y() < o.y());}

  bool
  operator>(const point& o) const
  {return (x() > o.x() && y() > o.y());}

  bool
  operator<=(const point& o) const
  {return (x() <= o.x() && y() <= o.y());}

  bool
  operator>=(const point& o) const
  {return (x() >= o.x() && y() >= o.y());}

  /// @return a new point moved by @p val along the diagonal.
  point
  operator+(int val) const
  {return point(x() + val, y() + val);}

  /// @return a new point moved by -@p val along the diagonal.
  point
  operator-(int val) const
  {return point(x() - val, y() - val);}

  /// Move this point by @p val along the diagonal.
  point&
  operator+= (int val)
  {
    set(x_ + val, y_ + val);
    return *this;
  }

  point&
  operator-= (int val)
  {return (*this) += (-val);}

  /// Prefix decrement: move one step back along the diagonal.
  point&
  operator--()
  {return (*this) -= 1;}

  /// Prefix increment: move one step forward along the diagonal.
  point&
  operator++()
  {return (*this) += 1;}

  /// Postfix decrement.
  ///
  /// @return the value of the point before it was decremented.
  point
  operator--(int)
  {
    point tmp(*this);
    // Delegate to the *prefix* form.  Writing (*this)-- here would
    // call this very function again and recurse without end.
    --(*this);
    return tmp;
  }

  /// Postfix increment.
  ///
  /// @return the value of the point before it was incremented.
  point
  operator++(int)
  {
    point tmp(*this);
    // Delegate to the *prefix* form.  Writing (*this)++ here would
    // call this very function again and recurse without end.
    ++(*this);
    return tmp;
  }

  /// Set both coordinates to @p val; the point becomes non-empty.
  point&
  operator=(int val)
  {
    set(val, val);
    return *this;
  }

  /// Copy assignment.  The emptiness flag is copied as well.
  point&
  operator=(const point& p)
  {
    set(p.x(), p.y(), p.is_empty());
    return *this;
  }

  /// @return true iff the point is empty.
  bool
  is_empty() const
  {return empty_;}

  /// @return true iff the point is not empty.
  operator bool () const
  {return !is_empty();}

  /// @return true iff the point is empty.
  bool
  operator!() const
  {return is_empty();}

  /// Reset the point to the empty state, with coordinates (-1,-1).
  void
  clear()
  {
    x_ = -1;
    y_ = -1;
    empty_ = true;
  }
};// end point
/// The abstraction of the Snake concept, from the paper.
///
/// In a given path of the edit graph, a snake is a non-diagonal edge
/// followed by zero or more diagonal edges.  A snake is described by
/// four points:
///
///  - begin(): the starting point of the non-diagonal edge.  The
///    paper does not reference this point explicitly, but the
///    implementation needs it.
///
///  - intermediate(): the end point of the non-diagonal edge.  This
///    is what the paper calls "the beginning of the snake".
///
///  - diagonal_start(): the end point of the first diagonal edge.  It
///    is an empty point (point::is_empty() returns true) when the
///    snake has no diagonal edge.
///
///  - end(): the end point of the last diagonal edge.  When the snake
///    has no diagonal edge, this is the same point as intermediate().
class snake
{
  point begin_, intermediate_, diagonal_start_, end_;
  bool forward_;

public:

  /// Default constructor.  All the points are empty and the snake is
  /// not flagged as a forward one.
  snake()
    : forward_(false)
  {}

  /// Build a snake that has no diagonal start point.
  ///
  /// @param b the beginning point of the snake, i.e, the starting
  /// point of the non-diagonal edge.
  ///
  /// @param i the intermediate point of the snake, i.e, the end point
  /// of the non-diagonal edge.
  ///
  /// @param e the end point of the snake, i.e, the end point of the
  /// last diagonal edge.
  snake(const point& b,
	const point& i,
	const point& e)
    : begin_(b),
      intermediate_(i),
      end_(e),
      forward_(false)
  {}

  /// Build a snake from its four points.
  ///
  /// @param b the beginning point of the snake, i.e, the starting
  /// point of the non-diagonal edge.
  ///
  /// @param i the intermediate point of the snake, i.e, the end point
  /// of the non-diagonal edge.
  ///
  /// @param d the diagonal start point, i.e, the end point of the
  /// first diagonal edge of the snake.
  ///
  /// @param e the end point of the snake, i.e, the end point of the
  /// last diagonal edge.
  snake(const point& b,
	const point& i,
	const point& d,
	const point& e)
    : begin_(b),
      intermediate_(i),
      diagonal_start_(d),
      end_(e),
      forward_(false)
  {}

  /// Getter for the begin point.
  ///
  /// @return the starting point of the non-diagonal edge of the snake.
  const point&
  begin() const
  {
    return begin_;
  }

  /// Setter for the begin point, i.e, the starting point of the
  /// non-diagonal edge of the snake.
  ///
  /// @param p the new begin point.
  void
  begin(const point& p)
  {
    begin_ = p;
  }

  /// Getter for the intermediate point.
  ///
  /// @return the end point of the non-diagonal edge of the snake.
  const point&
  intermediate() const
  {
    return intermediate_;
  }

  /// Setter for the intermediate point, i.e, the end point of the
  /// non-diagonal edge of the snake.
  ///
  /// @param p the new intermediate point.
  void
  intermediate(const point& p)
  {
    intermediate_ = p;
  }

  /// Getter for the diagonal start point.  It is empty when the snake
  /// has no diagonal edge.
  ///
  /// @return the end point of the first diagonal edge.
  const point&
  diagonal_start() const
  {
    return diagonal_start_;
  }

  /// Setter for the diagonal start point, i.e, the end point of the
  /// first diagonal edge.
  ///
  /// @param p the new diagonal start point.
  void
  diagonal_start(const point& p)
  {
    diagonal_start_ = p;
  }

  /// Getter for the end point of the snake.  When the snake has no
  /// diagonal edge, it equals the intermediate point.
  ///
  /// @return the end point of the last diagonal edge.
  const point&
  end() const
  {
    return end_;
  }

  /// Setter for the end point of the snake, i.e, the end point of
  /// the last diagonal edge.
  ///
  /// @param p the new end point.
  void
  end(const point& p)
  {
    end_ = p;
  }

  /// Set the begin, intermediate and end points in one go.  The
  /// diagonal start point is left as it was.
  ///
  /// @param b the new begin point.
  ///
  /// @param i the new intermediate point.
  ///
  /// @param e the new end point.
  void
  set(const point& b, const point& i, const point& e)
  {
    begin_ = b;
    intermediate_ = i;
    end_ = e;
  }

  /// Set the four points of the snake in one go.
  ///
  /// @param b the new begin point.
  ///
  /// @param i the new intermediate point.
  ///
  /// @param d the new diagonal start point.
  ///
  /// @param e the new end point.
  void
  set(const point& b, const point& i, const point& d, const point& e)
  {
    begin_ = b;
    intermediate_ = i;
    diagonal_start_ = d;
    end_ = e;
  }

  /// @return true iff the snake is flagged as a forward snake, that
  /// is, one built while walking the edit graph from the top left
  /// corner to the bottom right corner.
  bool
  is_forward() const
  {
    return forward_;
  }

  /// Flag the snake as being a forward snake or not.
  ///
  /// @param f true if the snake was built while walking the edit
  /// graph forward (from the top left corner to the bottom right
  /// corner), false otherwise.
  void
  set_forward(bool f)
  {
    forward_ = f;
  }

  /// Translate all the points of the snake.  Nothing happens if the
  /// snake is empty; an empty diagonal start point stays empty.
  ///
  /// @param x_offset the offset to add to the abscissas of the points
  /// of the snake.
  ///
  /// @param y_offset the offset to add to the ordinates of the points
  /// of the snake.
  void
  add(int x_offset, int y_offset)
  {
    if (!is_empty())
      {
	begin_.add(x_offset, y_offset);
	intermediate_.add(x_offset, y_offset);
	if (!diagonal_start_.is_empty())
	  diagonal_start_.add(x_offset, y_offset);
	end_.add(x_offset, y_offset);
      }
  }

  /// @return true iff the snake has at least one diagonal edge, i.e,
  /// iff its diagonal start point is not empty.
  bool
  has_diagonal_edge() const
  {
    return !diagonal_start_.is_empty();
  }

  /// @return true iff the non-diagonal edge is horizontal, i.e, iff
  /// the begin and intermediate points have the same ordinate.
  bool
  has_horizontal_edge() const
  {
    return begin_.y() == intermediate_.y();
  }

  /// @return true iff the non-diagonal edge is vertical, i.e, iff the
  /// begin and intermediate points have the same abscissa.
  bool
  has_vertical_edge() const
  {
    return begin_.x() == intermediate_.x();
  }

  /// @return true iff the begin, intermediate and end points are all
  /// empty.  The diagonal start point is not looked at.
  bool
  is_empty() const
  {
    return (begin_.is_empty()
	    && intermediate_.is_empty()
	    && end_.is_empty());
  }
};// end class snake
/// The array containing the furthest D-path end-points, for each value
/// of K.  MAX_D is the maximum value of the D-Path.  That is, M+N if
/// M is the size of the first input string, and N is the size of the
/// second.
///
/// Elements are addressed with *signed* diagonal numbers: index k is
/// stored at position k + offset() of the underlying vector.  Any
/// access outside of the underlying storage throws
/// std::out_of_range.
class d_path_vec : public std::vector<int>
{
private:

  unsigned a_size_;
  unsigned b_size_;

  /// Forbid vector size modifications
  void
  push_back(const std::vector<int>::value_type&);

  /// Forbid default constructor.
  d_path_vec();

  /// Test if a signed index falls outside of the underlying storage,
  /// either below its first element or past its last one.
  ///
  /// @param index the signed (diagonal) index to test.
  ///
  /// @return true iff @p index cannot be addressed.
  bool
  over_bounds(long long index) const
  {
    const long long i = index + static_cast<long long>(offset());
    return i < 0 || i >= static_cast<long long>(size());
  }

  /// Throw std::out_of_range if a signed index cannot be addressed.
  ///
  /// @param index the signed (diagonal) index to check.
  void
  check_index_against_bound(long long index) const
  {
    if (over_bounds(index))
      {
	// Report the range that can really be addressed, which
	// depends on offset() and on the size of the storage.
	const long long lowest = -static_cast<long long>(offset());
	const long long highest =
	  static_cast<long long>(size()) + lowest - 1;
	std::ostringstream o;
	o << "index '" << index
	  << "' out of range [" << lowest << ", " << highest << "]";
	throw std::out_of_range(o.str());
      }
  }

public:

  /// Constructor of the d_path_vec.
  ///
  /// With MAX_D being the sum of the two sequence sizes, the
  /// underlying vector holds 3 * (MAX_D + 1) zero-initialized
  /// elements.
  ///
  /// Forward vectors need to address elements in the index range
  /// [-MAX_D, MAX_D].
  ///
  /// Reverse vectors need to address [-MAX_D - delta, MAX_D + delta],
  /// with delta being the (signed) difference between the sizes of
  /// the two sequences.  As delta is bounded by MAX_D, the storage is
  /// sized so that both kinds of vectors fit.
  ///
  /// @param size1 the size of the first sequence we are interested
  /// in.
  ///
  /// @param size2 the size of the second sequence we are interested
  /// in.
  d_path_vec(unsigned size1, unsigned size2)
    : std::vector<int>(2 * (size1 + size2 + 1) + (size1 + size2) + 1, 0),
      a_size_(size1), b_size_(size2)
  {
  }

  /// Bounds-checked read access by signed index; same as at().
  std::vector<int>::const_reference
  operator[](int index) const
  {return at(index);}

  /// Bounds-checked write access by signed index; same as at().
  std::vector<int>::reference
  operator[](int index)
  {return at(index);}

  /// Access the element at a signed index.
  ///
  /// @param index the signed (diagonal) index of the element.
  ///
  /// @return a reference to the element.
  ///
  /// @throw std::out_of_range if @p index cannot be addressed.
  std::vector<int>::reference
  at(long long index)
  {
    check_index_against_bound(index);
    const long long i = index + static_cast<long long>(offset());
    return std::vector<int>::at(static_cast<size_type>(i));
  }

  /// Access the element at a signed index, read-only.
  ///
  /// @param index the signed (diagonal) index of the element.
  ///
  /// @return a const reference to the element.
  ///
  /// @throw std::out_of_range if @p index cannot be addressed.
  std::vector<int>::const_reference
  at(long long index) const
  {
    check_index_against_bound(index);
    const long long i = index + static_cast<long long>(offset());
    return std::vector<int>::at(static_cast<size_type>(i));
  }

  /// @return the size of the first sequence.
  unsigned
  a_size() const
  {return a_size_;}

  /// @return the size of the second sequence.
  unsigned
  b_size() const
  {return b_size_;}

  /// @return MAX_D, the sum of the sizes of the two sequences.
  unsigned
  max_d() const
  {return a_size() + b_size();}

  /// @return the position, in the underlying vector, of the element
  /// of signed index 0.  This is MAX_D plus the absolute difference
  /// between the sizes of the two sequences.
  unsigned
  offset() const
  {
    const unsigned size_diff = (a_size_ > b_size_
				? a_size_ - b_size_
				: b_size_ - a_size_);
    return max_d() + size_diff;
  }
}; // end class d_path_vec
/// The abstraction of an insertion of elements of a sequence B into a
/// sequence A.  This is used to represent the edit script for
/// transforming a sequence A into a sequence B.
///
/// An insertion mainly encapsulates two components:
///
///   - An insertion point: this is the index (starting at 0) of the
///     element of the sequence A after which the insertion occurs.
///
///   - Inserted elements: this is a vector of indexes of elements of
///     sequence B (starting at 0) that got inserted into sequence A,
///     after the insertion point.
class insertion
{
  int			point_index_;
  vector<unsigned>	inserted_indexes_;

public:

  /// Build an insertion from an insertion point and the indexes of
  /// the inserted elements.
  ///
  /// @param insertion_point the index of the element of A after
  /// which the insertion occurs.
  ///
  /// @param inserted_indexes the indexes of the elements of B that
  /// got inserted; the vector is copied.
  insertion(int insertion_point,
	    const vector<unsigned>& inserted_indexes)
    : point_index_(insertion_point),
      inserted_indexes_(inserted_indexes)
  {
  }

  /// Build an insertion that has no inserted element yet.
  ///
  /// @param insertion_point the index of the element of A after
  /// which the insertion occurs.  Defaults to 0.
  insertion(int insertion_point = 0)
    : point_index_(insertion_point)
  {
  }

  /// @return the index of the insertion point.
  int
  insertion_point_index() const
  {
    return point_index_;
  }

  /// Set the index of the insertion point.
  ///
  /// @param i the new index.
  void
  insertion_point_index(int i)
  {
    point_index_ = i;
  }

  /// @return the indexes of the inserted elements, read-only.
  const vector<unsigned>&
  inserted_indexes() const
  {
    return inserted_indexes_;
  }

  /// @return the indexes of the inserted elements, for modification.
  vector<unsigned>&
  inserted_indexes()
  {
    return inserted_indexes_;
  }
};// end class insertion
/// The abstraction of the deletion of one element of a sequence A.
///
/// This encapsulates the index of the element of A that got deleted.
class deletion
{
  int deleted_index_;

public:

  /// Build a deletion.
  ///
  /// @param i the index of the deleted element.
  deletion(int i)
    : deleted_index_(i)
  {
  }

  /// @return the index of the deleted element.
  int
  index() const
  {
    return deleted_index_;
  }

  /// Set the index of the deleted element.
  ///
  /// @param i the new index.
  void
  index(int i)
  {
    deleted_index_ = i;
  }
};// end class deletion
/// The abstraction of an edit script for transforming a sequence A
/// into a sequence B.
///
/// It encapsulates the insertions and deletions for transforming A
/// into B.
class edit_script
{
vector<insertion> insertions_;
vector<deletion> deletions_;
public:
edit_script()
{}
const vector<insertion>&
insertions() const
{return insertions_;}
vector<insertion>&
insertions()
{return insertions_;}
const vector<deletion>&
deletions() const
{return deletions_;}
vector<deletion>&
deletions()
{return deletions_;}
void
append(const edit_script& es)
{
insertions().insert(insertions().end(),
es.insertions().begin(),
es.insertions().end());
deletions().insert(deletions().end(),
es.deletions().begin(),
es.deletions().end());
}
void
prepend(const edit_script& es)
{
insertions().insert(insertions().begin(),
es.insertions().begin(),
es.insertions().end());
deletions().insert(deletions().begin(),
es.deletions().begin(),
es.deletions().end());
}
void
clear()
{
insertions().clear();
deletions().clear();
}
bool
is_empty() const
{return insertions().empty() && deletions().empty();}
operator bool() const
{return !is_empty();}
int
num_insertions() const
{
int l = 0;
for (vector<insertion>::const_iterator i = insertions().begin();
i != insertions().end();
++i)
l += i->inserted_indexes().size();
return l;
}
int
num_deletions() const
{return deletions().size();}
int
length() const
{return num_insertions() + num_deletions();}
};//end class edit_script
// Declaration only; the definition is not visible in this part of
// the file.
// NOTE(review): going by the name and parameters, this presumably
// tells whether point p lies within the edit graph of two sequences
// of sizes a_size and b_size -- confirm against the definition.  Note
// that p is taken by non-const reference.
bool
point_is_valid_in_graph(point& p,
unsigned a_size,
unsigned b_size);
// Declaration only; the definition is not visible in this part of
// the file.  compute_middle_snake() calls it with the end point of a
// forward snake and a point built from the reverse d-paths vector,
// and treats a true result as "the forward and reverse paths
// overlap".
bool
ends_of_furthest_d_paths_overlap(const point& forward_d_path_end,
const point& reverse_d_path_end);
/// The default equality functor used by the core diffing algorithms.
struct default_eq_functor
{
  /// Compare two values with the "==" operator of their type.
  ///
  /// @param a the first value to compare.
  ///
  /// @param b the second value to compare.
  ///
  /// @return true if @p a == @p b holds, false otherwise.
  template<typename T>
  bool
  operator()(const T a, const T b) const
  {
    if (a == b)
      return true;
    return false;
  }
};
/// An equality functor to deeply compare pointers.
struct deep_ptr_eq_functor
{
  /// This equality operator compares pointers by comparing the
  /// pointed-to objects.
  ///
  /// Two null pointers compare equal; a null pointer never compares
  /// equal to a non-null one.
  ///
  /// The operator is const, like the one of default_eq_functor, so
  /// that it can also be invoked on a const functor object.
  ///
  /// @param first the first comparison argument.
  ///
  /// @param second the second comparison argument.
  ///
  /// @return true if the objects pointed to by the pointers are
  /// equal, false otherwise.
  template<typename T>
  bool
  operator()(const T* first,
	     const T* second) const
  {
    // Exactly one of the two pointers is null.
    if (!!first != !!second)
      return false;

    // Both pointers are null.
    if (!first)
      return true;

    return *first == *second;
  }
};
/// Find the end of the furthest reaching d-path on diagonal k, for
/// two sequences. In the paper this is referred to as "the basic
/// algorithm".
///
/// Unlike in the paper, the coordinates of the edit graph start at
/// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather
/// than (M,N).
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function. An instance of it is default
/// constructed in the body of the function.
///
/// @param k the number of the diagonal on which we want to find the
/// end of the furthest reaching D-path.
///
/// @param d the D in D-Path. That's the number of insertions/deletions
/// (the number of changes, in other words) in the changeset. That is
/// also the number of non-diagonals in the D-Path.
///
/// @param a_begin an iterator to the beginning of the first sequence
///
/// @param a_end an iterator that points right after the last element
/// of the first sequence to consider.
///
/// @param b_start an iterator to the beginning of the second sequence.
///
/// @param b_end an iterator that points right after the last element
/// of the second sequence to consider.
///
/// @param v the vector of furthest end points of d_paths, at (d-1).
/// It contains the abscissas of the furthest end points for different
/// values of k, at (d-1). That is, for k in [-D + 1, -D + 3, -D + 5,
/// ..., D - 1], v[k] is the abscissa of the end of the furthest
/// reaching (D-1)-path on diagonal k. On return, v[k] is set to the
/// abscissa of the end point that was found, even when the function
/// returns false.
///
/// @param snak the last snake of the furthest path found. The end
/// point of the snake is the end point of the furthest path. This is
/// set only when the function returns true.
///
/// @return true if the end of the furthest reaching path that was
/// found was inside the boundaries of the edit graph, false
/// otherwise.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
bool
end_of_fr_d_path_in_k(int k, int d,
RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_start,
RandomAccessOutputIterator b_end,
d_path_vec& v, snake& snak)
{
int x = -1, y = -1;
point begin, intermediate, diag_start, end;
snake s;
EqualityFunctor eq;
// Let's pick the end point of the furthest reaching
// (D-1)-path. It's either v[k-1] or v[k+1]; the word
// "furthest" means we choose the one which abscissa is the
// greatest (that is, furthest from abscissa zero).
if (k == -d || ((k != d) && (v[k-1] < v[k + 1])))
// So, the abscissa of the end point of the furthest
// reaching (D-1)-path is v[k+1]. That is a diagonal that
// is above the current (k) diagonal, and on the right.
// To move to the current k diagonal, one has to move
// "down" from the diagonal k+1. So the abscissa won't
// change. Only the ordinate will. It will be given by y
// = x - k (a bit below); as the starting point on diagonal
// k + 1 has ordinate x - (k + 1), the new y is one step
// 'down' from it.
{
x = v[k+1];
begin.set(x, x - (k + 1));
}
else
{
// So the abscissa of the end point of the furthest
// (D-1)-path is v[k-1]. That is on the left of the
// current k diagonal. To move to the current k diagonal,
// one has to move "right" from diagonal k - 1. That is,
// the y stays constant and x is incremented.
x = v[k-1];
begin.set(x, x - (k - 1));
++x;
}
// Now get the value of y from the equation k = x - y.
// This is the point where we first touch K, when we move
// from the end of the furthest reaching (D-1)-path.
y = x - k;
intermediate.x(x);
intermediate.y(y);
// Indexes of the last elements of the first and second sequences.
int last_x_index = a_end - a_begin - 1;
int last_y_index = b_end - b_start - 1;
// Now, follow the snake (aka, zero or more consecutive
// diagonals). Note that we stay on the k diagonal when we
// do this. The first matching pair found gives the diagonal
// start point of the snake.
while ((x < last_x_index) && (y < last_y_index))
if (eq(a_begin[x + 1], b_start[y + 1]))
{
x = x + 1;
y = y + 1;
if (!diag_start)
diag_start.set(x, y);
}
else
break;
end.x(x);
end.y(y);
// Note the point that we store in v here might be outside the
// bounds of the edit graph. But we store it at this step (for a
// given D) anyway, because out of bound or not, we need this value
// at this step to be able to compute the value of the point on the
// "next" diagonal for the next D.
v[k] = x;
// The end point is out of the edit graph: report that, and leave
// the output snake alone.
if (x >= (int) v.a_size()
|| y >= (int) v.b_size()
|| x < -1 || y < -1)
return false;
s.set(begin, intermediate, diag_start, end);
s.set_forward(true);
snak = s;
return true;
}
/// Find the end of the furthest reaching reverse d-path on diagonal k
/// + delta. Delta is M - N (a signed value), with M being the size of
/// a and N being the size of b. This is the "basic algorithm", run
/// backward. That is, starting from the point (M,N) of the edit graph.
///
/// Unlike in the paper, the coordinates of the edit graph start at
/// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather
/// than (M,N).
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function. An instance of it is default
/// constructed in the body of the function.
///
/// @param k the number of the diagonal on which we want to find the
/// end of the furthest reaching reverse D-path. Actually, we want to
/// find the end of the furthest reaching reverse D-path on diagonal (k
/// + delta).
///
/// @param d the D in D-path. That's the number of insertions/deletions
/// (the number of changes, in other words) in the changeset. That is
/// also the number of non-diagonals in the D-Path.
///
/// @param a_begin an iterator to the beginning of the first sequence
///
/// @param a_end an iterator that points right after the last element
/// of the first sequence to consider.
///
/// @param b_begin an iterator to the beginning of the second sequence.
///
/// @param b_end an iterator that points right after the last element
/// of the second sequence to consider.
///
/// @param v the vector of furthest end points of d_paths, at (d-1).
/// It contains the abscissae of the furthest end points for different
/// values of k + delta, at (d-1). That is, for k in [-D + 1, -D + 3,
/// -D + 5, ..., D - 1], v[k + delta] is the abscissa of the end of the
/// furthest reaching (D-1)-path on diagonal k + delta. On return,
/// v[k + delta] is set to the abscissa of the end point that was
/// found, even when the function returns false.
///
/// @param snak the last snake of the furthest path found. The end
/// point of the snake is the end point of the furthest path. This is
/// set only when the function returns true.
///
/// @return true iff the end of the furthest reaching path that was
/// found was inside the boundaries of the edit graph, false
/// otherwise.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
bool
end_of_frr_d_path_in_k_plus_delta (int k, int d,
RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
d_path_vec& v, snake& snak)
{
int a_size = a_end - a_begin;
int b_size = b_end - b_begin;
// delta is signed: it is negative when the second sequence is the
// longest one.
int delta = a_size - b_size;
int k_plus_delta = k + delta;
int x = -1, y = -1;
point begin, intermediate, diag_start, end;
snake s;
EqualityFunctor eq;
// Let's pick the end point of the furthest reaching (D-1)-path and
// move from there to reach the current k_plus_delta-line. That end
// point of the furthest reaching (D-1)-path is either on
// v[k_plus_delta-1] or on v[k_plus_delta+1]; the word "furthest"
// means we choose the one which abscissa is the lowest (that is,
// furthest from abscissa M).
if (k_plus_delta == -d + delta
|| ((k_plus_delta != d + delta)
&& (v[k_plus_delta + 1] < v[k_plus_delta - 1])))
{
// We move left, that means ordinate won't change ...
x = v[k_plus_delta + 1];
y = x - (k_plus_delta + 1);
begin.set(x, y);
// ... and abscissa decreases.
x = x - 1;
}
else
{
// So the furthest end point is on the k_plus_delta - 1
// diagonal. That is a diagonal that is 'below' the
// k_plus_delta current diagonal. So to join the current
// diagonal from the k_plus_delta - 1 one, we need to move up.
// So moving up means abscissa won't change ...
x = v[k_plus_delta - 1];
begin.set(x, x - (k_plus_delta - 1));
// ... and that ordinate decreases.
y = x - (k_plus_delta - 1) - 1;
}
intermediate.set(x, y);
// Now, follow the snake. Note that we stay on the k_plus_delta
// diagonal when we do this. As we walk backward, the first
// matching pair found gives the diagonal start point of the snake.
while (x >= 0 && y >= 0)
if (eq(a_begin[x], b_begin[y]))
{
if (!diag_start)
diag_start.set(x, y);
x = x - 1;
y = y - 1;
}
else
break;
end.set(x, y);
// Note the point that we store in v here might be outside the
// bounds of the edit graph. But we store it at this step (for a
// given D) anyway, because out of bound or not, we need this value
// at this step to be able to compute the value of the point on the
// "next" diagonal for the next D.
v[k_plus_delta] = x;
// (-1,-1) is the top left corner of the edit graph, so it is
// explicitly accepted as an end point. Any point that has a
// coordinate lower than -1 is out of the graph; in that case the
// output snake is left alone.
if (x == -1 && y == -1)
;
else if (x < -1 || y < -1)
return false;
s.set(begin, intermediate, diag_start, end);
s.set_forward(false);
snak = s;
return true;
}
/// Tests if a given point is a match point in an edit graph, that is,
/// if the element of the first sequence at the abscissa of the point
/// equals (as per operator==) the element of the second sequence at
/// the ordinate of the point.
///
/// @param a_begin the begin iterator of the first input sequence of
/// the edit graph.
///
/// @param a_end the end iterator of the first input sequence of the
/// edit graph.  This points to one element past the end of the
/// sequence.
///
/// @param b_begin the begin iterator of the second input sequence of
/// the edit graph.
///
/// @param b_end the end iterator of the second input sequence of the
/// edit graph.  This points to one element past the end of the
/// sequence.
///
/// @param point the point to test for being a match point.
///
/// @return true iff \a point is a match point.  A point which
/// coordinates do not both designate an element of their sequence is
/// never a match point.
template<typename RandomAccessOutputIterator>
bool
is_match_point(RandomAccessOutputIterator a_begin,
	       RandomAccessOutputIterator a_end,
	       RandomAccessOutputIterator b_begin,
	       RandomAccessOutputIterator b_end,
	       const point& point)
{
  const int a_len = a_end - a_begin;
  const int b_len = b_end - b_begin;
  const int x = point.x();
  const int y = point.y();

  const bool x_in_a = (x >= 0) && (x < a_len);
  const bool y_in_b = (y >= 0) && (y < b_len);
  if (!(x_in_a && y_in_b))
    return false;

  return (a_begin[x] == b_begin[y]);
}
/// Returns the middle snake of two sequences A and B, as well as the
/// length of their shortest editing script.
///
/// This uses the "linear space refinement" algorithm presented in
/// section 4b in the paper. As the paper says, "The idea for doing
/// so is to simultaneously run the basic algorithm in both the
/// forward and reverse directions until furthest reaching forward and
/// reverse paths starting at opposing corners overlap."
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_begin an iterator pointing to the beginning of sequence A.
///
/// @param a_end an iterator pointing to the end of sequence A. Note
/// that this points right /after/ the end of vector A.
///
/// @param b_begin an iterator pointing to the beginning of sequence B.
///
/// @param b_end an iterator pointing to the end of sequence B. Note
/// that this points right /after/ the end of vector B
///
/// @param snak out parameter. This is the snake current when the two
/// paths overlapped. This is set iff the function returns true;
/// otherwise, this is not touched.
///
/// @return true if the snake was found, false otherwise.
template<typename RandomAccessOutputIterator,
         typename EqualityFunctor>
bool
compute_middle_snake(RandomAccessOutputIterator a_begin,
                     RandomAccessOutputIterator a_end,
                     RandomAccessOutputIterator b_begin,
                     RandomAccessOutputIterator b_end,
                     snake& snak, int& ses_len)
{
  // Sizes of the two sequences (N and M in the paper's notation).
  const int a_size = a_end - a_begin;
  const int b_size = b_end - b_begin;

  // Reverse diagonals are centered on delta, forward ones on zero.
  const int delta = a_size - b_size;

  // delta % 2 is 1 or -1 when delta is odd, so test against zero.
  const bool delta_is_odd = (delta % 2) != 0;

  d_path_vec forward_d_paths(a_size, b_size);
  d_path_vec reverse_d_paths(a_size, b_size);

  // We want the initial step (D = 0, k = 0 in the paper) to find a
  // furthest reaching point on diagonal k == 0; For that, we need the
  // value of x for k == 1; So let's set that value to -1; that is for
  // k == 1 (diagonal 1), the point in the edit graph is (-1,-2).
  // That way, to get the furthest reaching point on diagonal 0 (k ==
  // 0), we go down from (-1,-2) on diagonal 1 and we hit diagonal 0
  // on (-1,-1); that is the starting value that the algorithm expects
  // for k == 0.
  forward_d_paths[1] = -1;

  // Similarly for the reverse paths, for diagonal delta + 1 (note
  // that diagonals are centered on delta, unlike for forward paths
  // where they are centered on zero), we set the initial point to
  // (a_size, b_size - 1).  That way, at step D == 0 and k == delta,
  // to reach diagonal delta from the point (a_size, b_size - 1) on
  // diagonal delta + 1, we just have to move left, and we hit
  // diagonal delta on (a_size - 1, b_size -1); that is the starting
  // point value the algorithm expects for k == 0 in the reverse case.
  reverse_d_paths[delta + 1] = a_size;

  const int d_max = (b_size + a_size) / 2 + 1;
  for (int d = 0; d <= d_max; ++d)
    {
      // First build forward paths.
      for (int k = -d; k <= d; k += 2)
        {
          snake s;
          bool found =
            end_of_fr_d_path_in_k<RandomAccessOutputIterator,
                                  EqualityFunctor>(k, d,
                                                   a_begin, a_end,
                                                   b_begin, b_end,
                                                   forward_d_paths, s);
          if (!found)
            continue;

          // As the paper says in 4b while explaining the middle snake
          // algorithm:
          //
          // "Thus when delta is odd, check for overlap only while
          //  extending forward paths ..."
          if (delta_is_odd
              && (k >= (delta - (d - 1))) && (k <= (delta + (d - 1))))
            {
              // Rebuild the end point of the reverse path that lies on
              // the same diagonal k, from its stored x coordinate.
              point reverse_end;
              reverse_end.x(reverse_d_paths[k]);
              reverse_end.y(reverse_end.x() - k);
              if (ends_of_furthest_d_paths_overlap(s.end(), reverse_end))
                {
                  ses_len = 2 * d - 1;
                  snak = s;
                  return true;
                }
            }
        }

      // Now build reverse paths.
      for (int k = -d; k <= d; k += 2)
        {
          snake s;
          bool found =
            end_of_frr_d_path_in_k_plus_delta<RandomAccessOutputIterator,
                                              EqualityFunctor>(k, d,
                                                               a_begin, a_end,
                                                               b_begin, b_end,
                                                               reverse_d_paths,
                                                               s);
          if (!found)
            continue;

          // And the paper continues by saying:
          //
          // "... and when delta is even, check for overlap only while
          //  extending reverse paths."
          int k_plus_delta = k + delta;
          if (!delta_is_odd
              && (k_plus_delta >= -d) && (k_plus_delta <= d))
            {
              // Rebuild the end point of the forward path that lies on
              // diagonal k + delta, from its stored x coordinate.
              point forward_end;
              forward_end.x(forward_d_paths[k_plus_delta]);
              forward_end.y(forward_end.x() - k_plus_delta);
              if (ends_of_furthest_d_paths_overlap(forward_end, s.end()))
                {
                  ses_len = 2 * d;
                  snak = s;
                  return true;
                }
            }
        }
    }

  // No overlap was detected for any d up to d_max; the out parameters
  // are left untouched.
  return false;
}
/// Overload of compute_middle_snake that takes two C strings rather
/// than pairs of iterators. Only the declaration appears here.
///
/// NOTE(review): presumably str1 and str2 are NUL-terminated and the
/// out parameters and return value have the same meaning as for the
/// iterator-based template above -- confirm against the definition.
///
/// @param str1 the first string.
///
/// @param str2 the second string.
///
/// @param s the snake; passed by non-const reference.
///
/// @param ses_len passed by non-const reference; see the note above.
bool
compute_middle_snake(const char* str1, const char* str2,
snake& s, int& ses_len);
/// Print a textual description of a snake to an output stream.
///
/// Nothing is printed when the snake is empty. Otherwise the start,
/// intermediate and end points of the snake are emitted. When the
/// snake has a diagonal edge, the points walked diagonally from the
/// intermediate point up to the end point are emitted as well, and
/// each of them is asserted to designate equal elements (compared
/// with operator==) in the two sequences.
///
/// @tparam RandomAccessOutputIterator the type of the iterators over
/// the two sequences. It must support indexing with operator[].
///
/// @param a_begin the beginning of the first sequence.
///
/// @param b_begin the beginning of the second sequence.
///
/// @param s the snake to print.
///
/// @param out the output stream to print the snake to.
template<typename RandomAccessOutputIterator>
void
print_snake(RandomAccessOutputIterator a_begin,
            RandomAccessOutputIterator b_begin,
            const snake s, ostream& out)
{
  if (s.is_empty())
    return;

  // Both coordinates of the start point come from s.begin(); pairing
  // begin().x() with end().y() would print a point that is not the
  // start of the snake.
  out << "snake start: ";
  out << "(" << s.begin().x() << ", " << s.begin().y() << ")\n";

  out << "snake intermediate: ";
  out << "(" << s.intermediate().x() << ", " << s.intermediate().y() << ")\n";

  out << "diagonal point(s): ";
  if (s.has_diagonal_edge())
    for (int x = s.intermediate().x(), y = s.intermediate().y();
         x <= s.end().x() && y <= s.end().y();
         ++x, ++y)
      {
        // A diagonal edge only links matching elements.
        assert(a_begin[x] == b_begin[y]);
        out << "(" << x << "," << y << ") ";
      }
  out << "\n";

  out << "snake end: ";
  out << "(" << s.end().x() << ", " << s.end().y() << ")\n";
}
/// Compute the length of the shortest edit script for two sequences a
/// and b. This is done using the "Greedy LCS/SES" of figure 2 in the
/// paper. It can walk the edit graph either forward (when reverse is
/// false) or backward starting from the end (when reverse is true).
///
/// Here, note that the real content of a and b should start at index
/// 1, for this implementation algorithm to match the paper's
/// algorithm in a straightforward manner. So please make sure that
/// at index 0, we just get some non-used value.
///
/// NOTE(review): the termination tests below compare the end of the
/// snake against (a_size - 1, b_size - 1) and (-1, -1), which looks
/// like 0-based indexing; confirm the paragraph above is still
/// accurate.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_begin an iterator to the beginning of the first sequence.
///
/// @param a_end an iterator to the end of the first sequence.
///
/// @param b_begin an iterator to the beginning of the second
/// sequence.
///
/// @param b_end an iterator to the end of the second sequence.
///
/// @param v the vector that contains the end points of the furthest
/// reaching d-path and (d-1)-path. Its max_d() must be equal to the
/// sum of the sizes of the two sequences; this is asserted.
///
/// @param reverse if true, walk the edit graph backward; otherwise
/// walk it forward.
///
/// @return the value of d at which the opposite corner of the edit
/// graph was reached, or 0 if it was never reached for any d up to
/// v.max_d().
template<typename RandomAccessOutputIterator,
         typename EqualityFunctor>
int
ses_len(RandomAccessOutputIterator a_begin,
        RandomAccessOutputIterator a_end,
        RandomAccessOutputIterator b_begin,
        RandomAccessOutputIterator b_end,
        d_path_vec& v, bool reverse)
{
  unsigned a_size = a_end - a_begin;
  unsigned b_size = b_end - b_begin;
  snake snak;

  assert(v.max_d() == a_size + b_size);

  int delta = a_size - b_size;

  if (reverse)
    // Seed diagonal delta + 1 with x == a_size - 1, to find the
    // furthest reaching reverse 0-path (i.e, when we are at d == 0
    // and k == 0).
    //
    // NOTE(review): compute_middle_snake seeds the same diagonal of
    // its reverse vector with a_size rather than a_size - 1; confirm
    // which value is intended here.
    v[delta + 1] = a_size - 1;
  else
    // Set a fictitious (-1,-2) point into v[1], to find the furthest
    // reaching forward 0-path (i.e, when we are at d == 0 and k == 0).
    v[1] = -1;

  for (unsigned d = 0; d <= v.max_d(); ++d)
    {
      // Diagonals range over [-d, d]; do the negation on a signed
      // value rather than relying on unsigned wrap-around followed by
      // a narrowing conversion.
      const int signed_d = static_cast<int>(d);

      for (int k = -signed_d; k <= signed_d; k += 2)
        {
          if (reverse)
            {
              bool found =
                end_of_frr_d_path_in_k_plus_delta<RandomAccessOutputIterator,
                                                  EqualityFunctor>(k, d,
                                                                   a_begin, a_end,
                                                                   b_begin, b_end,
                                                                   v, snak);
              // If we reached the upper left corner of the edit graph then
              // we are done.
              if (found && snak.end().x() == -1 && snak.end().y() == -1)
                return signed_d;
            }
          else
            {
              // The result of the call is not checked here; only the
              // end of the snake is examined.
              end_of_fr_d_path_in_k<RandomAccessOutputIterator,
                                    EqualityFunctor>(k, d,
                                                     a_begin, a_end,
                                                     b_begin, b_end,
                                                     v, snak);
              // If we reached the lower right corner of the edit
              // graph then we are done.
              if ((snak.end().x() == (int) a_size - 1)
                  && (snak.end().y() == (int) b_size - 1))
                return signed_d;
            }
        }
    }
  return 0;
}
/// Compute the length of the shortest edit script for two sequences a
/// and b. This is done using the "Greedy LCS/SES" of figure 2 in the
/// paper. It can walk the edit graph either forward (when reverse is
/// false) or backward starting from the end (when reverse is true).
///
/// This overload only forwards its arguments to the ses_len overload
/// that takes an EqualityFunctor template parameter, using
/// default_eq_functor for it.
///
/// Here, note that the real content of a and b should start at index
/// 1, for this implementation algorithm to match the paper's
/// algorithm in a straightforward manner. So please make sure that
/// at index 0, we just get some non-used value.
///
/// Note that the equality operator used to compare the elements
/// passed in argument to this function is the default "==" operator.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @param a_begin an iterator to the beginning of the first sequence.
///
/// @param a_end an iterator to the end of the first sequence.
///
/// @param b_begin an iterator to the beginning of the second
/// sequence.
///
/// @param b_end an iterator to the end of the second sequence.
///
/// @param v the vector that contains the end points of the furthest
/// reaching d-path and (d-1)-path.
///
/// @param reverse if true, walk the edit graph backward; otherwise
/// walk it forward.
///
/// @return the value returned by the overload this forwards to.
template<typename RandomAccessOutputIterator>
int
ses_len(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
d_path_vec& v, bool reverse)
{
return ses_len<RandomAccessOutputIterator, default_eq_functor>(a_begin, a_end,
b_begin, b_end,
v, reverse);
}
/// Overload of ses_len that takes two C strings. Only the
/// declaration appears here.
///
/// NOTE(review): presumably this computes the length of the shortest
/// edit script between the two NUL-terminated strings -- confirm
/// against the definition.
///
/// @param str1 the first string.
///
/// @param str2 the second string.
///
/// @param reverse defaults to false; presumably selects a backward
/// walk of the edit graph, as for the iterator-based overloads.
int
ses_len(const char* str1,
const char* str2,
bool reverse = false);
/// Retrieve two points from a snake. Only the declaration appears
/// here.
///
/// In this header, compute_diff uses the first output point to bound
/// the sub-regions that precede the middle snake (they end right
/// after that point), and the second output point to bound the
/// sub-regions that follow it (they begin right after that point).
///
/// NOTE(review): the meaning of the boolean return value is not
/// visible here; compute_diff ignores it -- confirm against the
/// definition.
///
/// @param s the snake to consider.
bool
snake_end_points(const snake& s, point&, point&);
/// Compute the longest common subsequence of two (sub-regions of)
/// sequences as well as the shortest edit script for transforming
/// the first (sub-region of) sequence into the second (sub-region of)
/// sequence.
///
/// A sequence is determined by a base, a beginning offset and an end
/// offset. The base always points to the container that contains the
/// sequence to consider. The beginning offset is an iterator that
/// points the beginning of the sub-region of the sequence that we
/// actually want to consider. The end offset is an iterator that
/// points to the end of the sub-region of the sequence that we
/// actually want to consider.
///
/// This uses the LCS algorithm of the paper at section 4b: the middle
/// snake of the two sub-regions is computed, and when the edit
/// distance it reports is greater than one, the sub-regions located
/// before and after the snake are processed recursively.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_base the iterator to the base of the first sequence.
///
/// @param a_begin an iterator to the beginning of the sub-region
/// of the first sequence to actually consider.
///
/// @param a_end an iterator to the end of the sub-region of the first
/// sequence to consider.
///
/// @param b_base an iterator to the base of the second sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the sub-region
/// of the second sequence to actually consider.
///
/// @param b_end an iterator to the end of the sub-region of the
/// second sequence to actually consider.
///
/// @param lcs the resulting lcs. Points are appended to it; their
/// coordinates are expressed relative to a_base and b_base.
///
/// @param ses the resulting shortest editing script. Deletions and
/// insertions are appended to it; the indexes they carry are relative
/// to a_base and b_base.
///
/// @param ses_len the length of the ses above. Normally this can be
/// retrieved from ses.length(), but this parameter is here for sanity
/// check purposes. The function computes the length of the ses in two
/// redundant ways and ensures that both methods lead to the same
/// result.
template<typename RandomAccessOutputIterator,
         typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_base,
             RandomAccessOutputIterator a_begin,
             RandomAccessOutputIterator a_end,
             RandomAccessOutputIterator b_base,
             RandomAccessOutputIterator b_begin,
             RandomAccessOutputIterator b_end,
             vector<point>& lcs,
             edit_script& ses,
             int& ses_len)
{
  int a_size = a_end - a_begin;
  int b_size = b_end - b_begin;
  unsigned a_offset = a_begin - a_base, b_offset = b_begin - b_base;

  if (a_size == 0 || b_size == 0)
    {
      if (a_size > 0 && b_size == 0)
        // All elements of the first sequences have been deleted.  So add
        // the relevant deletions to the edit script.
        for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i)
          ses.deletions().push_back(deletion(i - a_base));

      if (b_size > 0 && a_size == 0)
        {
          // All elements present in the second sequence are part of
          // an insertion into the first sequence at a_end.  So add
          // that insertion to the edit script.
          int a_full_size = a_end - a_base;
          int insertion_index = a_full_size ? a_full_size - 1 : -1;
          insertion ins(insertion_index);
          for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i)
            ins.inserted_indexes().push_back(i - b_base);

          ses.insertions().push_back(ins);
        }

      // Report the length in every sub-case, including the one where
      // only deletions were recorded and the one where both
      // sub-regions are empty.
      ses_len = a_size + b_size;
      return;
    }

  int d = 0;
  snake snak;
  vector<point> trace; // the trace of the edit graph.  Read the paper
                       // to understand what a trace is.
  bool has_snake =
    compute_middle_snake<RandomAccessOutputIterator,
                         EqualityFunctor>(a_begin, a_end,
                                          b_begin, b_end,
                                          snak, d);

  if (!has_snake)
    {
      // So there is no middle snake.  That means there is no lcs, so
      // the two sequences are different.

      // In other words, all the elements of the first sequence have
      // been deleted ...
      for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i)
        ses.deletions().push_back(deletion(i - a_base));

      // ... and all the elements of the second sequence are insertions
      // that happen at the beginning of the first sequence.
      insertion ins(a_begin - a_base);
      for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i)
        ins.inserted_indexes().push_back(i - b_base);
      ses.insertions().push_back(ins);

      ses_len = a_size + b_size;
      assert(ses_len == ses.length());
      return;
    }

  // So middle_{begin,end} are expressed wrt a_begin and b_begin.
  // Let's express them wrt a_base and b_base.
  snak.add(a_offset, b_offset);
  ses_len = d;

  // Record the diagonal points of the middle snake.
  if (snak.has_diagonal_edge())
    for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y();
         x <= snak.end().x() && y <= snak.end().y();
         ++x, ++y)
      {
        point p(x, y);
        trace.push_back(p);
      }

  if (d > 1)
    {
      int tmp_ses_len = 0;
      point px, pu;
      snake_end_points(snak, px, pu);

      // First handle what comes before the middle snake ...
      compute_diff<RandomAccessOutputIterator,
                   EqualityFunctor>(a_base, a_begin, a_base + px.x() + 1,
                                    b_base, b_begin, b_base + px.y() + 1,
                                    lcs, ses, tmp_ses_len);

      // ... then the snake itself ...
      lcs.insert(lcs.end(), trace.begin(), trace.end());

      // ... and finally what comes after it.  The edits of that part
      // are gathered in a separate script that is then appended.
      tmp_ses_len = 0;
      edit_script tmp_ses;
      compute_diff<RandomAccessOutputIterator,
                   EqualityFunctor>(a_base, a_base + pu.x() + 1, a_end,
                                    b_base, b_base + pu.y() + 1, b_end,
                                    lcs, tmp_ses, tmp_ses_len);
      ses.append(tmp_ses);
    }
  else if (d == 1)
    {
      // NOTE(review): trace already holds these diagonal points from
      // above, and it is not appended to lcs in this branch; confirm
      // whether a lcs.insert() is missing here.
      if (snak.has_diagonal_edge())
        {
          for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y();
               x <= snak.end().x() && y <= snak.end().y();
               ++x, ++y)
            {
              point p(x, y);
              trace.push_back(p);
            }
        }

      // The single edit is the non-diagonal edge of the snake: a
      // vertical edge is an insertion, a horizontal one a deletion.
      if (snak.has_vertical_edge())
        {
          point p = snak.intermediate();
          insertion ins(p.x());
          ins.inserted_indexes().push_back(p.y());
          ses.insertions().push_back(ins);
        }
      else if (snak.has_horizontal_edge())
        {
          if (snak.is_forward())
            {
              deletion del(snak.intermediate().x());
              ses.deletions().push_back(del);
            }
          else
            {
              deletion del(snak.begin().x());
              ses.deletions().push_back(del);
            }
        }
    }
  else if (d == 0)
    {
      // Obviously on the middle snake is part of the solution, as
      // there is no edit script; iow, the two sequences are
      // identical.
      lcs.insert(lcs.end(), trace.begin(), trace.end());
      ses_len = 0;
    }

  assert(ses_len == ses.length());
}
/// Compute the longest common subsequence of two sequences as well as
/// the shortest edit script for transforming the first sequence into
/// the second sequence.
///
/// This overload only forwards to the compute_diff overload that
/// takes base iterators, passing a_begin and b_begin as the bases of
/// the two sequences.
///
/// This uses the LCS algorithm of the paper at section 4b.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_begin an iterator to the beginning of the first sequence
/// to consider.
///
/// @param a_end an iterator to the end of the first sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the second sequence
/// to consider.
///
/// @param b_end an iterator to the end of the second sequence to
/// consider.
///
/// @param lcs the resulting lcs.
///
/// @param ses the resulting shortest editing script.
///
/// @param ses_len the length of the ses above. Normally this can be
/// retrieved from ses.length(), but this parameter is here for sanity
/// check purposes. The function computes the length of the ses in two
/// redundant ways and ensures that both methods lead to the same
/// result.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
vector<point>& lcs,
edit_script& ses,
int& ses_len)
{
// The beginning of each sequence doubles as its base.
compute_diff<RandomAccessOutputIterator,
EqualityFunctor>(a_begin, a_begin, a_end,
b_begin, b_begin, b_end,
lcs, ses, ses_len);
}
/// Compute the longest common subsequence of two (sub-regions of)
/// sequences, along with the shortest edit script that turns the
/// first (sub-region of) sequence into the second one.
///
/// Each sequence is described by three iterators: a base that points
/// to the container holding the sequence, and a beginning and an end
/// that delimit the sub-region of the sequence to actually consider.
///
/// This is the LCS algorithm of the paper at section 4b. This
/// overload is for callers that do not need the length of the edit
/// script: it forwards to the overload that takes a ses_len out
/// parameter and discards the value written there.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor a class that declares a public call
/// operator returning a boolean and taking two arguments of the type
/// pointed to by RandomAccessOutputIterator. It is used to compare
/// the elements of the two sequences.
///
/// @param a_base the iterator to the base of the first sequence.
///
/// @param a_begin an iterator to the beginning of the sub-region of
/// the first sequence to consider.
///
/// @param a_end an iterator to the end of the sub-region of the first
/// sequence to consider.
///
/// @param b_base the iterator to the base of the second sequence.
///
/// @param b_begin an iterator to the beginning of the sub-region of
/// the second sequence to consider.
///
/// @param b_end an iterator to the end of the sub-region of the
/// second sequence to consider.
///
/// @param lcs the resulting lcs.
///
/// @param ses the resulting shortest editing script.
template<typename RandomAccessOutputIterator,
         typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_base,
             RandomAccessOutputIterator a_begin,
             RandomAccessOutputIterator a_end,
             RandomAccessOutputIterator b_base,
             RandomAccessOutputIterator b_begin,
             RandomAccessOutputIterator b_end,
             vector<point>& lcs,
             edit_script& ses)
{
  // Receives the edit script length, which this overload does not
  // report to its caller.
  int discarded_ses_len = 0;

  compute_diff<RandomAccessOutputIterator,
               EqualityFunctor>(a_base, a_begin, a_end,
                                b_base, b_begin, b_end,
                                lcs, ses, discarded_ses_len);
}
/// Compute the longest common subsequence of two sequences as well as
/// the shortest edit script for transforming the first sequence into
/// the second sequence.
///
/// This overload only forwards to the compute_diff overload that
/// takes base iterators, passing a_begin and b_begin as the bases of
/// the two sequences.
///
/// This uses the LCS algorithm of the paper at section 4b.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_begin an iterator to the beginning of the first sequence
/// to consider.
///
/// @param a_end an iterator to the end of the first sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the second sequence
/// to consider.
///
/// @param b_end an iterator to the end of the second sequence to
/// consider.
///
/// @param lcs the resulting lcs.
///
/// @param ses the resulting shortest editing script.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
vector<point>& lcs,
edit_script& ses)
{
// The beginning of each sequence doubles as its base.
compute_diff<RandomAccessOutputIterator,
EqualityFunctor>(a_begin, a_begin, a_end,
b_begin, b_begin, b_end,
lcs, ses);
}
/// Compute the longest common subsequence of two sequences as well as
/// the shortest edit script for transforming the first sequence into
/// the second sequence.
///
/// This overload only forwards to the compute_diff overload that
/// takes an EqualityFunctor template parameter, using
/// default_eq_functor for it.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @param a_begin an iterator to the beginning of the first sequence
/// to consider.
///
/// @param a_end an iterator to the end of the first sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the second sequence
/// to consider.
///
/// @param b_end an iterator to the end of the second sequence to
/// consider.
///
/// @param lcs the resulting lcs.
///
/// @param ses the resulting shortest editing script.
template<typename RandomAccessOutputIterator>
void
compute_diff(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
vector<point>& lcs,
edit_script& ses)
{
compute_diff<RandomAccessOutputIterator,
default_eq_functor>(a_begin, a_end, b_begin, b_end, lcs, ses);
}
/// Compute the longest common subsequence of two (sub-regions of)
/// sequences as well as the shortest edit script from transforming
/// the first (sub-region of) sequence into the second (sub-region of)
/// sequence.
///
/// A sequence is determined by a base, a beginning offset and an end
/// offset. The base always points to the container that contains the
/// sequence to consider. The beginning offset is an iterator that
/// points the beginning of the sub-region of the sequence that we
/// actually want to consider. The end offset is an iterator that
/// points to the end of the sub-region of the sequence that we
/// actually want to consider.
///
/// This uses the LCS algorithm of the paper at section 4b.
///
/// @tparm RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparm EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators pased in
/// argument to this function.
///
/// @param a_base the iterator to the base of the first sequence.
///
/// @param a_start an iterator to the beginning of the sub-region
/// of the first sequence to actually consider.
///
/// @param a_end an iterator to the end of the sub-region of the first
/// sequence to consider.
///
/// @param b_base an iterator to the base of the second sequence to
/// consider.
///
/// @param b_start an iterator to the beginning of the sub-region
/// of the second sequence to actually consider.
///
/// @param b_end an iterator to the end of the sub-region of the
/// second sequence to actually consider.
///
/// @param ses the resulting shortest editing script.
///
/// @return true upon successful completion, false otherwise.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_base,
RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_base,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
edit_script& ses)
{
vector<point> lcs;
compute_diff<RandomAccessOutputIterator,
EqualityFunctor>(a_base, a_begin, a_end,
b_base, b_begin, b_end,
lcs, ses);
}
/// Compute the shortest edit script for transforming the first
/// sequence into the second sequence.
///
/// This overload only forwards to the compute_diff overload that
/// takes base iterators and an edit script, passing a_begin and
/// b_begin as the bases of the two sequences.
///
/// This uses the LCS algorithm of the paper at section 4b.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter. This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_begin an iterator to the beginning of the first sequence
/// to consider.
///
/// @param a_end an iterator to the end of the first sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the second sequence
/// to consider.
///
/// @param b_end an iterator to the end of the second sequence to
/// consider.
///
/// @param ses the resulting shortest editing script.
template<typename RandomAccessOutputIterator,
typename EqualityFunctor>
void
compute_diff(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
edit_script& ses)
{
// The beginning of each sequence doubles as its base.
compute_diff<RandomAccessOutputIterator,
EqualityFunctor>(a_begin, a_begin, a_end,
b_begin, b_begin, b_end,
ses);
}
/// Compute the shortest edit script for transforming the first
/// sequence into the second sequence.
///
/// This overload only forwards to the compute_diff overload that
/// takes an EqualityFunctor template parameter, using
/// default_eq_functor for it.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function. It must be a random access output iterator kind.
///
/// @param a_begin an iterator to the beginning of the first sequence
/// to consider.
///
/// @param a_end an iterator to the end of the first sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the second sequence
/// to consider.
///
/// @param b_end an iterator to the end of the second sequence to
/// consider.
///
/// @param ses the resulting shortest editing script.
template<typename RandomAccessOutputIterator>
void
compute_diff(RandomAccessOutputIterator a_begin,
RandomAccessOutputIterator a_end,
RandomAccessOutputIterator b_begin,
RandomAccessOutputIterator b_end,
edit_script& ses)
{
compute_diff<RandomAccessOutputIterator, default_eq_functor>(a_begin, a_end,
b_begin, b_end,
ses);
}
/// Declaration of compute_lcs for two C strings. Only the
/// declaration appears here.
///
/// NOTE(review): presumably this computes the longest common
/// subsequence of the two NUL-terminated strings into lcs and the
/// length of the matching shortest edit script into ses_len --
/// confirm against the definition.
///
/// @param str1 the first string.
///
/// @param str2 the second string.
///
/// @param ses_len passed by non-const reference; see the note above.
///
/// @param lcs passed by non-const reference; see the note above.
void
compute_lcs(const char* str1, const char* str2, int &ses_len, string& lcs);
/// Declaration of compute_ses for two C strings. Only the
/// declaration appears here.
///
/// NOTE(review): presumably this computes the shortest edit script
/// that transforms the first NUL-terminated string into the second
/// one -- confirm against the definition.
///
/// @param str1 the first string.
///
/// @param str2 the second string.
///
/// @param ses passed by non-const reference; see the note above.
void
compute_ses(const char* str1, const char* str2, edit_script& ses);
/// Display an edit script on standard output.
///
/// @param es the edit script to display
///
/// @param str1_base the first string the edit script is about.
///
/// @pram str2_base the second string the edit script is about.
template<typename RandomAccessOutputIterator>
void
display_edit_script(const edit_script& es,
const RandomAccessOutputIterator str1_base,
const RandomAccessOutputIterator str2_base,
ostream& out)
{
if (es.num_deletions() == 0)
out << "no deletion:\n";
else if (es.num_deletions() == 1)
{
out << "1 deletion:\n"
<< "\t happened at index: ";;
}
else
{
out << es.num_deletions() << " deletions:\n"
<< "\t happened at indexes: ";
}
for (vector<deletion>::const_iterator i = es.deletions().begin();
i != es.deletions().end();
++i)
{
if (i != es.deletions().begin())
out << ", ";
out << i->index() << " (" << str1_base[i->index()] << ")";
}
out << "\n\n";
if (es.num_insertions() == 0)
out << "no insertion\n";
else if (es.num_insertions() == 1)
out << "1 insertion\n";
else
out << es.num_insertions() << " insertions:\n";
for (vector<insertion>::const_iterator i = es.insertions().begin();
i != es.insertions().end();
++i)
{
int idx = i->insertion_point_index();
if (idx < 0)
out << "\t before index of first sequence: " << idx + 1
<< " (" << str1_base[idx + 1] << ")\n";
else
out << "\t after index of first sequence: " << idx
<< " (" << str1_base[idx] << ")\n";
if (!i->inserted_indexes().empty())
out << "\t\t inserted indexes from second sequence: ";
for (vector<unsigned>::const_iterator j = i->inserted_indexes().begin();
j != i->inserted_indexes().end();
++j)
{
if (j != i->inserted_indexes().begin())
out << ", ";
out << *j << " (" << str2_base[*j] << ")";
}
out << "\n";
}
out << "\n\n";
}
}//end namespace diff_utils
}//end namespace abigail