// -*- Mode: C++ -*- // // Copyright (C) 2013 Red Hat, Inc. // // This file is part of the GNU Application Binary Interface Generic // Analysis and Instrumentation Library (libabigail). This library is // free software; you can redistribute it and/or modify it under the // terms of the GNU Lesser General Public License as published by the // Free Software Foundation; either version 3, or (at your option) any // later version. // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Lesser Public License for more details. // You should have received a copy of the GNU Lesser General Public // License along with this program; see the file COPYING-LGPLV3. If // not, see . /// @file /// /// This file declares types and operations implementing the "O(ND) /// Difference Algorithm" (aka diff2) from Eugene W. Myers, to compute /// the difference between two sequences. /// /// To understand what is going on here, one must read the paper at /// http://www.xmailserver.org/diff2.pdf. Throughout this file, that /// paper is referred to as "the paper". /// /// The implementations goes as far as calculating the shortest edit /// script (the set of insertions and deletions) for transforming a /// sequence into another. The main entry point for that is the /// compute_diff() function. #include #include #include #include #include #include #include namespace abigail { namespace diff_utils { // Inject the names from std:: below into this namespace using std::string; using std::ostream; using std::vector; using std::abs; using std::ostringstream; /// A class representing a vertex in an edit graph, as explained in /// the paper. A vertex is a basically a pair of coordinates /// (abscissa and ordinate). 
class point
{
  int x_;
  int y_;
  bool empty_;

public:
  /// Build an empty point.  Its coordinates are (-1,-1) and
  /// is_empty() returns true.
  point()
    : x_(-1), y_(-1), empty_(true)
  {}

  /// Build a non-empty point from its coordinates.
  ///
  /// @param x the abscissa of the point.
  ///
  /// @param y the ordinate of the point.
  point(int x, int y)
    : x_(x), y_(y), empty_(false)
  {}

  /// Copy constructor.  Copies the coordinates and the emptiness flag.
  ///
  /// @param p the point to copy.
  point(const point& p)
    : x_(p.x()), y_(p.y()), empty_(p.is_empty())
  {}

  /// @return the abscissa of the point.
  int
  x() const
  {return x_;}

  /// Set the abscissa.  As a side effect the point becomes non-empty.
  ///
  /// @param x the new abscissa.
  void
  x(int x)
  {
    x_ = x;
    empty_ = false;
  }

  /// @return the ordinate of the point.
  int
  y() const
  {return y_;}

  /// Set the ordinate.  As a side effect the point becomes non-empty.
  ///
  /// @param y the new ordinate.
  void
  y(int y)
  {
    y_ = y;
    empty_ = false;
  }

  /// Set both coordinates.  The point becomes non-empty.
  void
  set(int x, int y)
  {
    x_ = x;
    y_ = y;
    empty_ = false;
  }

  /// Set both coordinates and the emptiness flag explicitly.
  void
  set(int x, int y, bool empty)
  {
    x_ = x;
    y_ = y;
    empty_ = empty;
  }

  /// Translate the point.  The point becomes non-empty.
  ///
  /// @param ax the value to add to the abscissa.
  ///
  /// @param ay the value to add to the ordinate.
  void
  add(int ax, int ay)
  {set (x() + ax, y() + ay);}

  /// Two points differ if any of abscissa, ordinate or emptiness
  /// differ.
  bool
  operator!=(const point& o) const
  {return (x() != o.x() || y() != o.y() || is_empty() != o.is_empty());}

  bool
  operator==(const point& o) const
  {return !(operator!=(o));}

  // The four ordering operators below require the relation to hold
  // on *both* coordinates.  This is only a partial order: two
  // distinct points can be neither < nor > one another.

  bool
  operator<(const point& o) const
  {return (x() < o.x() && y() < o.y());}

  bool
  operator>(const point& o) const
  {return (x() > o.x() && y() > o.y());}

  bool
  operator<=(const point& o) const
  {return (x() <= o.x() && y() <= o.y());}

  bool
  operator>=(const point& o) const
  {return (x() >= o.x() && y() >= o.y());}

  /// @return a non-empty point moved by @p val along the diagonal.
  point
  operator+(int val) const
  {return point(x() + val, y() + val);}

  /// @return a non-empty point moved by -@p val along the diagonal.
  point
  operator-(int val) const
  {return point(x() - val, y() - val);}

  /// Move this point by @p val along the diagonal; it becomes
  /// non-empty.
  point&
  operator+= (int val)
  {
    set(x_ + val, y_ + val);
    return *this;
  }

  point&
  operator-= (int val)
  {return (*this) += (-val);}

  /// Prefix decrement: move one step back along the diagonal.
  point&
  operator--()
  {return (*this) -= 1;}

  /// Prefix increment: move one step forward along the diagonal.
  point&
  operator++()
  {return (*this) += 1;}

  /// Postfix decrement.
  ///
  /// @return the value of the point before it was decremented.
  point
  operator--(int)
  {
    point tmp(*this);
    // Use the prefix form here; calling the postfix form again would
    // recurse forever.
    --(*this);
    return tmp;
  }

  /// Postfix increment.
  ///
  /// @return the value of the point before it was incremented.
  point
  operator++(int)
  {
    point tmp(*this);
    // Use the prefix form here; calling the postfix form again would
    // recurse forever.
    ++(*this);
    return tmp;
  }

  /// Set both coordinates to @p val; the point becomes non-empty.
  point&
  operator=(int val)
  {
    set(val, val);
    return *this;
  }

  /// Copy assignment.  Copies the coordinates and the emptiness flag.
  point&
  operator=(const point& p)
  {
    set(p.x(), p.y(), p.is_empty());
    return *this;
  }

  /// @return true iff the point has not been given coordinates (or
  /// has been cleared).
  bool
  is_empty() const
  {return empty_;}

  /// @return true iff the point is not empty.
  operator bool () const
  {return !is_empty();}

  /// @return true iff the point is empty.
  bool
  operator!() const
  {return is_empty();}

  /// Reset the point to the empty state, with coordinates (-1,-1).
  void
  clear()
  {
    x_ = -1;
    y_ = -1;
    empty_ = true;
  }
};// end point

/// The abstraction of the Snake concept, from the paper.
/// /// In a given path from the edit graph, a snake is a non-diagonal /// edge followed by zero or more diagonal edges. /// /// The starting poing of the non-diagonal edge is the beginning of /// the snake. This is given by the snake::begin() method. This point /// is not explicitely referenced in the paper, but we need it for /// some grunt implementation details of the algorithm. /// /// The end point of the non-diagonal edge is the intermediate point /// of the snake; it's given by the snake::intermediate() method. /// This point is what is referred to as "the begining of the snake" /// in the paper. /// /// The end point of the first diagonal edge is given by the /// snake::diagonal_start() method. /// /// The end point of the last diagonal edge is given by the /// snake::end() method. Note that when the snake contains no /// diagonal edge, snake::intermediate(), and snake::end() return the /// same point; snake::diagonal_start() contains an empty point (i.e, /// a point for which point::is_empty() returns true). class snake { point begin_, intermediate_, diagonal_start_, end_; bool forward_; public: /// Default constructor for snake. snake() : forward_(false) {} /// Constructor from the beginning, intermediate and end points. /// /// @param b the beginning point of the snake. That is, the /// starting point of the non-diagonal edge. /// /// @param i the intermediate point of the snake. That is, the end /// point of the non-diagonal edge. /// /// @param e the end point of the snake. That is the end point of /// the last diagonal edge. snake(const point& b, const point& i, const point& e) : begin_(b), intermediate_(i), end_(e), forward_(false) {} /// Constructor from the beginning, intermediate and end points. /// /// @param b the beginning point of the snake. That is, the /// starting point of the non-diagonal edge. /// /// @param i the intermediate point of the snake. That is, the end /// point of the non-diagonal edge. 
/// /// @param d the beginning of the diagonal edge. That is the end of /// the first diagonal edge of the snake. /// /// @param e the end point of the snake. That is the end point of /// the last diagonal edge. snake(const point& b, const point& i, const point& d, const point& e) : begin_(b), intermediate_(i), diagonal_start_(d), end_(e), forward_(false) {} /// Getter for the starting point of the non-diagonal edge of the /// snake. /// /// @return the starting point of the non-diagonal edge of the snake const point& begin() const {return begin_;} /// Getter for the starting point of the non-diagonal edge of the /// snake, aka begin point. /// ///@param p the new begin point. void begin(const point& p) {begin_ = p;} /// Getter for the end point of the non-diagonal edge of the snake. /// /// @return the end point of the non-diagonal edge of the snake const point& intermediate() const {return intermediate_;} /// Setter for the end point of the non-diagonal edge of the snake, /// aka intermediate point. /// /// @param p the new intermediate point. void intermediate(const point& p) {intermediate_ = p;} /// Getter for the end point of the first diagonal edge, aka /// diagonal start point. Note that if the snake has no diagonal /// edge, this point is empty. /// /// @return the end point of the first diagonal edge. const point& diagonal_start() const {return diagonal_start_;} /// Setter for the end point of the first diagonal edge, aka /// diagonal start point. /// /// @param p the new diagonal start.d void diagonal_start(const point& p) {diagonal_start_ = p;} /// Getter for the end point of the last diagonal edge, aka snake /// end point. Note that if the snake has no diagonal edge, this /// point is equal to the intermediate point. /// /// @return the end point of the last diagonal edge const point& end() const {return end_;} /// Setter for the end point of the last diagonal edge, aka snake /// end point. 
Note that if the snake has no diagonal edge, this /// point is equal to the intermediate point. void end(const point& p) {end_ = p;} /// Setter for the begin, intermediate and end points of the snake. /// /// @param b the new snake begin point /// /// @param i the new snake intermediate point /// /// @param e the new snake end point void set(const point& b, const point&i, const point&e) { begin(b); intermediate(i); end(e); } /// Setter for the begin, intermediate, diagonal start and end points /// of the snake. /// /// @param b the new snake begin point /// /// @param i the new snake intermediate point /// /// @param d the new diagonal start point /// /// @param e the new snake end point void set(const point& b, const point&i, const point& d, const point&e) { begin(b); intermediate(i); diagonal_start(d); end(e); } /// @return true iff the snake is a forward snake. That is, if it /// was built while walking the edit graph going forward (from the /// top left corner to the right bottom corner. bool is_forward() const {return forward_;} /// Set to true if the snake is a forward snake; that is, if it was /// built while walking the edit graph going forward (from the top /// left corner to the right bottom corner. Set to false otherwise. /// /// @param f whether the snake is a forward snake or not. void set_forward(bool f) {forward_ = f;} /// Add an offset to the abscissas of the points of the snake, and /// add another offset to the ordinates of these same points. /// /// @param x_offset the offset to add to the abscissas of all the /// points of the snake. /// /// @param y_offset the offset to add to the ordinates of all the /// points of the snake. void add(int x_offset, int y_offset) { if (is_empty()) return; begin_.add(x_offset, y_offset); intermediate_.add(x_offset, y_offset); if (diagonal_start_) diagonal_start_.add(x_offset, y_offset); end_.add(x_offset, y_offset); } /// @return true iff the snake has at least one diagonal edge. 
bool has_diagonal_edge() const {return !diagonal_start().is_empty();} /// @return true iff the non-diagonal edge is horizontal. bool has_horizontal_edge() const {return (begin().y() == intermediate().y());} /// @return true iff the non-diagonal edge is vertical. bool has_vertical_edge() const {return (begin().x() == intermediate().x());} /// @return true iff the snake is empty, that is, if all the points /// it contains are empty. bool is_empty() const {return begin().is_empty() && intermediate().is_empty() && end().is_empty();} };// end class snake /// The array containing the furthest D-path end-points, for each value /// of K. MAX_D is the maximum value of the D-Path. That is, M+N if /// M is the size of the first input string, and N is the size of the /// second. class d_path_vec : public std::vector { private: unsigned a_size_; unsigned b_size_; /// Forbid vector size modifications void push_back(const typename vector::value_type&); /// Forbid default constructor. d_path_vec(); bool over_bounds(long long index) const {return (index + offset()) >= (long long) size();} void check_index_against_bound(int index) const { if (over_bounds(index)) { ostringstream o; o << "index '" << index << "' out of range [-" << max_d() << ", " << max_d() << "]"; throw std::out_of_range(o.str()); } } public: /// Constructor of the d_path_vec. /// /// For forward vectors, the underlying vector allocates 2 * /// [MAX_D+1]. /// space, so that one can address elements in the index range /// [-MAX_D, MAX_D]. And MAX_D is the sum of the two sequence /// sizes. delta is the difference. /// /// For reverse vectors, note that we need to be able to address /// [-MAX_D - delta, MAX_D + delta], with delta being the (signed) /// difference between the size of the two sequences. We consider /// delta being bounded by MAX_D itself; so we say we need to be /// able to address [-2MAX_D, 2MAX_D]. /// /// @param size1 the size of the first sequence we are interested /// in. 
/// /// @param size2 the size of the second sequence we are interested /// in. d_path_vec(unsigned size1, unsigned size2) : vector(2 * (size1 + size2 + 1 + (size1 + size2)) + 1, 0), a_size_(size1), b_size_(size2) { } typename std::vector::const_reference operator[](int index) const {return at(index);} typename std::vector::reference operator[](int index) {return at(index);} typename std::vector::reference at(long long index) { //check_index_against_bound(index); long long i = index + offset(); return vector::operator[](i); } typename std::vector::const_reference at(long long index) const { check_index_against_bound(index); long long i = offset() + index; return static_cast* >(this)->at(i); } unsigned a_size() const {return a_size_;} unsigned b_size() const {return b_size_;} unsigned max_d() const {return a_size_ + b_size_;} unsigned offset() const {return max_d() + abs((long long) a_size() - (long long) b_size());} }; // end class d_path_vec /// The abstration of an insertion of elements of a sequence B into a /// sequence A. This is used to represent the edit script for /// transforming a sequence A into a sequence B. /// /// And insertion mainly encapsulates two components: /// /// - An insertion point: this is the index (starting at 0) of the /// element of the sequence A after which the insertion occurs. /// /// - Inserted elements: this is a vector of indexes of elements of /// sequence B (starting at 0) that got inserted into sequence A, /// after the insertion point. 
class insertion { int insertion_point_; vector inserted_; public: insertion(int insertion_point, const vector& inserted_indexes) : insertion_point_(insertion_point), inserted_(inserted_indexes) {} insertion(int insertion_point = 0) : insertion_point_(insertion_point) {} int insertion_point_index() const {return insertion_point_;} void insertion_point_index(int i) {insertion_point_ = i;} const vector& inserted_indexes() const {return inserted_;} vector& inserted_indexes() {return inserted_;} };// end class insertion /// The abstraction of the deletion of one element of a sequence A. /// /// This encapsulates the index of the element A that got deleted. class deletion { int index_; public: deletion(int i) : index_(i) {} int index() const {return index_;} void index(int i) {index_ = i;} };// end class deletion /// The abstraction of an edit script for transforming a sequence A /// into a sequence B. /// /// It encapsulates the insertions and deletions for transforming A /// into B. class edit_script { vector insertions_; vector deletions_; public: edit_script() {} const vector& insertions() const {return insertions_;} vector& insertions() {return insertions_;} const vector& deletions() const {return deletions_;} vector& deletions() {return deletions_;} void append(const edit_script& es) { insertions().insert(insertions().end(), es.insertions().begin(), es.insertions().end()); deletions().insert(deletions().end(), es.deletions().begin(), es.deletions().end()); } void prepend(const edit_script& es) { insertions().insert(insertions().begin(), es.insertions().begin(), es.insertions().end()); deletions().insert(deletions().begin(), es.deletions().begin(), es.deletions().end()); } void clear() { insertions().clear(); deletions().clear(); } bool is_empty() const {return insertions().empty() && deletions().empty();} operator bool() const {return !is_empty();} int num_insertions() const { int l = 0; for (vector::const_iterator i = insertions().begin(); i != insertions().end(); 
++i) l += i->inserted_indexes().size(); return l; } int num_deletions() const {return deletions().size();} int length() const {return num_insertions() + num_deletions();} };//end class edit_script bool point_is_valid_in_graph(point& p, unsigned a_size, unsigned b_size); bool ends_of_furthest_d_paths_overlap(const point& forward_d_path_end, const point& reverse_d_path_end); /// The default equality functor used by the core diffing algorithms. struct default_eq_functor { /// This equality operator uses the default "==" to compare its /// arguments. /// /// @param a the first comparison argument. /// /// @param b the second comparison argument. /// /// @return true if the two arguments are equal, false otherwise. template bool operator()(const T a, const T b) const {return a == b;} }; /// An equality functor to deeply compare pointers. struct deep_ptr_eq_functor { /// This equality operator compares pointers by comparing the /// pointed-to objects. /// /// @param first the first comparison argument. /// /// @param second the second comparison argument. /// /// @return true if the objects pointed to by the pointers are /// equal, false otherwise. template bool operator()(const T* first, const T* second) { if (!!first != !!second) return false; if (!first) return true; return *first == *second; } }; /// Find the end of the furthest reaching d-path on diagonal k, for /// two sequences. In the paper This is referred to as "the basic /// algorithm". /// /// Unlike in the paper, the coordinates of the edit graph start at /// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather /// than (M,N). /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. 
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter.  This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param k the number of the diagonal on which we want to find the
/// end of the furthest reaching D-path.
///
/// @param d the D in D-Path.  That's the number of insertions/deletions
/// (the number of changes, in other words) in the changeset.  That is
/// also the number of non-diagonals in the D-Path.
///
/// @param a_begin an iterator to the beginning of the first sequence
///
/// @param a_end an iterator that points right after the last element
/// of the first sequence to consider.
///
/// @param b_begin an iterator to the beginning of the second sequence.
///
/// @param b_end an iterator that points right after the last element
/// of the second sequence to consider.
///
/// @param v the vector of furthest end points of d_paths, at (d-1).
/// It contains the abscissas of the furthest end points for different
/// values of k, at (d-1).  That is, for k in [-D + 1, -D + 3, -D + 5,
/// ..., D - 1], v[k] is the abscissa of the end of the furthest
/// reaching (D-1)-path on diagonal k.
///
/// @param snak the last snake of the furthest path found.  The end
/// point of the snake is the end point of the furthest path.
///
/// @return true if the end of the furthest reaching path that was
/// found was inside the boundaries of the edit graph, false
/// otherwise.
template bool end_of_fr_d_path_in_k(int k, int d, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_start, RandomAccessOutputIterator b_end, d_path_vec& v, snake& snak) { int x = -1, y = -1; point begin, intermediate, diag_start, end; snake s; EqualityFunctor eq; // Let's pick the end point of the furthest reaching // (D-1)-path. It's either v[k-1] or v[k+1]; the word // "furthest" means we choose the one which abscissa is the // greatest (that is, furthest from abscissa zero). if (k == -d || ((k != d) && (v[k-1] < v[k + 1]))) // So, the abscissa of the end point of the furthest // reaching (D-1)-path is v[k+1]. That is a diagonal that // is above the current (k) diagonal, and on the right. // To move to the current k diagonal, one has to move // "down" from the diagonal k+1. So the abscissa won't // change. Only the ordinate will. It will be given by y // = x - k (a bit below); as k has changed from k - 1 (it // has increased), y is going to be the new y that is // 'down' from the previous y in k - 1. { x = v[k+1]; begin.set(x, x - (k + 1)); } else { // So the abscissa of the end point of the furthest // (D-1)-path is v[k-1]. That is on the left of the // current k diagonal. To move to the current k diagonal, // one has to move "right" from diagonal k - 1. That is, // the y stays constant and x is incremented. x = v[k-1]; begin.set(x, x - (k - 1)); ++x; } // Now get the value of y from the equation k = x -y. // This is the point where we first touch K, when we move // from the end of the furthest reaching (D-1)-path. y = x - k; intermediate.x(x); intermediate.y(y); int last_x_index = a_end - a_begin - 1; int last_y_index = b_end - b_start - 1; // Now, follow the snake (aka, zero or more consecutive // diagonals). Note that we stay on the k diagonal when we // do this. 
while ((x < last_x_index) && (y < last_y_index)) if (eq(a_begin[x + 1], b_start[y + 1])) { x = x + 1; y = y + 1; if (!diag_start) diag_start.set(x, y); } else break; end.x(x); end.y(y); // Note the point that we store in v here might be outside the // bounds of the edit graph. But we store it at this step (for a // given D) anyway, because out of bound or not, we need this value // at this step to be able to compute the value of the point on the // "next" diagonal for the next D. v[k] = x; if (x >= (int) v.a_size() || y >= (int) v.b_size() || x < -1 || y < -1) return false; s.set(begin, intermediate, diag_start, end); s.set_forward(true); snak = s; return true; } /// Find the end of the furthest reaching reverse d-path on diagonal k /// + delta. Delta is abs(M - N), with M being the size of a and N /// being the size of b. This is the "basic algorithm", run backward. /// That is, starting from the point (M,N) of the edit graph. /// /// Unlike in the paper, the coordinates of the edit graph start at /// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather /// than (M,N). /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param k the number of the diagonal on which we want to find the /// end of the furthest reaching reverse D-path. Actually, we want to /// find the end of the furthest reaching reverse D-path on diagonal (k /// - delta). /// /// @param d the D in D-path. That's the number of insertions/deletions /// (the number of changes, in other words) in the changeset. 
That is /// also the number of non-diagonals in the D-Path. /// /// @param a_begin an iterator to the beginning of the first sequence /// /// @param a_end an iterator that points right after the last element /// of the second sequence to consider. /// /// @param b_begin an iterator to the beginning of the second sequence. /// /// @param b_end an iterator that points right after the last element /// of the second sequence to consider. /// /// @param v the vector of furthest end points of d_paths, at (d-1). /// It contains the abscissae of the furthest end points for different /// values of k - delta, at (d-1). That is, for k in [-D + 1, -D + 3, /// -D + 5, ..., D - 1], v[k - delta] is the abscissa of the end of the /// furthest reaching (D-1)-path on diagonal k - delta. /// /// @param snak the last snake of the furthest path found. The end /// point of the snake is the end point of the furthest path. /// /// @return true iff the end of the furthest reaching path that was /// found was inside the boundaries of the edit graph, false /// otherwise. template bool end_of_frr_d_path_in_k_plus_delta (int k, int d, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, d_path_vec& v, snake& snak) { int a_size = a_end - a_begin; int b_size = b_end - b_begin; int delta = a_size - b_size; int k_plus_delta = k + delta; int x = -1, y = -1; point begin, intermediate, diag_start, end; snake s; EqualityFunctor eq; // Let's pick the end point of the furthest reaching (D-1)-path and // move from there to reach the current k_plus_delta-line. That end // point of the furthest reaching (D-1)-path is either on // v[k_plus_delta-1] or on v[k_plus_delta+1]; the word "furthest" // means we choose the one which abscissa is the lowest (that is, // furthest from abscissa M). 
if (k_plus_delta == -d + delta || ((k_plus_delta != d + delta) && (v[k_plus_delta + 1] <= v[k_plus_delta - 1]))) { // We move left, that means ordinate won't change ... x = v[k_plus_delta + 1]; y = x - (k_plus_delta + 1); begin.set(x, y); // ... and abscissa decreases. x = x - 1; } else { // So the furthest end point is on the k_plus_delta - 1 // diagonal. That is a diagonal that is 'below' the // k_plus_delta current diagonal. So to join the current // diagonal from the k_plus_delta - 1 one, we need to move up. // So moving up means abscissa won't change ... x = v[k_plus_delta - 1]; begin.set(x, x - (k_plus_delta - 1)); // ... and that ordinate decreases. y = x - (k_plus_delta - 1) - 1; } intermediate.set(x, y); // Now, follow the snake. Note that we stay on the k_plus_delta // diagonal when we do this. while (x >= 0 && y >= 0) if (eq(a_begin[x], b_begin[y])) { if (!diag_start) diag_start.set(x, y); x = x - 1; y = y - 1; } else break; end.set(x, y); // Note the point that we store in v here might be outside the // bounds of the edit graph. But we store it at this step (for a // given D) anyway, because out of bound or not, we need this value // at this step to be able to compute the value of the point on the // "next" diagonal for the next D. v[k_plus_delta] = x; if (x == -1 && y == -1) ; else if (x < -1 || y < -1) return false; s.set(begin, intermediate, diag_start, end); s.set_forward(false); snak = s; return true; } /// Tests if a given point is a match point in an edit graph. /// /// @param a_begin the begin iterator of the first input sequence of /// the edit graph. /// /// @param a_end the end iterator of the first input sequence of the /// edit graph. This points to one element passed the end of the /// sequence. /// /// @param b_begin the begin iterator of the second input sequence of /// the edit graph. /// /// @param b_end the end iterator of the second input sequence of the /// edit graph. This points the one element passed the end of the /// sequence. 
/// /// @param point the point to test for being a match point. /// /// @return true iff \a point is a match point. template bool is_match_point(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, const point& point) { int a_size = a_end - a_begin, b_size = b_end - b_begin; if (point.x() < 0 || point.x () >= a_size || point.y() < 0 || point.y() >= b_size) return false; return (a_begin[point.x()] == b_begin[point.y()]); } /// Returns the middle snake of two sequences A and B, as well as the /// length of their shortest editing script. /// /// This uses the "linear space refinement" algorithm presented in /// section 4b in the paper. As the paper says, "The idea for doing /// so is to simultaneously run the basic algorithm in both the /// forward and reverse directions until furthest reaching forward and /// reverse paths starting at opposing corners ‘‘overlap’’." /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a_begin an iterator pointing to the begining of sequence A. /// /// @param a_end an iterator pointing to the end of sequence A. Note /// that this points right /after/ the end of vector A. /// /// @param b_begin an iterator pointing to the begining of sequence B. /// /// @param b_end an iterator pointing to the end of sequence B. Note /// that this points right /after/ the end of vector B /// /// @param snak out parameter. This is the snake current when the two /// paths overlapped. 
This is set iff the function returns true; /// otherwise, this is not touched. /// /// @return true is the snake was found, false otherwise. template bool compute_middle_snake(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, snake& snak, int& ses_len) { int a_size = a_end - a_begin; int N = a_size; int b_size = b_end - b_begin; int M = b_size; int delta = N - M; d_path_vec forward_d_paths(a_size, b_size); d_path_vec reverse_d_paths(a_size, b_size); // These points below are the top leftmost point and bottom // right-most points of the edit graph. point first_point(-1, -1), last_point(a_size -1, b_size -1), point_zero(0, 0); // We want the initial step (D = 0, k = 0 in the paper) to find a // furthest reaching point on diagonal k == 0; For that, we need the // value of x for k == 1; So let's set that value to -1; that is for // k == 1 (diagonal 1), the point in the edit graph is (-1,-2). // That way, to get the furthest reaching point on diagonal 0 (k == // 0), we go down from (-1,-2) on diagonal 1 and we hit diagonal 0 // on (-1,-1); that is the starting value that the algorithm expects // for k == 0. forward_d_paths[1] = -1; // Similarly for the reverse paths, for diagonal delta + 1 (note // that diagonals are centered on delta, unlike for forward paths // where they are centered on zero), we set the initial point to // (a_size, b_size - 1). That way, at step D == 0 and k == delta, // to reach diagonal delta from the point (a_size, b_size - 1) on // diagonal delta + 1, we just have to move left, and we hit // diagonal delta on (a_size - 1, b_size -1); that is the starting // point value the algorithm expects for k == 0 in the reverse case. reverse_d_paths[delta + 1] = a_size; int d_max = (M + N) / 2 + 1; for (int d = 0; d <= d_max; ++d) { // First build forward paths. 
for (int k = -d; k <= d; k += 2) { snake s; bool found = end_of_fr_d_path_in_k(k, d, a_begin, a_end, b_begin, b_end, forward_d_paths, s); if (!found) continue; // As the paper says in 4b while explaining the middle snake // algorithm: // // "Thus when delta is odd, check for overlap only while // extending forward paths ..." if ((delta % 2) && (k >= (delta - (d - 1))) && (k <= (delta + (d - 1)))) { point reverse_end; reverse_end.x(reverse_d_paths[k]); reverse_end.y(reverse_end.x() - k); if (ends_of_furthest_d_paths_overlap(s.end(), reverse_end)) { ses_len = 2 * d - 1; snak = s; return true; } } } // Now build reverse paths. for (int k = -d; k <= d; k += 2) { snake s; bool found = end_of_frr_d_path_in_k_plus_delta(k, d, a_begin, a_end, b_begin, b_end, reverse_d_paths, s); if (!found) continue; // And the paper continues by saying: // // "... and when delta is even, check for overlap only while // extending reverse paths." int k_plus_delta = k + delta; if (!(delta % 2) && (k_plus_delta >= -d) && (k_plus_delta <= d)) { point forward_end; forward_end.x(forward_d_paths[k_plus_delta]); forward_end.y(forward_end.x() - k_plus_delta); if (ends_of_furthest_d_paths_overlap(forward_end, s.end())) { ses_len = 2 * d; snak = s; return true; } } } } return false; } bool compute_middle_snake(const char* str1, const char* str2, snake& s, int& ses_len); /// This prints the middle snake of two strings. /// /// @param a_begin the beginning of the first string. /// /// @param b_begin the beginning of the second string. /// /// @param s the snake to print. /// /// @param out the output stream to print the snake to. 
template void print_snake(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator b_begin, const snake s, ostream& out) { if (s.is_empty()) return; out << "snake start: "; out << "(" << s.begin().x() << ", " << s.end().y() << ")\n"; out << "snake intermediate: "; out << "(" << s.intermediate().x() << ", " << s.intermediate().y() << ")\n"; out << "diagonal point(s): "; if (s.has_diagonal_edge()) for (int x = s.intermediate().x(), y = s.intermediate().y(); x <= s.end().x() && y <= s.end().y(); ++x, ++y) { assert(a_begin[x] == b_begin[y]); out << "(" << x << "," << y << ") "; } out << "\n"; out << "snake end: "; out << "(" << s.end().x() << ", " << s.end().y() << ")\n"; } /// Compute the length of the shortest edit script for two sequences a /// and b. This is done using the "Greedy LCS/SES" of figure 2 in the /// paper. It can walk the edit graph either foward (when reverse is /// false) or backward starting from the end (when reverse is true). /// /// Here, note that the real content of a and b should start at index /// 1, for this implementatikon algorithm to match the paper's /// algorithm in a straightforward manner. So pleast make sure that /// at index 0, we just get some non-used value. /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a the first sequence we care about. /// /// @param b the second sequence we care about. /// /// @param v the vector that contains the end points of the furthest /// reaching d-path and (d-1)-path. 
///
/// Review notes on the definition that follows:
///
/// The two sequences are given as the iterator ranges
/// [a_begin, a_end) and [b_begin, b_end).  Before searching, the
/// function asserts that v.max_d() equals the sum of the two sizes,
/// then seeds v[1] with -1 (forward walk) or v[delta + 1] with
/// a_size - 1 (reverse walk), delta being a_size - b_size.
///
/// It then tries d = 0 .. v.max_d() and, for each d, the diagonals
/// k = -d .. d in steps of 2.  The value of d is returned as soon as
/// the end of the snake is (a_size - 1, b_size - 1) for a forward
/// walk, or (-1, -1) for a reverse walk.  If no d qualifies the
/// function returns 0, which a caller cannot tell apart from "the
/// sequences are identical".
///
/// NOTE(review): only the reverse branch looks at the boolean
/// returned by the helper; the forward branch tests `snak' without
/// checking whether the helper found a path -- confirm that the
/// helper leaves `snak' in a safe state on failure.
///
/// NOTE(review): the local `point end' is never used.
///
/// NOTE(review): the "<...>" spans of this text (template parameter
/// list, explicit template arguments) appear to have been lost;
/// restore them from the upstream header before compiling.
template int ses_len(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, d_path_vec& v, bool reverse) { unsigned a_size = a_end - a_begin; unsigned b_size = b_end - b_begin; snake snak; assert(v.max_d() == a_size + b_size); int delta = a_size - b_size; if (reverse) // Set a fictitious (M, N-1) into v[1], to find the furthest // reaching reverse 0-path (i.e, when we are at d == 0 and k == 0). v[delta + 1] = a_size - 1; else // Set a fictitious (-1,-2) point into v[1], to find the furthest // reaching forward 0-path (i.e, when we are at d == 0 and k == 0). v[1] = -1; for (unsigned d = 0; d <= v.max_d(); ++d) { for (int k = -d; k <= (int) d; k += 2) { point end; if (reverse) { bool found = end_of_frr_d_path_in_k_plus_delta(k, d, a_begin, a_end, b_begin, b_end, v, snak); // If we reached the upper left corner of the edit graph then // we are done. if (found && snak.end().x() == -1 && snak.end().y() == -1) return d; } else { end_of_fr_d_path_in_k(k, d, a_begin, a_end, b_begin, b_end, v, snak); // If we reached the lower right corner of the edit // graph then we are done. if ((snak.end().x() == (int) a_size - 1) && (snak.end().y() == (int) b_size - 1)) return d; } } } return 0; } /// Compute the length of the shortest edit script for two sequences a /// and b. This is done using the "Greedy LCS/SES" of figure 2 in the /// paper. It can walk the edit graph either foward (when reverse is /// false) or backward starting from the end (when reverse is true). /// /// Here, note that the real content of a and b should start at index /// 1, for this implementatikon algorithm to match the paper's /// algorithm in a straightforward manner. So pleast make sure that /// at index 0, we just get some non-used value. /// /// Note that the equality operator used to compare the elements /// passed in argument to this function is the default "==" operator.
///
/// Review notes on the definition and declarations that follow:
///
/// This overload takes no comparison functor; its body only forwards
/// its six arguments, unchanged, to a function named ses_len and
/// returns that function's result.
///
/// NOTE(review): as the text stands, the forwarded argument list is
/// identical to this overload's own parameter list.  Presumably the
/// explicit template arguments that selected the functor-taking
/// overload were lost along with the other "<...>" spans of this
/// text -- confirm against the upstream header.
///
/// The ses_len() overload taking two C strings (with @p reverse
/// defaulting to false) and snake_end_points() are only declared
/// after the definition; their definitions are not in this file
/// section.
/// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @param a the first sequence we care about. /// /// @param b the second sequence we care about. /// /// @param v the vector that contains the end points of the furthest /// reaching d-path and (d-1)-path. template int ses_len(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, d_path_vec& v, bool reverse) { return ses_len(a_begin, a_end, b_begin, b_end, v, reverse); } int ses_len(const char* str1, const char* str2, bool reverse = false); bool snake_end_points(const snake& s, point&, point&); /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// A sequence is determined by a base, a beginning offset and an end /// offset. The base always points to the container that contains the /// sequence to consider. The beginning offset is an iterator that /// points the beginning of the sub-region of the sequence that we /// actually want to consider. The end offset is an iterator that /// points to the end of the sub-region of the sequence that we /// actually want to consider. /// /// This uses the LCS algorithm of the paper at section 4b. /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function.
///
/// Review outline of the definition that follows:
///
///   - When either sub-region is empty, the other one is recorded
///     wholesale (as deletions, or as a single insertion), ses_len is
///     set to a_size + b_size and the function returns.
///
///   - Otherwise compute_middle_snake() is asked for a middle snake
///     and for the edit distance d.  The snake is translated by the
///     offsets of a_begin and b_begin from their bases, and the
///     points of its diagonal are collected in a local `trace'.
///
///   - Without a middle snake, every element of the first sub-region
///     is recorded as a deletion and the whole second sub-region as
///     one insertion.
///
///   - With d > 1 the function recurses on the parts located before
///     and after the snake, appends `trace' to lcs between the two
///     recursive calls, and concatenates the two partial edit
///     scripts.
///
///   - With d == 1 a single insertion or deletion is derived from
///     the snake's non-diagonal edge.
///
///   - With d == 0 `trace' is appended to lcs and ses_len is 0.
///
/// All indexes stored in lcs and ses are relative to a_base and
/// b_base.
///
/// NOTE(review): the function is declared void, so the "@return"
/// line and the "set iff the function returns true" remark below do
/// not match the code.
///
/// NOTE(review): the "<...>" spans of this text (template parameter
/// list, explicit template arguments, element type of the vectors)
/// appear to have been lost; restore them from the upstream header
/// before compiling.
/// /// @param a_base the iterator to the base of the first sequence. /// /// @param a_start an iterator to the beginning of the sub-region /// of the first sequence to actually consider. /// /// @param a_end an iterator to the end of the sub-region of the first /// sequence to consider. /// ///@param b_base an iterator to the base of the second sequence to ///consider. /// /// @param b_start an iterator to the beginning of the sub-region /// of the second sequence to actually consider. /// /// @param b_end an iterator to the end of the sub-region of the /// second sequence to actually consider. /// /// @param lcs the resulting lcs. This is set iff the function /// returns true. /// /// @param ses the resulting shortest editing script. /// /// @param ses_len the length of the ses above. Normally this can be /// retrived from ses.length(), but this parameter is here for sanity /// check purposes. The function computes the length of the ses in two /// redundant redundant ways and ensures that both methods lead to the /// same result. /// /// @return true upon successful completion, false otherwise. template void compute_diff(RandomAccessOutputIterator a_base, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_base, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector& lcs, edit_script& ses, int& ses_len) { int a_size = a_end - a_begin; int b_size = b_end - b_begin; unsigned a_offset = a_begin - a_base, b_offset = b_begin - b_base; if (a_size == 0 || b_size == 0) { if (a_size > 0 && b_size == 0) // All elements of the first sequences have been deleted. So add // the relevant deletions to the edit script. for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i) ses.deletions().push_back(deletion(i - a_base)); if (b_size > 0 && a_size == 0) { // All elements present in the second sequence are part of // an insertion into the first sequence at a_end. So add // that insertion to the edit script.
// The insertion point computed next is the index, relative to
// a_base, of the last element located before a_end, or -1 when
// a_end equals a_base.
int a_full_size = a_end - a_base; int insertion_index = a_full_size ? a_full_size - 1 : -1; insertion ins(insertion_index); for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i) ins.inserted_indexes().push_back(i - b_base); ses.insertions().push_back(ins); } ses_len = a_size + b_size; return; } int d = 0; snake snak; vector trace; // the trace of the edit graph. Read the paper // to understand what a trace is. bool has_snake = compute_middle_snake(a_begin, a_end, b_begin, b_end, snak, d); if (has_snake) { // So middle_{begin,end} are expressed wrt a_begin and b_begin. // Let's express them wrt a_base and b_base. snak.add(a_offset, b_offset); ses_len = d; } if (has_snake) { if ( snak.has_diagonal_edge()) for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y(); x <= snak.end().x() && y <= snak.end().y(); ++x, ++y) { point p(x, y); trace.push_back(p); } } else { // So there is no middle snake. That means there is no lcs, so // the two sequences are different. // In other words, all the elements of the first sequence have // been deleted ... for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i) ses.deletions().push_back(deletion(i - a_base)); // ... and all the elements of the second sequence are insertions // that happen at the beginning of the first sequence.
// NOTE(review): the insertion point used next is the offset of
// a_begin from a_base, whereas the empty-sequence path above uses
// the index of the last element before a_end (or -1); confirm
// which convention the consumers of the edit script expect.
//
// NOTE(review): further down, the d == 1 branch pushes the snake's
// diagonal points into `trace' a second time (they were already
// collected right after the middle snake was found) and never
// appends `trace' to lcs on that path; confirm whether lcs is
// meant to receive them.
insertion ins(a_begin - a_base); for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i) ins.inserted_indexes().push_back(i - b_base); ses.insertions().push_back(ins); ses_len = a_size + b_size; assert(ses_len == ses.length()); return; } if (d > 1) { int tmp_ses_len0 = 0; edit_script tmp_ses0; point px, pu; snake_end_points(snak, px, pu); compute_diff(a_base, a_begin, a_base + px.x() + 1, b_base, b_begin, b_base + px.y() + 1, lcs, tmp_ses0, tmp_ses_len0); lcs.insert(lcs.end(), trace.begin(), trace.end()); int tmp_ses_len1 = 0; edit_script tmp_ses1; compute_diff(a_base, a_base + pu.x() + 1, a_end, b_base, b_base + pu.y() + 1, b_end, lcs, tmp_ses1, tmp_ses_len1); assert(tmp_ses0.length() + tmp_ses1.length() == d); assert(tmp_ses_len0 + tmp_ses_len1 == d); ses.append(tmp_ses0); ses.append(tmp_ses1); } else if (d == 1) { if (snak.has_diagonal_edge()) { for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y(); x <= snak.end().x() && y <= snak.end().y(); ++x, ++y) { point p(x, y); trace.push_back(p); } } if (snak.has_vertical_edge()) { point p = snak.intermediate(); insertion ins(p.x()); ins.inserted_indexes().push_back(p.y()); ses.insertions().push_back(ins); } else if (snak.has_horizontal_edge()) { if (snak.is_forward()) { deletion del(snak.intermediate().x()); ses.deletions().push_back(del); } else { deletion del(snak.begin().x()); ses.deletions().push_back(del); } } } else if (d == 0) { // Obviously on the middle snake is part of the solution, as // there is no edit script; iow, the two sequences are // identical. lcs.insert(lcs.end(), trace.begin(), trace.end()); ses_len = 0; } assert(ses_len == ses.length()); } /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// This uses the LCS algorithm of the paper at section 4b.
///
/// Review notes on the definition that follows:
///
/// This overload takes no separate base iterators.  Its body passes
/// a_begin and b_begin both as the base and as the beginning of each
/// sequence when forwarding to the overload that takes bases, so
/// the indexes recorded in lcs and ses are relative to a_begin and
/// b_begin.
///
/// NOTE(review): the function is declared void, so the "@return"
/// line and the "set iff the function returns true" remark below do
/// not match the code.
///
/// NOTE(review): the "<...>" spans of this text (template parameter
/// list, explicit template arguments, element type of the lcs
/// vector) appear to have been lost; restore them from the upstream
/// header before compiling.
/// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a_start an iterator to the beginning of the first sequence /// to consider. /// /// @param a_end an iterator to the end of the first sequence to /// consider. /// /// @param b_start an iterator to the beginning of the second sequence /// to consider. /// /// @param b_end an iterator to the end of the second sequence to /// consider. /// /// @param lcs the resulting lcs. This is set iff the function /// returns true. /// /// @param ses the resulting shortest editing script. /// /// @param ses_len the length of the ses above. Normally this can be /// retrived from ses.length(), but this parameter is here for sanity /// check purposes. The function computes the length of the ses in two /// redundant redundant ways and ensures that both methods lead to the /// same result. /// /// @return true upon successful completion, false otherwise. template void compute_diff(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector& lcs, edit_script& ses, int& ses_len) { compute_diff(a_begin, a_begin, a_end, b_begin, b_begin, b_end, lcs, ses, ses_len); } /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// A sequence is determined by a base, a beginning offset and an end /// offset.
/// The base always points to the container that contains the
/// sequence to consider.  The beginning offset is an iterator that
/// points to the beginning of the sub-region of the sequence that we
/// actually want to consider.  The end offset is an iterator that
/// points to the end of the sub-region of the sequence that we
/// actually want to consider.
///
/// This uses the LCS algorithm of the paper at section 4b.
///
/// @tparam RandomAccessOutputIterator the type of iterators passed to
/// this function.  It must be a random access output iterator kind.
///
/// @tparam EqualityFunctor this must be a class that declares a public
/// call operator member returning a boolean and taking two arguments
/// that must be of the same type as the one pointed to by the @ref
/// RandomAccessOutputIterator template parameter.  This functor is
/// used to compare the elements referred to by the iterators passed in
/// argument to this function.
///
/// @param a_base the iterator to the base of the first sequence.
///
/// @param a_begin an iterator to the beginning of the sub-region
/// of the first sequence to actually consider.
///
/// @param a_end an iterator to the end of the sub-region of the first
/// sequence to consider.
///
/// @param b_base an iterator to the base of the second sequence to
/// consider.
///
/// @param b_begin an iterator to the beginning of the sub-region
/// of the second sequence to actually consider.
///
/// @param b_end an iterator to the end of the sub-region of the
/// second sequence to actually consider.
///
/// @param lcs the resulting lcs.
///
/// @param ses the resulting shortest editing script.
///
/// The function returns no value; its results are delivered through
/// @p lcs and @p ses.
///
/// Review notes on the definition that follows:
///
/// The body forwards all its arguments to the overload that also
/// reports the length of the edit script.  That length is received
/// in a local variable initialized to 0 and is then discarded.
///
/// NOTE(review): the "<...>" spans of this text (template parameter
/// list, explicit template arguments, element type of the lcs
/// vector) appear to have been lost; restore them from the upstream
/// header before compiling.
template void compute_diff(RandomAccessOutputIterator a_base, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_base, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector& lcs, edit_script& ses) { int ses_len = 0; compute_diff(a_base, a_begin, a_end, b_base, b_begin, b_end, lcs, ses, ses_len); } /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// This uses the LCS algorithm of the paper at section 4b. /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a_start an iterator to the beginning of the first sequence /// to consider. /// /// @param a_end an iterator to the end of the first sequence to /// consider. /// /// @param b_start an iterator to the beginning of the sequence to /// actually consider. /// /// @param b_end an iterator to the end of second sequence to /// consider. /// /// @param lcs the resulting lcs. This is set iff the function /// returns true. /// /// @param ses the resulting shortest editing script. /// /// @return true upon successful completion, false otherwise.
///
/// Review notes on the two definitions that follow:
///
/// The first one passes a_begin and b_begin both as the base and as
/// the beginning of each sequence when forwarding to the overload
/// that takes bases, so the indexes recorded in lcs and ses are
/// relative to a_begin and b_begin.
///
/// The second one forwards its six arguments unchanged to a function
/// named compute_diff.
///
/// NOTE(review): as the text stands, both definitions show the same
/// parameter list.  Presumably they differed by their template
/// parameter lists (one taking an equality functor, the other
/// supplying a default one through explicit template arguments),
/// which were lost along with the other "<...>" spans of this text
/// -- confirm against the upstream header.
template void compute_diff(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector& lcs, edit_script& ses) { compute_diff(a_begin, a_begin, a_end, b_begin, b_begin, b_end, lcs, ses); } template void compute_diff(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector& lcs, edit_script& ses) { compute_diff(a_begin, a_end, b_begin, b_end, lcs, ses); } /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// A sequence is determined by a base, a beginning offset and an end /// offset. The base always points to the container that contains the /// sequence to consider. The beginning offset is an iterator that /// points the beginning of the sub-region of the sequence that we /// actually want to consider. The end offset is an iterator that /// points to the end of the sub-region of the sequence that we /// actually want to consider. /// /// This uses the LCS algorithm of the paper at section 4b. /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a_base the iterator to the base of the first sequence. /// /// @param a_start an iterator to the beginning of the sub-region /// of the first sequence to actually consider.
///
/// Review notes on the definition that follows:
///
/// This overload reports the edit script only.  Its body forwards
/// to the overload that also fills a longest common subsequence,
/// receiving it in a local vector that is discarded on return.
///
/// NOTE(review): the function is declared void, so the "@return"
/// line below does not match the code.
///
/// NOTE(review): the "<...>" spans of this text (template parameter
/// list, explicit template arguments, element type of the local
/// vector) appear to have been lost; restore them from the upstream
/// header before compiling.
/// /// @param a_end an iterator to the end of the sub-region of the first /// sequence to consider. /// /// @param b_base an iterator to the base of the second sequence to /// consider. /// /// @param b_start an iterator to the beginning of the sub-region /// of the second sequence to actually consider. /// /// @param b_end an iterator to the end of the sub-region of the /// second sequence to actually consider. /// /// @param ses the resulting shortest editing script. /// /// @return true upon successful completion, false otherwise. template void compute_diff(RandomAccessOutputIterator a_base, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_base, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, edit_script& ses) { vector lcs; compute_diff(a_base, a_begin, a_end, b_base, b_begin, b_end, lcs, ses); } /// Compute the longest common subsequence of two (sub-regions of) /// sequences as well as the shortest edit script from transforming /// the first (sub-region of) sequence into the second (sub-region of) /// sequence. /// /// This uses the LCS algorithm of the paper at section 4b. /// /// @tparm RandomAccessOutputIterator the type of iterators passed to /// this function. It must be a random access output iterator kind. /// /// @tparm EqualityFunctor this must be a class that declares a public /// call operator member returning a boolean and taking two arguments /// that must be of the same type as the one pointed to by the @ref /// RandomAccessOutputIterator template parameter. This functor is /// used to compare the elements referred to by the iterators pased in /// argument to this function. /// /// @param a_start an iterator to the beginning of the first sequence /// to consider. /// /// @param a_end an iterator to the end of the first sequence to /// consider. /// /// @param b_start an iterator to the beginning of the second sequence /// to consider.
///
/// Review notes on the definitions and declarations that follow:
///
/// The first definition passes a_begin and b_begin both as the base
/// and as the beginning of each sequence when forwarding to the
/// overload that takes bases.  The second one forwards its five
/// arguments unchanged to a function named compute_diff.
///
/// NOTE(review): as the text stands, both definitions show the same
/// parameter list.  Presumably they differed by their template
/// parameter lists, which were lost along with the other "<...>"
/// spans of this text -- confirm against the upstream header.
///
/// NOTE(review): both functions are declared void, so the "@return"
/// line below does not match the code.
///
/// compute_lcs() and compute_ses(), which work on C strings, are
/// only declared after the definitions; their definitions are not in
/// this file section.
/// /// @param b_end an iterator to the end of the second sequence to /// consider. /// /// @param ses the resulting shortest editing script. /// /// @return true upon successful completion, false otherwise. template void compute_diff(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, edit_script& ses) { compute_diff(a_begin, a_begin, a_end, b_begin, b_begin, b_end, ses); } template void compute_diff(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, edit_script& ses) { compute_diff(a_begin, a_end, b_begin, b_end, ses); } void compute_lcs(const char* str1, const char* str2, int &ses_len, string& lcs); void compute_ses(const char* str1, const char* str2, edit_script& ses); /// Display an edit script on standard output. /// /// @param es the edit script to display /// /// @param str1_base the first string the edit script is about. /// /// @pram str2_base the second string the edit script is about.
template void display_edit_script(const edit_script& es, const RandomAccessOutputIterator str1_base, const RandomAccessOutputIterator str2_base, ostream& out) { if (es.num_deletions() == 0) out << "no deletion:\n"; else if (es.num_deletions() == 1) { out << "1 deletion:\n" << "\t happened at index: ";; } else { out << es.num_deletions() << " deletions:\n" << "\t happened at indexes: "; } for (vector::const_iterator i = es.deletions().begin(); i != es.deletions().end(); ++i) { if (i != es.deletions().begin()) out << ", "; out << i->index() << " (" << str1_base[i->index()] << ")"; } out << "\n\n"; if (es.num_insertions() == 0) out << "no insertion\n"; else if (es.num_insertions() == 1) out << "1 insertion\n"; else out << es.num_insertions() << " insertions:\n"; for (vector::const_iterator i = es.insertions().begin(); i != es.insertions().end(); ++i) { int idx = i->insertion_point_index(); if (idx < 0) out << "\t before index of first sequence: " << idx + 1 << " (" << str1_base[idx + 1] << ")\n"; else out << "\t after index of first sequence: " << idx << " (" << str1_base[idx] << ")\n"; if (!i->inserted_indexes().empty()) out << "\t\t inserted indexes from second sequence: "; for (vector::const_iterator j = i->inserted_indexes().begin(); j != i->inserted_indexes().end(); ++j) { if (j != i->inserted_indexes().begin()) out << ", "; out << *j << " (" << str2_base[*j] << ")"; } out << "\n"; } out << "\n\n"; } }//end namespace diff_utils }//end namespace abigail