From 8c52f28864bd2cddab65c135eae49a54d30eef4c Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Sat, 3 Feb 2007 22:52:48 -0600 Subject: [PATCH] Replaced diff.rb with xhtmldiff.rb, which (unlike its predecessor) produces well-formed redline documents. --- lib/diff.rb | 316 ----------- lib/diff/lcs.rb | 1105 +++++++++++++++++++++++++++++++++++++ lib/diff/lcs/array.rb | 21 + lib/diff/lcs/block.rb | 51 ++ lib/diff/lcs/callbacks.rb | 322 +++++++++++ lib/diff/lcs/change.rb | 169 ++++++ lib/diff/lcs/hunk.rb | 257 +++++++++ lib/diff/lcs/ldiff.rb | 226 ++++++++ lib/diff/lcs/string.rb | 19 + lib/page_renderer.rb | 22 +- lib/xhtmldiff.rb | 179 ++++++ test/unit/diff_test.rb | 124 ++--- 12 files changed, 2420 insertions(+), 391 deletions(-) delete mode 100644 lib/diff.rb create mode 100644 lib/diff/lcs.rb create mode 100644 lib/diff/lcs/array.rb create mode 100644 lib/diff/lcs/block.rb create mode 100644 lib/diff/lcs/callbacks.rb create mode 100644 lib/diff/lcs/change.rb create mode 100644 lib/diff/lcs/hunk.rb create mode 100644 lib/diff/lcs/ldiff.rb create mode 100644 lib/diff/lcs/string.rb create mode 100644 lib/xhtmldiff.rb diff --git a/lib/diff.rb b/lib/diff.rb deleted file mode 100644 index 1e3fe298..00000000 --- a/lib/diff.rb +++ /dev/null @@ -1,316 +0,0 @@ -module HTMLDiff - - Match = Struct.new(:start_in_old, :start_in_new, :size) - class Match - def end_in_old - self.start_in_old + self.size - end - - def end_in_new - self.start_in_new + self.size - end - end - - Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new) - - class DiffBuilder - - def initialize(old_version, new_version) - @old_version, @new_version = old_version, new_version - @content = [] - end - - def build - split_inputs_to_words - index_new_words - operations.each { |op| perform_operation(op) } - return @content.join - end - - def split_inputs_to_words - @old_words = convert_html_to_list_of_words(explode(@old_version)) - @new_words = convert_html_to_list_of_words(explode(@new_version)) - end - - def index_new_words - @word_indices = Hash.new { |h, word| h[word] = [] } - @new_words.each_with_index { |word, i| @word_indices[word] << i } - end - - def operations - position_in_old = position_in_new = 0 - operations = [] - - matches = matching_blocks - # an empty match at the end forces the loop below to handle the unmatched tails - # I'm sure it can be done more gracefully, but not at 23:52 - matches << Match.new(@old_words.length, @new_words.length, 0) - - matches.each_with_index do |match, i| - match_starts_at_current_position_in_old = (position_in_old == match.start_in_old) - match_starts_at_current_position_in_new = (position_in_new == match.start_in_new) - - action_upto_match_positions = - case [match_starts_at_current_position_in_old, match_starts_at_current_position_in_new] - when [false, false] - :replace - when [true, false] - :insert - when [false, true] - :delete - else - # this happens if the first few words are same in both versions - :none - end - - if action_upto_match_positions != :none - operation_upto_match_positions = - Operation.new(action_upto_match_positions, - position_in_old, match.start_in_old, - position_in_new, match.start_in_new) - operations << operation_upto_match_positions - end - if match.size != 0 - match_operation = Operation.new(:equal, - match.start_in_old, match.end_in_old, - match.start_in_new, match.end_in_new) - operations << match_operation - end - - position_in_old = match.end_in_old - position_in_new = match.end_in_new - end - - operations - end - - def matching_blocks - matching_blocks = [] - recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, matching_blocks) - matching_blocks - end - - def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks) - match = find_match(start_in_old, end_in_old, start_in_new, end_in_new) - if match - if start_in_old < match.start_in_old and start_in_new < match.start_in_new - recursively_find_matching_blocks( - start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks) - end - matching_blocks << match - if match.end_in_old < end_in_old and match.end_in_new < end_in_new - recursively_find_matching_blocks( - match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks) - end - end - end - - def find_match(start_in_old, end_in_old, start_in_new, end_in_new) - - best_match_in_old = start_in_old - best_match_in_new = start_in_new - best_match_size = 0 - - match_length_at = Hash.new { |h, index| h[index] = 0 } - - start_in_old.upto(end_in_old - 1) do |index_in_old| - - new_match_length_at = Hash.new { |h, index| h[index] = 0 } - - @word_indices[@old_words[index_in_old]].each do |index_in_new| - next if index_in_new < start_in_new - break if index_in_new >= end_in_new - - new_match_length = match_length_at[index_in_new - 1] + 1 - new_match_length_at[index_in_new] = new_match_length - - if new_match_length > best_match_size - best_match_in_old = index_in_old - new_match_length + 1 - best_match_in_new = index_in_new - new_match_length + 1 - best_match_size = new_match_length - end - end - match_length_at = new_match_length_at - end - -# best_match_in_old, best_match_in_new, best_match_size = add_matching_words_left( -# best_match_in_old, best_match_in_new, best_match_size, start_in_old, start_in_new) -# best_match_in_old, best_match_in_new, match_size = add_matching_words_right( -# best_match_in_old, best_match_in_new, best_match_size, end_in_old, end_in_new) - - return (best_match_size != 0 ? Match.new(best_match_in_old, best_match_in_new, best_match_size) : nil) - end - - def add_matching_words_left(match_in_old, match_in_new, match_size, start_in_old, start_in_new) - while match_in_old > start_in_old and - match_in_new > start_in_new and - @old_words[match_in_old - 1] == @new_words[match_in_new - 1] - match_in_old -= 1 - match_in_new -= 1 - match_size += 1 - end - [match_in_old, match_in_new, match_size] - end - - def add_matching_words_right(match_in_old, match_in_new, match_size, end_in_old, end_in_new) - while match_in_old + match_size < end_in_old and - match_in_new + match_size < end_in_new and - @old_words[match_in_old + match_size] == @new_words[match_in_new + match_size] - match_size += 1 - end - [match_in_old, match_in_new, match_size] - end - - VALID_METHODS = [:replace, :insert, :delete, :equal] - - def perform_operation(operation) - @operation = operation - self.send operation.action, operation - end - - def replace(operation) - delete(operation, 'diffmod') - insert(operation, 'diffmod') - end - - def insert(operation, tagclass = 'diffins') - insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new]) - end - - def delete(operation, tagclass = 'diffdel') - insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old]) - end - - def equal(operation) - # no tags to insert, simply copy the matching words from one of the versions - @content += @new_words[operation.start_in_new...operation.end_in_new] - end - - def opening_tag?(item) - item =~ %r!^\s*<[^>]+>\s*$! - end - - def closing_tag?(item) - item =~ %r!^\s*]+>\s*$! - end - - def tag?(item) - opening_tag?(item) or closing_tag?(item) - end - - def extract_consecutive_words(words, &condition) - index_of_first_tag = nil - words.each_with_index do |word, i| - if !condition.call(word) - index_of_first_tag = i - break - end - end - if index_of_first_tag - return words.slice!(0...index_of_first_tag) - else - return words.slice!(0..words.length) - end - end - - # This method encloses words within a specified tag (ins or del), and adds this into @content, - # with a twist: if there are words contain tags, it actually creates multiple ins or del, - # so that they don't include any ins or del. This handles cases like - # old: '

a

' - # new: '

ab

c' - # diff result: '

ab

c

' - # this still doesn't guarantee valid HTML (hint: think about diffing a text containing ins or - # del tags), but handles correctly more cases than the earlier version. - # - # P.S.: Spare a thought for people who write HTML browsers. They live in this ... every day. - - def insert_tag(tagname, cssclass, words) - loop do - break if words.empty? - non_tags = extract_consecutive_words(words) { |word| not tag?(word) } - @content << wrap_text(non_tags.join, tagname, cssclass) unless non_tags.empty? - - break if words.empty? - @content += extract_consecutive_words(words) { |word| tag?(word) } - end - end - - def wrap_text(text, tagname, cssclass) - %(<#{tagname} class="#{cssclass}">#{text}) - end - - def explode(sequence) - sequence.is_a?(String) ? sequence.split(//) : sequence - end - - def end_of_tag?(char) - char == '>' - end - - def start_of_tag?(char) - char == '<' - end - - def whitespace?(char) - char =~ /\s/ - end - - def convert_html_to_list_of_words(x, use_brackets = false) - mode = :char - current_word = '' - words = [] - - explode(x).each do |char| - case mode - when :tag - if end_of_tag? char - current_word << (use_brackets ? ']' : '>') - words << current_word - current_word = '' - if whitespace?(char) - mode = :whitespace - else - mode = :char - end - else - current_word << char - end - when :char - if start_of_tag? char - words << current_word unless current_word.empty? - current_word = (use_brackets ? '[' : '<') - mode = :tag - elsif /\s/.match char - words << current_word unless current_word.empty? - current_word = char - mode = :whitespace - else - current_word << char - end - when :whitespace - if start_of_tag? char - words << current_word unless current_word.empty? - current_word = (use_brackets ? '[' : '<') - mode = :tag - elsif /\s/.match char - current_word << char - else - words << current_word unless current_word.empty? - current_word = char - mode = :char - end - else - raise "Unknown mode #{mode.inspect}" - end - end - words << current_word unless current_word.empty? - words - end - - end # of class Diff Builder - - def diff(a, b) - DiffBuilder.new(a, b).build - end - -end diff --git a/lib/diff/lcs.rb b/lib/diff/lcs.rb new file mode 100644 index 00000000..d4f4d5a8 --- /dev/null +++ b/lib/diff/lcs.rb @@ -0,0 +1,1105 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified +# under the terms of the GPL version 2 (or later), the Perl Artistic +# licence, or the Ruby licence. +# +# $Id: lcs.rb,v 1.9 2004/10/17 20:31:10 austin Exp $ +#++ + +module Diff + # = Diff::LCS 1.1.2 + # Computes "intelligent" differences between two sequenced Enumerables. + # This is an implementation of the McIlroy-Hunt "diff" algorithm for + # Enumerable objects that include Diffable. + # + # Based on Mario I. Wolczko's Smalltalk version + # (1.2, 1993) and Ned Konz's Perl version + # (Algorithm::Diff). + # + # == Synopsis + # require 'diff/lcs' + # + # seq1 = %w(a b c e h j l m n p) + # seq2 = %w(b c d e f j k l m r s t) + # + # lcs = Diff::LCS.LCS(seq1, seq2) + # diffs = Diff::LCS.diff(seq1, seq2) + # sdiff = Diff::LCS.sdiff(seq1, seq2) + # seq = Diff::LCS.traverse_sequences(seq1, seq2, callback_obj) + # bal = Diff::LCS.traverse_balanced(seq1, seq2, callback_obj) + # seq2 == Diff::LCS.patch(seq1, diffs) + # seq2 == Diff::LCS.patch!(seq1, diffs) + # seq1 == Diff::LCS.unpatch(seq2, diffs) + # seq1 == Diff::LCS.unpatch!(seq2, diffs) + # seq2 == Diff::LCS.patch(seq1, sdiff) + # seq2 == Diff::LCS.patch!(seq1, sdiff) + # seq1 == Diff::LCS.unpatch(seq2, sdiff) + # seq1 == Diff::LCS.unpatch!(seq2, sdiff) + # + # Alternatively, objects can be extended with Diff::LCS: + # + # seq1.extend(Diff::LCS) + # lcs = seq1.lcs(seq2) + # diffs = seq1.diff(seq2) + # sdiff = seq1.sdiff(seq2) + # seq = seq1.traverse_sequences(seq2, callback_obj) + # bal = seq1.traverse_balanced(seq2, callback_obj) + # seq2 == seq1.patch(diffs) + # seq2 == seq1.patch!(diffs) + # seq1 == seq2.unpatch(diffs) + # seq1 == seq2.unpatch!(diffs) + # seq2 == seq1.patch(sdiff) + # seq2 == seq1.patch!(sdiff) + # seq1 == seq2.unpatch(sdiff) + # seq1 == seq2.unpatch!(sdiff) + # + # Default extensions are provided for Array and String objects through + # the use of 'diff/lcs/array' and 'diff/lcs/string'. + # + # == Introduction (by Mark-Jason Dominus) + # + # The following text is from the Perl documentation. The only + # changes have been to make the text appear better in Rdoc. + # + # I once read an article written by the authors of +diff+; they said + # that they hard worked very hard on the algorithm until they found the + # right one. + # + # I think what they ended up using (and I hope someone will correct me, + # because I am not very confident about this) was the `longest common + # subsequence' method. In the LCS problem, you have two sequences of + # items: + # + # a b c d f g h j q z + # a b c d e f g i j k r x y z + # + # and you want to find the longest sequence of items that is present in + # both original sequences in the same order. That is, you want to find a + # new sequence *S* which can be obtained from the first sequence by + # deleting some items, and from the second sequence by deleting other + # items. You also want *S* to be as long as possible. In this case *S* + # is: + # + # a b c d f g j z + # + # From there it's only a small step to get diff-like output: + # + # e h i k q r x y + # + - + + - + + + + # + # This module solves the LCS problem. It also includes a canned function + # to generate +diff+-like output. + # + # It might seem from the example above that the LCS of two sequences is + # always pretty obvious, but that's not always the case, especially when + # the two sequences have many repeated elements. For example, consider + # + # a x b y c z p d q + # a b c a x b y c z + # + # A naive approach might start by matching up the +a+ and +b+ that + # appear at the beginning of each sequence, like this: + # + # a x b y c z p d q + # a b c a b y c z + # + # This finds the common subsequence +a b c z+. But actually, the LCS is + # +a x b y c z+: + # + # a x b y c z p d q + # a b c a x b y c z + # + # == Author + # This version is by Austin Ziegler . + # + # It is based on the Perl Algorithm::Diff by Ned Konz + # , copyright © 2000 - 2002 and the Smalltalk + # diff version by Mario I. Wolczko , copyright © + # 1993. Documentation includes work by Mark-Jason Dominus. + # + # == Licence + # Copyright © 2004 Austin Ziegler + # This program is free software; you can redistribute it and/or modify it + # under the same terms as Ruby, or alternatively under the Perl Artistic + # licence. + # + # == Credits + # Much of the documentation is taken directly from the Perl + # Algorithm::Diff implementation and was written originally by Mark-Jason + # Dominus and later by Ned Konz. The basic Ruby + # implementation was re-ported from the Smalltalk implementation, available + # at ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st + # + # #sdiff and #traverse_balanced were written for the Perl version by Mike + # Schilli . + # + # "The algorithm is described in A Fast Algorithm for Computing Longest + # Common Subsequences, CACM, vol.20, no.5, pp.350-353, May 1977, with + # a few minor improvements to improve the speed." + module LCS + VERSION = '1.1.2' + end +end + +require 'diff/lcs/callbacks' + +module Diff::LCS + # Returns an Array containing the longest common subsequence(s) between + # +self+ and +other+. See Diff::LCS#LCS. + # + # lcs = seq1.lcs(seq2) + def lcs(other, &block) #:yields self[ii] if there are matched subsequences: + Diff::LCS.LCS(self, other, &block) + end + + # Returns the difference set between +self+ and +other+. See + # Diff::LCS#diff. + def diff(other, callbacks = nil, &block) + Diff::LCS::diff(self, other, callbacks, &block) + end + + # Returns the balanced ("side-by-side") difference set between +self+ and + # +other+. See Diff::LCS#sdiff. + def sdiff(other, callbacks = nil, &block) + Diff::LCS::sdiff(self, other, callbacks, &block) + end + + # Traverses the discovered longest common subsequences between +self+ and + # +other+. See Diff::LCS#traverse_sequences. + def traverse_sequences(other, callbacks = nil, &block) + traverse_sequences(self, other, callbacks || Diff::LCS::YieldingCallbacks, + &block) + end + + # Traverses the discovered longest common subsequences between +self+ and + # +other+ using the alternate, balanced algorithm. See + # Diff::LCS#traverse_balanced. + def traverse_balanced(other, callbacks = nil, &block) + traverse_balanced(self, other, callbacks || Diff::LCS::YieldingCallbacks, + &block) + end + + # Attempts to patch a copy of +self+ with the provided +patchset+. See + # Diff::LCS#patch. + def patch(patchset) + Diff::LCS::patch(self.dup, patchset) + end + + # Attempts to unpatch a copy of +self+ with the provided +patchset+. + # See Diff::LCS#patch. + def unpatch(patchset) + Diff::LCS::unpatch(self.dup, patchset) + end + + # Attempts to patch +self+ with the provided +patchset+. See + # Diff::LCS#patch!. Does no autodiscovery. + def patch!(patchset) + Diff::LCS::patch!(self, patchset) + end + + # Attempts to unpatch +self+ with the provided +patchset+. See + # Diff::LCS#unpatch. Does no autodiscovery. + def unpatch!(patchset) + Diff::LCS::unpatch!(self, patchset) + end +end + +module Diff::LCS + class << self + # Given two sequenced Enumerables, LCS returns an Array containing their + # longest common subsequences. + # + # lcs = Diff::LCS.LCS(seq1, seq2) + # + # This array whose contents is such that: + # + # lcs.each_with_index do |ee, ii| + # assert(ee.nil? || (seq1[ii] == seq2[ee])) + # end + # + # If a block is provided, the matching subsequences will be yielded from + # +seq1+ in turn and may be modified before they are placed into the + # returned Array of subsequences. + def LCS(seq1, seq2, &block) #:yields seq1[ii] for each matched: + matches = Diff::LCS.__lcs(seq1, seq2) + ret = [] + matches.each_with_index do |ee, ii| + unless matches[ii].nil? + if block_given? + ret << (yield seq1[ii]) + else + ret << seq1[ii] + end + end + end + ret + end + + # Diff::LCS.diff computes the smallest set of additions and deletions + # necessary to turn the first sequence into the second, and returns a + # description of these changes. + # + # See Diff::LCS::DiffCallbacks for the default behaviour. An alternate + # behaviour may be implemented with Diff::LCS::ContextDiffCallbacks. + # If a Class argument is provided for +callbacks+, #diff will attempt + # to initialise it. If the +callbacks+ object (possibly initialised) + # responds to #finish, it will be called. + def diff(seq1, seq2, callbacks = nil, &block) # :yields diff changes: + callbacks ||= Diff::LCS::DiffCallbacks + if callbacks.kind_of?(Class) + cb = callbacks.new rescue callbacks + callbacks = cb + end + traverse_sequences(seq1, seq2, callbacks) + callbacks.finish if callbacks.respond_to?(:finish) + + if block_given? + res = callbacks.diffs.map do |hunk| + if hunk.kind_of?(Array) + hunk = hunk.map { |block| yield block } + else + yield hunk + end + end + res + else + callbacks.diffs + end + end + + # Diff::LCS.sdiff computes all necessary components to show two sequences + # and their minimized differences side by side, just like the Unix + # utility sdiff does: + # + # old < - + # same same + # before | after + # - > new + # + # See Diff::LCS::SDiffCallbacks for the default behaviour. An alternate + # behaviour may be implemented with Diff::LCS::ContextDiffCallbacks. If + # a Class argument is provided for +callbacks+, #diff will attempt to + # initialise it. If the +callbacks+ object (possibly initialised) + # responds to #finish, it will be called. + def sdiff(seq1, seq2, callbacks = nil, &block) #:yields diff changes: + callbacks ||= Diff::LCS::SDiffCallbacks + if callbacks.kind_of?(Class) + cb = callbacks.new rescue callbacks + callbacks = cb + end + traverse_balanced(seq1, seq2, callbacks) + callbacks.finish if callbacks.respond_to?(:finish) + + if block_given? + res = callbacks.diffs.map do |hunk| + if hunk.kind_of?(Array) + hunk = hunk.map { |block| yield block } + else + yield hunk + end + end + res + else + callbacks.diffs + end + end + + # Diff::LCS.traverse_sequences is the most general facility provided by this + # module; +diff+ and +LCS+ are implemented as calls to it. + # + # The arguments to #traverse_sequences are the two sequences to + # traverse, and a callback object, like this: + # + # traverse_sequences(seq1, seq2, Diff::LCS::ContextDiffCallbacks.new) + # + # #diff is implemented with #traverse_sequences. + # + # == Callback Methods + # Optional callback methods are emphasized. + # + # callbacks#match:: Called when +a+ and +b+ are pointing + # to common elements in +A+ and +B+. + # callbacks#discard_a:: Called when +a+ is pointing to an + # element not in +B+. + # callbacks#discard_b:: Called when +b+ is pointing to an + # element not in +A+. + # callbacks#finished_a:: Called when +a+ has reached the end of + # sequence +A+. + # callbacks#finished_b:: Called when +b+ has reached the end of + # sequence +B+. + # + # == Algorithm + # a---+ + # v + # A = a b c e h j l m n p + # B = b c d e f j k l m r s t + # ^ + # b---+ + # + # If there are two arrows (+a+ and +b+) pointing to elements of + # sequences +A+ and +B+, the arrows will initially point to the first + # elements of their respective sequences. #traverse_sequences will + # advance the arrows through the sequences one element at a time, + # calling a method on the user-specified callback object before each + # advance. It will advance the arrows in such a way that if there are + # elements A[ii] and B[jj] which are both equal and + # part of the longest common subsequence, there will be some moment + # during the execution of #traverse_sequences when arrow +a+ is pointing + # to A[ii] and arrow +b+ is pointing to B[jj]. When + # this happens, #traverse_sequences will call callbacks#match + # and then it will advance both arrows. + # + # Otherwise, one of the arrows is pointing to an element of its sequence + # that is not part of the longest common subsequence. + # #traverse_sequences will advance that arrow and will call + # callbacks#discard_a or callbacks#discard_b, depending + # on which arrow it advanced. If both arrows point to elements that are + # not part of the longest common subsequence, then #traverse_sequences + # will advance one of them and call the appropriate callback, but it is + # not specified which it will call. + # + # The methods for callbacks#match, callbacks#discard_a, + # and callbacks#discard_b are invoked with an event comprising + # the action ("=", "+", or "-", respectively), the indicies +ii+ and + # +jj+, and the elements A[ii] and B[jj]. Return + # values are discarded by #traverse_sequences. + # + # === End of Sequences + # If arrow +a+ reaches the end of its sequence before arrow +b+ does, + # #traverse_sequence try to call callbacks#finished_a with the + # last index and element of +A+ (A[-1]) and the current index + # and element of +B+ (B[jj]). If callbacks#finished_a + # does not exist, then callbacks#discard_b will be called on + # each element of +B+ until the end of the sequence is reached (the call + # will be done with A[-1] and B[jj] for each element). + # + # If +b+ reaches the end of +B+ before +a+ reaches the end of +A+, + # callbacks#finished_b will be called with the current index + # and element of +A+ (A[ii]) and the last index and element of + # +B+ (A[-1]). Again, if callbacks#finished_b does not + # exist on the callback object, then callbacks#discard_a will + # be called on each element of +A+ until the end of the sequence is + # reached (A[ii] and B[-1]). + # + # There is a chance that one additional callbacks#discard_a or + # callbacks#discard_b will be called after the end of the + # sequence is reached, if +a+ has not yet reached the end of +A+ or +b+ + # has not yet reached the end of +B+. + def traverse_sequences(seq1, seq2, callbacks = Diff::LCS::SequenceCallbacks, &block) #:yields change events: + matches = Diff::LCS.__lcs(seq1, seq2) + + run_finished_a = run_finished_b = false + string = seq1.kind_of?(String) + + a_size = seq1.size + b_size = seq2.size + ai = bj = 0 + + (0 .. matches.size).each do |ii| + b_line = matches[ii] + + ax = string ? seq1[ii, 1] : seq1[ii] + bx = string ? seq2[bj, 1] : seq2[bj] + + if b_line.nil? + unless ax.nil? + event = Diff::LCS::ContextChange.new('-', ii, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + end + else + loop do + break unless bj < b_line + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('+', ii, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('=', ii, ax, bj, bx) + event = yield event if block_given? + callbacks.match(event) + bj += 1 + end + ai = ii + end + ai += 1 + + # The last entry (if any) processed was a match. +ai+ and +bj+ point + # just past the last matching lines in their sequences. + while (ai < a_size) or (bj < b_size) + # last A? + if ai == a_size and bj < b_size + if callbacks.respond_to?(:finished_a) and not run_finished_a + ax = string ? seq1[-1, 1] : seq1[-1] + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('>', (a_size - 1), ax, bj, bx) + event = yield event if block_given? + callbacks.finished_a(event) + run_finished_a = true + else + ax = string ? seq1[ai, 1] : seq1[ai] + loop do + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + break unless bj < b_size + end + end + end + + # last B? + if bj == b_size and ai < a_size + if callbacks.respond_to?(:finished_b) and not run_finished_b + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[-1, 1] : seq2[-1] + event = Diff::LCS::ContextChange.new('<', ai, ax, (b_size - 1), bx) + event = yield event if block_given? + callbacks.finished_b(event) + run_finished_b = true + else + bx = string ? seq2[bj, 1] : seq2[bj] + loop do + ax = string ? seq1[ai, 1] : seq1[ai] + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + break unless bj < b_size + end + end + end + + if ai < a_size + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + end + + if bj < b_size + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + end + end + + # #traverse_balanced is an alternative to #traverse_sequences. It + # uses a different algorithm to iterate through the entries in the + # computed longest common subsequence. Instead of viewing the changes as + # insertions or deletions from one of the sequences, #traverse_balanced + # will report changes between the sequences. To represent a + # + # The arguments to #traverse_balanced are the two sequences to traverse + # and a callback object, like this: + # + # traverse_balanced(seq1, seq2, Diff::LCS::ContextDiffCallbacks.new) + # + # #sdiff is implemented with #traverse_balanced. + # + # == Callback Methods + # Optional callback methods are emphasized. + # + # callbacks#match:: Called when +a+ and +b+ are pointing + # to common elements in +A+ and +B+. + # callbacks#discard_a:: Called when +a+ is pointing to an + # element not in +B+. + # callbacks#discard_b:: Called when +b+ is pointing to an + # element not in +A+. + # callbacks#change:: Called when +a+ and +b+ are pointing + # to the same relative position, but + # A[a] and B[b] are + # not the same; a change has + # occurred. + # + # #traverse_balanced might be a bit slower than #traverse_sequences, + # noticable only while processing huge amounts of data. + # + # The +sdiff+ function of this module is implemented as call to + # #traverse_balanced. + # + # == Algorithm + # a---+ + # v + # A = a b c e h j l m n p + # B = b c d e f j k l m r s t + # ^ + # b---+ + # + # === Matches + # If there are two arrows (+a+ and +b+) pointing to elements of + # sequences +A+ and +B+, the arrows will initially point to the first + # elements of their respective sequences. #traverse_sequences will + # advance the arrows through the sequences one element at a time, + # calling a method on the user-specified callback object before each + # advance. It will advance the arrows in such a way that if there are + # elements A[ii] and B[jj] which are both equal and + # part of the longest common subsequence, there will be some moment + # during the execution of #traverse_sequences when arrow +a+ is pointing + # to A[ii] and arrow +b+ is pointing to B[jj]. When + # this happens, #traverse_sequences will call callbacks#match + # and then it will advance both arrows. + # + # === Discards + # Otherwise, one of the arrows is pointing to an element of its sequence + # that is not part of the longest common subsequence. + # #traverse_sequences will advance that arrow and will call + # callbacks#discard_a or callbacks#discard_b, + # depending on which arrow it advanced. + # + # === Changes + # If both +a+ and +b+ point to elements that are not part of the longest + # common subsequence, then #traverse_sequences will try to call + # callbacks#change and advance both arrows. If + # callbacks#change is not implemented, then + # callbacks#discard_a and callbacks#discard_b will be + # called in turn. + # + # The methods for callbacks#match, callbacks#discard_a, + # callbacks#discard_b, and callbacks#change are + # invoked with an event comprising the action ("=", "+", "-", or "!", + # respectively), the indicies +ii+ and +jj+, and the elements + # A[ii] and B[jj]. Return values are discarded by + # #traverse_balanced. + # + # === Context + # Note that +ii+ and +jj+ may not be the same index position, even if + # +a+ and +b+ are considered to be pointing to matching or changed + # elements. + def traverse_balanced(seq1, seq2, callbacks = Diff::LCS::BalancedCallbacks) + matches = Diff::LCS.__lcs(seq1, seq2) + a_size = seq1.size + b_size = seq2.size + ai = bj = mb = 0 + ma = -1 + string = seq1.kind_of?(String) + + # Process all the lines in the match vector. + loop do + # Find next match indices +ma+ and +mb+ + loop do + ma += 1 + break unless ma < matches.size and matches[ma].nil? + end + + break if ma >= matches.size # end of matches? + mb = matches[ma] + + # Change(seq2) + while (ai < ma) or (bj < mb) + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[bj, 1] : seq2[bj] + + case [(ai < ma), (bj < mb)] + when [true, true] + if callbacks.respond_to?(:change) + event = Diff::LCS::ContextChange.new('!', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.change(event) + ai += 1 + bj += 1 + else + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + ax = string ? seq1[ai, 1] : seq1[ai] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + when [true, false] + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + when [false, true] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + end + + # Match + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[bj, 1] : seq2[bj] + event = Diff::LCS::ContextChange.new('=', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.match(event) + ai += 1 + bj += 1 + end + + while (ai < a_size) or (bj < b_size) + ax = string ? seq1[ai, 1] : seq1[ai] + bx = string ? seq2[bj, 1] : seq2[bj] + + case [(ai < a_size), (bj < b_size)] + when [true, true] + if callbacks.respond_to?(:change) + event = Diff::LCS::ContextChange.new('!', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.change(event) + ai += 1 + bj += 1 + else + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + ax = string ? seq1[ai, 1] : seq1[ai] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + when [true, false] + event = Diff::LCS::ContextChange.new('-', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_a(event) + ai += 1 + when [false, true] + event = Diff::LCS::ContextChange.new('+', ai, ax, bj, bx) + event = yield event if block_given? + callbacks.discard_b(event) + bj += 1 + end + end + end + + PATCH_MAP = { #:nodoc: + :patch => { '+' => '+', '-' => '-', '!' => '!', '=' => '=' }, + :unpatch => { '+' => '-', '-' => '+', '!' => '!', '=' => '=' } + } + + # Given a patchset, convert the current version to the new + # version. If +direction+ is not specified (must be + # :patch or :unpatch), then discovery of the + # direction of the patch will be attempted. + def patch(src, patchset, direction = nil) + string = src.kind_of?(String) + # Start with a new empty type of the source's class + res = src.class.new + + # Normalize the patchset. + patchset = __normalize_patchset(patchset) + + direction ||= Diff::LCS.__diff_direction(src, patchset) + direction ||= :patch + + ai = bj = 0 + + patchset.each do |change| + # Both Change and ContextChange support #action + action = PATCH_MAP[direction][change.action] + + case change + when Diff::LCS::ContextChange + case direction + when :patch + el = change.new_element + op = change.old_position + np = change.new_position + when :unpatch + el = change.old_element + op = change.new_position + np = change.old_position + end + + case action + when '-' # Remove details from the old string + while ai < op + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + ai += 1 + when '+' + while bj < np + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + + res << el + bj += 1 + when '=' + # This only appears in sdiff output with the SDiff callback. + # Therefore, we only need to worry about dealing with a single + # element. + res << el + + ai += 1 + bj += 1 + when '!' + while ai < op + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + + bj += 1 + ai += 1 + + res << el + end + when Diff::LCS::Change + case action + when '-' + while ai < change.position + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + ai += 1 + when '+' + while bj < change.position + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + + bj += 1 + + res << change.element + end + end + end + + while ai < src.size + res << (string ? src[ai, 1] : src[ai]) + ai += 1 + bj += 1 + end + + res + end + + # Given a set of patchset, convert the current version to the prior + # version. Does no auto-discovery. + def unpatch!(src, patchset) + Diff::LCS.patch(src, patchset, :unpatch) + end + + # Given a set of patchset, convert the current version to the next + # version. Does no auto-discovery. + def patch!(src, patchset) + Diff::LCS.patch(src, patchset, :patch) + end + +# private + # Compute the longest common subsequence between the sequenced Enumerables + # +a+ and +b+. The result is an array whose contents is such that + # + # result = Diff::LCS.__lcs(a, b) + # result.each_with_index do |e, ii| + # assert_equal(a[ii], b[e]) unless e.nil? + # end + def __lcs(a, b) + a_start = b_start = 0 + a_finish = a.size - 1 + b_finish = b.size - 1 + vector = [] + + # Prune off any common elements at the beginning... + while (a_start <= a_finish) and + (b_start <= b_finish) and + (a[a_start] == b[b_start]) + vector[a_start] = b_start + a_start += 1 + b_start += 1 + end + + # Now the end... + while (a_start <= a_finish) and + (b_start <= b_finish) and + (a[a_finish] == b[b_finish]) + vector[a_finish] = b_finish + a_finish -= 1 + b_finish -= 1 + end + + # Now, compute the equivalence classes of positions of elements. + b_matches = Diff::LCS.__position_hash(b, b_start .. b_finish) + + thresh = [] + links = [] + + (a_start .. a_finish).each do |ii| + ai = a.kind_of?(String) ? a[ii, 1] : a[ii] + bm = b_matches[ai] + kk = nil + bm.reverse_each do |jj| + if kk and (thresh[kk] > jj) and (thresh[kk - 1] < jj) + thresh[kk] = jj + else + kk = Diff::LCS.__replace_next_larger(thresh, jj, kk) + end + links[kk] = [ (kk > 0) ? links[kk - 1] : nil, ii, jj ] unless kk.nil? + end + end + + unless thresh.empty? + link = links[thresh.size - 1] + while not link.nil? + vector[link[1]] = link[2] + link = link[0] + end + end + + vector + end + + # Find the place at which +value+ would normally be inserted into the + # Enumerable. If that place is already occupied by +value+, do nothing + # and return +nil+. If the place does not exist (i.e., it is off the end + # of the Enumerable), add it to the end. Otherwise, replace the element + # at that point with +value+. It is assumed that the Enumerable's values + # are numeric. + # + # This operation preserves the sort order. + def __replace_next_larger(enum, value, last_index = nil) + # Off the end? + if enum.empty? or (value > enum[-1]) + enum << value + return enum.size - 1 + end + + # Binary search for the insertion point + last_index ||= enum.size + first_index = 0 + while (first_index <= last_index) + ii = (first_index + last_index) >> 1 + + found = enum[ii] + + if value == found + return nil + elsif value > found + first_index = ii + 1 + else + last_index = ii - 1 + end + end + + # The insertion point is in first_index; overwrite the next larger + # value. + enum[first_index] = value + return first_index + end + + # If +vector+ maps the matching elements of another collection onto this + # Enumerable, compute the inverse +vector+ that maps this Enumerable + # onto the collection. (Currently unused.) + def __inverse_vector(a, vector) + inverse = a.dup + (0 ... vector.size).each do |ii| + inverse[vector[ii]] = ii unless vector[ii].nil? + end + inverse + end + + # Returns a hash mapping each element of an Enumerable to the set of + # positions it occupies in the Enumerable, optionally restricted to the + # elements specified in the range of indexes specified by +interval+. + def __position_hash(enum, interval = 0 .. -1) + hash = Hash.new { |hh, kk| hh[kk] = [] } + interval.each do |ii| + kk = enum.kind_of?(String) ? enum[ii, 1] : enum[ii] + hash[kk] << ii + end + hash + end + + # Examine the patchset and the source to see in which direction the + # patch should be applied. + # + # WARNING: By default, this examines the whole patch, so this could take + # some time. This also works better with Diff::LCS::ContextChange or + # Diff::LCS::Change as its source, as an array will cause the creation + # of one of the above. + def __diff_direction(src, patchset, limit = nil) + count = left = left_miss = right = right_miss = 0 + string = src.kind_of?(String) + + patchset.each do |change| + count += 1 + + case change + when Diff::LCS::Change + # With a simplistic change, we can't tell the difference between + # the left and right on '!' actions, so we ignore those. On '=' + # actions, if there's a miss, we miss both left and right. + element = string ? src[change.position, 1] : src[change.position] + + case change.action + when '-' + if element == change.element + left += 1 + else + left_miss += 1 + end + when '+' + if element == change.element + right += 1 + else + right_miss += 1 + end + when '=' + if element != change.element + left_miss += 1 + right_miss += 1 + end + end + when Diff::LCS::ContextChange + case change.action + when '-' # Remove details from the old string + element = string ? src[change.old_position, 1] : src[change.old_position] + if element == change.old_element + left += 1 + else + left_miss += 1 + end + when '+' + element = string ? src[change.new_position, 1] : src[change.new_position] + if element == change.new_element + right += 1 + else + right_miss += 1 + end + when '=' + le = string ? src[change.old_position, 1] : src[change.old_position] + re = string ? src[change.new_position, 1] : src[change.new_position] + + left_miss += 1 if le != change.old_element + right_miss += 1 if re != change.new_element + when '!' + element = string ? src[change.old_position, 1] : src[change.old_position] + if element == change.old_element + left += 1 + else + element = string ? src[change.new_position, 1] : src[change.new_position] + if element == change.new_element + right += 1 + else + left_miss += 1 + right_miss += 1 + end + end + end + end + + break if not limit.nil? and count > limit + end + + no_left = (left == 0) and (left_miss >= 0) + no_right = (right == 0) and (right_miss >= 0) + + case [no_left, no_right] + when [false, true] + return :patch + when [true, false] + return :unpatch + else + raise "The provided patchset does not appear to apply to the provided value as either source or destination value." + end + end + + # Normalize the patchset. A patchset is always a sequence of changes, but + # how those changes are represented may vary, depending on how they were + # generated. In all cases we support, we also support the array + # representation of the changes. The formats are: + # + # [ # patchset <- Diff::LCS.diff(a, b) + # [ # one or more hunks + # Diff::LCS::Change # one or more changes + # ] ] + # + # [ # patchset, equivalent to the above + # [ # one or more hunks + # [ action, line, value ] # one or more changes + # ] ] + # + # [ # patchset <- Diff::LCS.diff(a, b, Diff::LCS::ContextDiffCallbacks) + # # OR <- Diff::LCS.sdiff(a, b, Diff::LCS::ContextDiffCallbacks) + # [ # one or more hunks + # Diff::LCS::ContextChange # one or more changes + # ] ] + # + # [ # patchset, equivalent to the above + # [ # one or more hunks + # [ action, [ old line, old value ], [ new line, new value ] ] + # # one or more changes + # ] ] + # + # [ # patchset <- Diff::LCS.sdiff(a, b) + # # OR <- Diff::LCS.diff(a, b, Diff::LCS::SDiffCallbacks) + # Diff::LCS::ContextChange # one or more changes + # ] + # + # [ # patchset, equivalent to the above + # [ action, [ old line, old value ], [ new line, new value ] ] + # # one or more changes + # ] + # + # The result of this will be either of the following. + # + # [ # patchset + # Diff::LCS::ContextChange # one or more changes + # ] + # + # [ # patchset + # Diff::LCS::Change # one or more changes + # ] + # + # If either of the above is provided, it will be returned as such. + # + def __normalize_patchset(patchset) + patchset.map do |hunk| + case hunk + when Diff::LCS::ContextChange, Diff::LCS::Change + hunk + when Array + if (not hunk[0].kind_of?(Array)) and hunk[1].kind_of?(Array) and hunk[2].kind_of?(Array) + Diff::LCS::ContextChange.from_a(hunk) + else + hunk.map do |change| + case change + when Diff::LCS::ContextChange, Diff::LCS::Change + change + when Array + # change[1] will ONLY be an array in a ContextChange#to_a call. + # In Change#to_a, it represents the line (singular). + if change[1].kind_of?(Array) + Diff::LCS::ContextChange.from_a(change) + else + Diff::LCS::Change.from_a(change) + end + end + end + end + else + raise ArgumentError, "Cannot normalise a hunk of class #{hunk.class}." + end + end.flatten + end + end +end diff --git a/lib/diff/lcs/array.rb b/lib/diff/lcs/array.rb new file mode 100644 index 00000000..34bf1cb1 --- /dev/null +++ b/lib/diff/lcs/array.rb @@ -0,0 +1,21 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: array.rb,v 1.3 2004/08/08 20:33:09 austin Exp $ +#++ +# Includes Diff::LCS into the Array built-in class. + +require 'diff/lcs' + +class Array + include Diff::LCS +end diff --git a/lib/diff/lcs/block.rb b/lib/diff/lcs/block.rb new file mode 100644 index 00000000..20ecce3b --- /dev/null +++ b/lib/diff/lcs/block.rb @@ -0,0 +1,51 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: block.rb,v 1.3 2004/08/08 20:33:09 austin Exp $ +#++ +# Contains Diff::LCS::Block for bin/ldiff. + + # A block is an operation removing, adding, or changing a group of items. + # Basically, this is just a list of changes, where each change adds or + # deletes a single item. Used by bin/ldiff. +class Diff::LCS::Block + attr_reader :changes, :insert, :remove + + def initialize(chunk) + @changes = [] + @insert = [] + @remove = [] + + chunk.each do |item| + @changes << item + @remove << item if item.deleting? + @insert << item if item.adding? + end + end + + def diff_size + @insert.size - @remove.size + end + + def op + case [@remove.empty?, @insert.empty?] + when [false, false] + '!' + when [false, true] + '-' + when [true, false] + '+' + else # [true, true] + '^' + end + end +end diff --git a/lib/diff/lcs/callbacks.rb b/lib/diff/lcs/callbacks.rb new file mode 100644 index 00000000..4c3e40e1 --- /dev/null +++ b/lib/diff/lcs/callbacks.rb @@ -0,0 +1,322 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: callbacks.rb,v 1.4 2004/09/14 18:51:26 austin Exp $ +#++ +# Contains definitions for all default callback objects. + +require 'diff/lcs/change' + +module Diff::LCS + # This callback object implements the default set of callback events, which + # only returns the event itself. Note that #finished_a and #finished_b are + # not implemented -- I haven't yet figured out where they would be useful. + # + # Note that this is intended to be called as is, e.g., + # + # Diff::LCS.LCS(seq1, seq2, Diff::LCS::DefaultCallbacks) + class DefaultCallbacks + class << self + # Called when two items match. + def match(event) + event + end + # Called when the old value is discarded in favour of the new value. + def discard_a(event) + event + end + # Called when the new value is discarded in favour of the old value. + def discard_b(event) + event + end + # Called when both the old and new values have changed. + def change(event) + event + end + + private :new + end + end + + # An alias for DefaultCallbacks that is used in Diff::LCS#traverse_sequences. + # + # Diff::LCS.LCS(seq1, seq2, Diff::LCS::SequenceCallbacks) + SequenceCallbacks = DefaultCallbacks + # An alias for DefaultCallbacks that is used in Diff::LCS#traverse_balanced. + # + # Diff::LCS.LCS(seq1, seq2, Diff::LCS::BalancedCallbacks) + BalancedCallbacks = DefaultCallbacks +end + + # This will produce a compound array of simple diff change objects. Each + # element in the #diffs array is a +hunk+ or +hunk+ array, where each + # element in each +hunk+ array is a single Change object representing the + # addition or removal of a single element from one of the two tested + # sequences. The +hunk+ provides the full context for the changes. + # + # diffs = Diff::LCS.diff(seq1, seq2) + # # This example shows a simplified array format. + # # [ [ [ '-', 0, 'a' ] ], # 1 + # # [ [ '+', 2, 'd' ] ], # 2 + # # [ [ '-', 4, 'h' ], # 3 + # # [ '+', 4, 'f' ] ], + # # [ [ '+', 6, 'k' ] ], # 4 + # # [ [ '-', 8, 'n' ], # 5 + # # [ '-', 9, 'p' ], + # # [ '+', 9, 'r' ], + # # [ '+', 10, 's' ], + # # [ '+', 11, 't' ] ] ] + # + # There are five hunks here. The first hunk says that the +a+ at position 0 + # of the first sequence should be deleted ('-'). The second hunk + # says that the +d+ at position 2 of the second sequence should be inserted + # ('+'). The third hunk says that the +h+ at position 4 of the + # first sequence should be removed and replaced with the +f+ from position 4 + # of the second sequence. The other two hunks are described similarly. + # + # === Use + # This callback object must be initialised and is used by the Diff::LCS#diff + # method. + # + # cbo = Diff::LCS::DiffCallbacks.new + # Diff::LCS.LCS(seq1, seq2, cbo) + # cbo.finish + # + # Note that the call to #finish is absolutely necessary, or the last set of + # changes will not be visible. Alternatively, can be used as: + # + # cbo = Diff::LCS::DiffCallbacks.new { |tcbo| Diff::LCS.LCS(seq1, seq2, tcbo) } + # + # The necessary #finish call will be made. + # + # === Simplified Array Format + # The simplified array format used in the example above can be obtained + # with: + # + # require 'pp' + # pp diffs.map { |e| e.map { |f| f.to_a } } +class Diff::LCS::DiffCallbacks + # Returns the difference set collected during the diff process. + attr_reader :diffs + + def initialize # :yields self: + @hunk = [] + @diffs = [] + + if block_given? + begin + yield self + ensure + self.finish + end + end + end + + # Finalizes the diff process. If an unprocessed hunk still exists, then it + # is appended to the diff list. + def finish + add_nonempty_hunk + end + + def match(event) + add_nonempty_hunk + end + + def discard_a(event) + @hunk << Diff::LCS::Change.new('-', event.old_position, event.old_element) + end + + def discard_b(event) + @hunk << Diff::LCS::Change.new('+', event.new_position, event.new_element) + end + +private + def add_nonempty_hunk + @diffs << @hunk unless @hunk.empty? + @hunk = [] + end +end + + # This will produce a compound array of contextual diff change objects. Each + # element in the #diffs array is a "hunk" array, where each element in each + # "hunk" array is a single change. Each change is a Diff::LCS::ContextChange + # that contains both the old index and new index values for the change. The + # "hunk" provides the full context for the changes. Both old and new objects + # will be presented for changed objects. +nil+ will be substituted for a + # discarded object. + # + # seq1 = %w(a b c e h j l m n p) + # seq2 = %w(b c d e f j k l m r s t) + # + # diffs = Diff::LCS.diff(seq1, seq2, Diff::LCS::ContextDiffCallbacks) + # # This example shows a simplified array format. + # # [ [ [ '-', [ 0, 'a' ], [ 0, nil ] ] ], # 1 + # # [ [ '+', [ 3, nil ], [ 2, 'd' ] ] ], # 2 + # # [ [ '-', [ 4, 'h' ], [ 4, nil ] ], # 3 + # # [ '+', [ 5, nil ], [ 4, 'f' ] ] ], + # # [ [ '+', [ 6, nil ], [ 6, 'k' ] ] ], # 4 + # # [ [ '-', [ 8, 'n' ], [ 9, nil ] ], # 5 + # # [ '+', [ 9, nil ], [ 9, 'r' ] ], + # # [ '-', [ 9, 'p' ], [ 10, nil ] ], + # # [ '+', [ 10, nil ], [ 10, 's' ] ], + # # [ '+', [ 10, nil ], [ 11, 't' ] ] ] ] + # + # The five hunks shown are comprised of individual changes; if there is a + # related set of changes, they are still shown individually. + # + # This callback can also be used with Diff::LCS#sdiff, which will produce + # results like: + # + # diffs = Diff::LCS.sdiff(seq1, seq2, Diff::LCS::ContextCallbacks) + # # This example shows a simplified array format. + # # [ [ [ "-", [ 0, "a" ], [ 0, nil ] ] ], # 1 + # # [ [ "+", [ 3, nil ], [ 2, "d" ] ] ], # 2 + # # [ [ "!", [ 4, "h" ], [ 4, "f" ] ] ], # 3 + # # [ [ "+", [ 6, nil ], [ 6, "k" ] ] ], # 4 + # # [ [ "!", [ 8, "n" ], [ 9, "r" ] ], # 5 + # # [ "!", [ 9, "p" ], [ 10, "s" ] ], + # # [ "+", [ 10, nil ], [ 11, "t" ] ] ] ] + # + # The five hunks are still present, but are significantly shorter in total + # presentation, because changed items are shown as changes ("!") instead of + # potentially "mismatched" pairs of additions and deletions. + # + # The result of this operation is similar to that of + # Diff::LCS::SDiffCallbacks. They may be compared as: + # + # s = Diff::LCS.sdiff(seq1, seq2).reject { |e| e.action == "=" } + # c = Diff::LCS.sdiff(seq1, seq2, Diff::LCS::ContextDiffCallbacks).flatten + # + # s == c # -> true + # + # === Use + # This callback object must be initialised and can be used by the + # Diff::LCS#diff or Diff::LCS#sdiff methods. + # + # cbo = Diff::LCS::ContextDiffCallbacks.new + # Diff::LCS.LCS(seq1, seq2, cbo) + # cbo.finish + # + # Note that the call to #finish is absolutely necessary, or the last set of + # changes will not be visible. Alternatively, can be used as: + # + # cbo = Diff::LCS::ContextDiffCallbacks.new { |tcbo| Diff::LCS.LCS(seq1, seq2, tcbo) } + # + # The necessary #finish call will be made. + # + # === Simplified Array Format + # The simplified array format used in the example above can be obtained + # with: + # + # require 'pp' + # pp diffs.map { |e| e.map { |f| f.to_a } } +class Diff::LCS::ContextDiffCallbacks < Diff::LCS::DiffCallbacks + def discard_a(event) + @hunk << Diff::LCS::ContextChange.simplify(event) + end + + def discard_b(event) + @hunk << Diff::LCS::ContextChange.simplify(event) + end + + def change(event) + @hunk << Diff::LCS::ContextChange.simplify(event) + end +end + + # This will produce a simple array of diff change objects. Each element in + # the #diffs array is a single ContextChange. In the set of #diffs provided + # by SDiffCallbacks, both old and new objects will be presented for both + # changed and unchanged objects. +nil+ will be substituted + # for a discarded object. + # + # The diffset produced by this callback, when provided to Diff::LCS#sdiff, + # will compute and display the necessary components to show two sequences + # and their minimized differences side by side, just like the Unix utility + # +sdiff+. + # + # same same + # before | after + # old < - + # - > new + # + # seq1 = %w(a b c e h j l m n p) + # seq2 = %w(b c d e f j k l m r s t) + # + # diffs = Diff::LCS.sdiff(seq1, seq2) + # # This example shows a simplified array format. + # # [ [ "-", [ 0, "a"], [ 0, nil ] ], + # # [ "=", [ 1, "b"], [ 0, "b" ] ], + # # [ "=", [ 2, "c"], [ 1, "c" ] ], + # # [ "+", [ 3, nil], [ 2, "d" ] ], + # # [ "=", [ 3, "e"], [ 3, "e" ] ], + # # [ "!", [ 4, "h"], [ 4, "f" ] ], + # # [ "=", [ 5, "j"], [ 5, "j" ] ], + # # [ "+", [ 6, nil], [ 6, "k" ] ], + # # [ "=", [ 6, "l"], [ 7, "l" ] ], + # # [ "=", [ 7, "m"], [ 8, "m" ] ], + # # [ "!", [ 8, "n"], [ 9, "r" ] ], + # # [ "!", [ 9, "p"], [ 10, "s" ] ], + # # [ "+", [ 10, nil], [ 11, "t" ] ] ] + # + # The result of this operation is similar to that of + # Diff::LCS::ContextDiffCallbacks. They may be compared as: + # + # s = Diff::LCS.sdiff(seq1, seq2).reject { |e| e.action == "=" } + # c = Diff::LCS.sdiff(seq1, seq2, Diff::LCS::ContextDiffCallbacks).flatten + # + # s == c # -> true + # + # === Use + # This callback object must be initialised and is used by the Diff::LCS#sdiff + # method. + # + # cbo = Diff::LCS::SDiffCallbacks.new + # Diff::LCS.LCS(seq1, seq2, cbo) + # + # As with the other initialisable callback objects, Diff::LCS::SDiffCallbacks + # can be initialised with a block. As there is no "fininishing" to be done, + # this has no effect on the state of the object. + # + # cbo = Diff::LCS::SDiffCallbacks.new { |tcbo| Diff::LCS.LCS(seq1, seq2, tcbo) } + # + # === Simplified Array Format + # The simplified array format used in the example above can be obtained + # with: + # + # require 'pp' + # pp diffs.map { |e| e.to_a } +class Diff::LCS::SDiffCallbacks + # Returns the difference set collected during the diff process. + attr_reader :diffs + + def initialize #:yields self: + @diffs = [] + yield self if block_given? + end + + def match(event) + @diffs << Diff::LCS::ContextChange.simplify(event) + end + + def discard_a(event) + @diffs << Diff::LCS::ContextChange.simplify(event) + end + + def discard_b(event) + @diffs << Diff::LCS::ContextChange.simplify(event) + end + + def change(event) + @diffs << Diff::LCS::ContextChange.simplify(event) + end +end diff --git a/lib/diff/lcs/change.rb b/lib/diff/lcs/change.rb new file mode 100644 index 00000000..1f282e1e --- /dev/null +++ b/lib/diff/lcs/change.rb @@ -0,0 +1,169 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: change.rb,v 1.4 2004/08/08 20:33:09 austin Exp $ +#++ +# Provides Diff::LCS::Change and Diff::LCS::ContextChange. + + # Centralises the change test code in Diff::LCS::Change and + # Diff::LCS::ContextChange, since it's the same for both classes. +module Diff::LCS::ChangeTypeTests + def deleting? + @action == '-' + end + + def adding? + @action == '+' + end + + def unchanged? + @action == '=' + end + + def changed? + @changed == '!' + end + + def finished_a? + @changed == '>' + end + + def finished_b? + @changed == '<' + end +end + + # Represents a simplistic (non-contextual) change. Represents the removal or + # addition of an element from either the old or the new sequenced enumerable. +class Diff::LCS::Change + # Returns the action this Change represents. Can be '+' (#adding?), '-' + # (#deleting?), '=' (#unchanged?), # or '!' (#changed?). When created by + # Diff::LCS#diff or Diff::LCS#sdiff, it may also be '>' (#finished_a?) or + # '<' (#finished_b?). + attr_reader :action + attr_reader :position + attr_reader :element + + include Comparable + def ==(other) + (self.action == other.action) and + (self.position == other.position) and + (self.element == other.element) + end + + def <=>(other) + r = self.action <=> other.action + r = self.position <=> other.position if r.zero? + r = self.element <=> other.element if r.zero? + r + end + + def initialize(action, position, element) + @action = action + @position = position + @element = element + end + + # Creates a Change from an array produced by Change#to_a. + def to_a + [@action, @position, @element] + end + + def self.from_a(arr) + Diff::LCS::Change.new(arr[0], arr[1], arr[2]) + end + + include Diff::LCS::ChangeTypeTests +end + + # Represents a contextual change. Contains the position and values of the + # elements in the old and the new sequenced enumerables as well as the action + # taken. +class Diff::LCS::ContextChange + # Returns the action this Change represents. Can be '+' (#adding?), '-' + # (#deleting?), '=' (#unchanged?), # or '!' (#changed?). When + # created by Diff::LCS#diff or Diff::LCS#sdiff, it may also be '>' + # (#finished_a?) or '<' (#finished_b?). + attr_reader :action + attr_reader :old_position + attr_reader :old_element + attr_reader :new_position + attr_reader :new_element + + include Comparable + + def ==(other) + (@action == other.action) and + (@old_position == other.old_position) and + (@new_position == other.new_position) and + (@old_element == other.old_element) and + (@new_element == other.new_element) + end + + def inspect(*args) + %Q(#<#{self.class.name}:#{__id__} @action=#{action} positions=#{old_position},#{new_position} elements=#{old_element.inspect},#{new_element.inspect}>) + end + + def <=>(other) + r = @action <=> other.action + r = @old_position <=> other.old_position if r.zero? + r = @new_position <=> other.new_position if r.zero? + r = @old_element <=> other.old_element if r.zero? + r = @new_element <=> other.new_element if r.zero? + r + end + + def initialize(action, old_position, old_element, new_position, new_element) + @action = action + @old_position = old_position + @old_element = old_element + @new_position = new_position + @new_element = new_element + end + + def to_a + [@action, [@old_position, @old_element], [@new_position, @new_element]] + end + + # Creates a ContextChange from an array produced by ContextChange#to_a. + def self.from_a(arr) + if arr.size == 5 + Diff::LCS::ContextChange.new(arr[0], arr[1], arr[2], arr[3], arr[4]) + else + Diff::LCS::ContextChange.new(arr[0], arr[1][0], arr[1][1], arr[2][0], + arr[2][1]) + end + end + + # Simplifies a context change for use in some diff callbacks. '<' actions + # are converted to '-' and '>' actions are converted to '+'. + def self.simplify(event) + ea = event.to_a + + case ea[0] + when '-' + ea[2][1] = nil + when '<' + ea[0] = '-' + ea[2][1] = nil + when '+' + ea[1][1] = nil + when '>' + ea[0] = '+' + ea[1][1] = nil + end + + Diff::LCS::ContextChange.from_a(ea) + end + + include Diff::LCS::ChangeTypeTests +end diff --git a/lib/diff/lcs/hunk.rb b/lib/diff/lcs/hunk.rb new file mode 100644 index 00000000..a6e5ce41 --- /dev/null +++ b/lib/diff/lcs/hunk.rb @@ -0,0 +1,257 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: hunk.rb,v 1.2 2004/08/08 20:33:09 austin Exp $ +#++ +# Contains Diff::LCS::Hunk for bin/ldiff. + +require 'diff/lcs/block' + + # A Hunk is a group of Blocks which overlap because of the context + # surrounding each block. (So if we're not using context, every hunk will + # contain one block.) Used in the diff program (bin/diff). +class Diff::LCS::Hunk + # Create a hunk using references to both the old and new data, as well as + # the piece of data + def initialize(data_old, data_new, piece, context, file_length_difference) + # At first, a hunk will have just one Block in it + @blocks = [ Diff::LCS::Block.new(piece) ] + @data_old = data_old + @data_new = data_new + + before = after = file_length_difference + after += @blocks[0].diff_size + @file_length_difference = after # The caller must get this manually + + # Save the start & end of each array. If the array doesn't exist + # (e.g., we're only adding items in this block), then figure out the + # line number based on the line number of the other file and the + # current difference in file lengths. + if @blocks[0].remove.empty? + a1 = a2 = nil + else + a1 = @blocks[0].remove[0].position + a2 = @blocks[0].remove[-1].position + end + + if @blocks[0].insert.empty? + b1 = b2 = nil + else + b1 = @blocks[0].insert[0].position + b2 = @blocks[0].insert[-1].position + end + + @start_old = a1 || (b1 - before) + @start_new = b1 || (a1 + before) + @end_old = a2 || (b2 - after) + @end_new = b2 || (a2 + after) + + self.flag_context = context + end + + attr_reader :blocks + attr_reader :start_old, :start_new + attr_reader :end_old, :end_new + attr_reader :file_length_difference + + # Change the "start" and "end" fields to note that context should be added + # to this hunk + attr_accessor :flag_context + def flag_context=(context) #:nodoc: + return if context.nil? or context.zero? + + add_start = (context > @start_old) ? @start_old : context + @start_old -= add_start + @start_new -= add_start + + if (@end_old + context) > @data_old.size + add_end = @data_old.size - @end_old + else + add_end = context + end + @end_old += add_end + @end_new += add_end + end + + def unshift(hunk) + @start_old = hunk.start_old + @start_new = hunk.start_new + blocks.unshift(*hunk.blocks) + end + + # Is there an overlap between hunk arg0 and old hunk arg1? Note: if end + # of old hunk is one less than beginning of second, they overlap + def overlaps?(hunk = nil) + return nil if hunk.nil? + + a = (@start_old - hunk.end_old) <= 1 + b = (@start_new - hunk.end_new) <= 1 + return (a or b) + end + + def diff(format) + case format + when :old + old_diff + when :unified + unified_diff + when :context + context_diff + when :ed + self + when :reverse_ed, :ed_finish + ed_diff(format) + else + raise "Unknown diff format #{format}." + end + end + + def each_old(block) + @data_old[@start_old .. @end_old].each { |e| yield e } + end + + private + # Note that an old diff can't have any context. Therefore, we know that + # there's only one block in the hunk. + def old_diff + warn "Expecting only one block in an old diff hunk!" if @blocks.size > 1 + op_act = { "+" => 'a', "-" => 'd', "!" => "c" } + + block = @blocks[0] + + # Calculate item number range. Old diff range is just like a context + # diff range, except the ranges are on one line with the action between + # them. + s = "#{context_range(:old)}#{op_act[block.op]}#{context_range(:new)}\n" + # If removing anything, just print out all the remove lines in the hunk + # which is just all the remove lines in the block. + @data_old[@start_old .. @end_old].each { |e| s << "< #{e}\n" } unless block.remove.empty? + s << "---\n" if block.op == "!" + @data_new[@start_new .. @end_new].each { |e| s << "> #{e}\n" } unless block.insert.empty? + s + end + + def unified_diff + # Calculate item number range. + s = "@@ -#{unified_range(:old)} +#{unified_range(:new)} @@\n" + + # Outlist starts containing the hunk of the old file. Removing an item + # just means putting a '-' in front of it. Inserting an item requires + # getting it from the new file and splicing it in. We splice in + # +num_added+ items. Remove blocks use +num_added+ because splicing + # changed the length of outlist. + # + # We remove +num_removed+ items. Insert blocks use +num_removed+ + # because their item numbers -- corresponding to positions in the NEW + # file -- don't take removed items into account. + lo, hi, num_added, num_removed = @start_old, @end_old, 0, 0 + + outlist = @data_old[lo .. hi].collect { |e| e.gsub(/^/, ' ') } + + @blocks.each do |block| + block.remove.each do |item| + op = item.action.to_s # - + offset = item.position - lo + num_added + outlist[offset].gsub!(/^ /, op.to_s) + num_removed += 1 + end + block.insert.each do |item| + op = item.action.to_s # + + offset = item.position - @start_new + num_removed + outlist[offset, 0] = "#{op}#{@data_new[item.position]}" + num_added += 1 + end + end + + s << outlist.join("\n") + end + + def context_diff + s = "***************\n" + s << "*** #{context_range(:old)} ****\n" + r = context_range(:new) + + # Print out file 1 part for each block in context diff format if there + # are any blocks that remove items + lo, hi = @start_old, @end_old + removes = @blocks.select { |e| not e.remove.empty? } + if removes + outlist = @data_old[lo .. hi].collect { |e| e.gsub(/^/, ' ') } + removes.each do |block| + block.remove.each do |item| + outlist[item.position - lo].gsub!(/^ /) { block.op } # - or ! + end + end + s << outlist.join("\n") + end + + s << "\n--- #{r} ----\n" + lo, hi = @start_new, @end_new + inserts = @blocks.select { |e| not e.insert.empty? } + if inserts + outlist = @data_new[lo .. hi].collect { |e| e.gsub(/^/, ' ') } + inserts.each do |block| + block.insert.each do |item| + outlist[item.position - lo].gsub!(/^ /) { block.op } # + or ! + end + end + s << outlist.join("\n") + end + s + end + + def ed_diff(format) + op_act = { "+" => 'a', "-" => 'd', "!" => "c" } + warn "Expecting only one block in an old diff hunk!" if @blocks.size > 1 + + if format == :reverse_ed + s = "#{op_act[@blocks[0].op]}#{context_range(:old)}\n" + else + s = "#{context_range(:old).gsub(/,/, ' ')}#{op_act[@blocks[0].op]}\n" + end + + unless @blocks[0].insert.empty? + @data_new[@start_new .. @end_new].each { |e| s << "#{e}\n" } + s << ".\n" + end + s + end + + # Generate a range of item numbers to print. Only print 1 number if the + # range has only one item in it. Otherwise, it's 'start,end' + def context_range(mode) + case mode + when :old + s, e = (@start_old + 1), (@end_old + 1) + when :new + s, e = (@start_new + 1), (@end_new + 1) + end + + (s < e) ? "#{s},#{e}" : "#{e}" + end + + # Generate a range of item numbers to print for unified diff. Print + # number where block starts, followed by number of lines in the block + # (don't print number of lines if it's 1) + def unified_range(mode) + case mode + when :old + s, e = (@start_old + 1), (@end_old + 1) + when :new + s, e = (@start_new + 1), (@end_new + 1) + end + + length = e - s + 1 + first = (length < 2) ? e : s # "strange, but correct" + (length == 1) ? "#{first}" : "#{first},#{length}" + end +end diff --git a/lib/diff/lcs/ldiff.rb b/lib/diff/lcs/ldiff.rb new file mode 100644 index 00000000..cc9cb803 --- /dev/null +++ b/lib/diff/lcs/ldiff.rb @@ -0,0 +1,226 @@ +#!/usr/bin/env ruby + +require 'optparse' +require 'ostruct' +require 'diff/lcs/hunk' + + # == ldiff Usage + # ldiff [options] oldfile newfile + # + # -c:: Displays a context diff with 3 lines of context. + # -C [LINES], --context [LINES]:: Displays a context diff with LINES lines of context. Default 3 lines. + # -u:: Displays a unified diff with 3 lines of context. + # -U [LINES], --unified [LINES]:: Displays a unified diff with LINES lines of context. Default 3 lines. + # -e:: Creates an 'ed' script to change oldfile to newfile. + # -f:: Creates an 'ed' script to change oldfile to newfile in reverse order. + # -a, --text:: Treats the files as text and compares them line-by-line, even if they do not seem to be text. + # --binary:: Treats the files as binary. + # -q, --brief:: Reports only whether or not the files differ, not the details. + # --help:: Shows the command-line help. + # --version:: Shows the version of Diff::LCS. + # + # By default, runs produces an "old-style" diff, with output like UNIX diff. + # + # == Copyright + # Copyright © 2004 Austin Ziegler + # + # Part of Diff::LCS + # Austin Ziegler + # + # This program is free software. It may be redistributed and/or modified under + # the terms of the GPL version 2 (or later), the Perl Artistic licence, or the + # Ruby licence. +module Diff::LCS::Ldiff + BANNER = <<-COPYRIGHT +ldiff #{Diff::LCS::VERSION} + Copyright © 2004 Austin Ziegler + + Part of Diff::LCS. + http://rubyforge.org/projects/ruwiki/ + + Austin Ziegler + + This program is free software. It may be redistributed and/or modified under + the terms of the GPL version 2 (or later), the Perl Artistic licence, or the + Ruby licence. + +$Id: ldiff.rb,v 1.1 2004/09/26 01:37:49 austin Exp $ + COPYRIGHT + + class << self + attr_reader :format, :lines #:nodoc: + attr_reader :file_old, :file_new #:nodoc: + attr_reader :data_old, :data_new #:nodoc: + + def run(args, input = $stdin, output = $stdout, error = $stderr) #:nodoc: + args.options do |o| + o.banner = "Usage: #{File.basename($0)} [options] oldfile newfile" + o.separator "" + o.on('-c', + 'Displays a context diff with 3 lines of', + 'context.') do |ctx| + @format = :context + @lines = 3 + end + o.on('-C', '--context [LINES]', Numeric, + 'Displays a context diff with LINES lines', + 'of context. Default 3 lines.') do |ctx| + @format = :context + @lines = ctx || 3 + end + o.on('-u', + 'Displays a unified diff with 3 lines of', + 'context.') do |ctx| + @format = :unified + @lines = 3 + end + o.on('-U', '--unified [LINES]', Numeric, + 'Displays a unified diff with LINES lines', + 'of context. Default 3 lines.') do |ctx| + @format = :unified + @lines = ctx || 3 + end + o.on('-e', + 'Creates an \'ed\' script to change', + 'oldfile to newfile.') do |ctx| + @format = :ed + end + o.on('-f', + 'Creates an \'ed\' script to change', + 'oldfile to newfile in reverse order.') do |ctx| + @format = :reverse_ed + end + o.on('-a', '--text', + 'Treat the files as text and compare them', + 'line-by-line, even if they do not seem', + 'to be text.') do |txt| + @binary = false + end + o.on('--binary', + 'Treats the files as binary.') do |bin| + @binary = true + end + o.on('-q', '--brief', + 'Report only whether or not the files', + 'differ, not the details.') do |ctx| + @format = :report + end + o.on_tail('--help', 'Shows this text.') do + error << o + return 0 + end + o.on_tail('--version', 'Shows the version of Diff::LCS.') do + error << BANNER + return 0 + end + o.on_tail "" + o.on_tail 'By default, runs produces an "old-style" diff, with output like UNIX diff.' + o.parse! + end + + unless args.size == 2 + error << args.options + return 127 + end + + # Defaults are for old-style diff + @format ||= :old + @lines ||= 0 + + file_old, file_new = *ARGV + + case @format + when :context + char_old = '*' * 3 + char_new = '-' * 3 + when :unified + char_old = '-' * 3 + char_new = '+' * 3 + end + + # After we've read up to a certain point in each file, the number of + # items we've read from each file will differ by FLD (could be 0). + file_length_difference = 0 + + if @binary.nil? or @binary + data_old = IO::read(file_old) + data_new = IO::read(file_new) + + # Test binary status + if @binary.nil? + old_txt = data_old[0...4096].grep(/\0/).empty? + new_txt = data_new[0...4096].grep(/\0/).empty? + @binary = (not old_txt) or (not new_txt) + old_txt = new_txt = nil + end + + unless @binary + data_old = data_old.split(/\n/).map! { |e| e.chomp } + data_new = data_new.split(/\n/).map! { |e| e.chomp } + end + else + data_old = IO::readlines(file_old).map! { |e| e.chomp } + data_new = IO::readlines(file_new).map! { |e| e.chomp } + end + + # diff yields lots of pieces, each of which is basically a Block object + if @binary + diffs = (data_old == data_new) + else + diffs = Diff::LCS.diff(data_old, data_new) + diffs = nil if diffs.empty? + end + + return 0 unless diffs + + if (@format == :report) and diffs + output << "Files #{file_old} and #{file_new} differ\n" + return 1 + end + + if (@format == :unified) or (@format == :context) + ft = File.stat(file_old).mtime.localtime.strftime('%Y-%m-%d %H:%M:%S %z') + puts "#{char_old} #{file_old}\t#{ft}" + ft = File.stat(file_new).mtime.localtime.strftime('%Y-%m-%d %H:%M:%S %z') + puts "#{char_new} #{file_new}\t#{ft}" + end + + # Loop over hunks. If a hunk overlaps with the last hunk, join them. + # Otherwise, print out the old one. + oldhunk = hunk = nil + + if @format == :ed + real_output = output + output = [] + end + + diffs.each do |piece| + begin + hunk = Diff::LCS::Hunk.new(data_old, data_new, piece, @lines, + file_length_difference) + file_length_difference = hunk.file_length_difference + + next unless oldhunk + + if (@lines > 0) and hunk.overlaps?(oldhunk) + hunk.unshift(oldhunk) + else + output << oldhunk.diff(@format) + end + ensure + oldhunk = hunk + output << "\n" + end + end + + output << oldhunk.diff(@format) + output << "\n" + + if @format == :ed + output.reverse_each { |e| real_output << e.diff(:ed_finish) } + end + + return 1 + end + end +end diff --git a/lib/diff/lcs/string.rb b/lib/diff/lcs/string.rb new file mode 100644 index 00000000..44c4a826 --- /dev/null +++ b/lib/diff/lcs/string.rb @@ -0,0 +1,19 @@ +#! /usr/env/bin ruby +#-- +# Copyright 2004 Austin Ziegler +# adapted from: +# Algorithm::Diff (Perl) by Ned Konz +# Smalltalk by Mario I. Wolczko +# implements McIlroy-Hunt diff algorithm +# +# This program is free software. It may be redistributed and/or modified under +# the terms of the GPL version 2 (or later), the Perl Artistic licence, or the +# Ruby licence. +# +# $Id: string.rb,v 1.3 2004/08/08 20:33:09 austin Exp $ +#++ +# Includes Diff::LCS into String. + +class String + include Diff::LCS +end diff --git a/lib/page_renderer.rb b/lib/page_renderer.rb index af64602c..26fbd06e 100644 --- a/lib/page_renderer.rb +++ b/lib/page_renderer.rb @@ -1,11 +1,9 @@ -require 'diff' +require 'xhtmldiff' # Temporary class containing all rendering stuff from a Revision # I want to shift all rendering loguc to the controller eventually class PageRenderer - include HTMLDiff - def self.setup_url_generator(url_generator) @@url_generator = url_generator end @@ -41,8 +39,22 @@ class PageRenderer def display_diff previous_revision = @revision.page.previous_revision(@revision) if previous_revision - rendered_previous_revision = WikiContent.new(previous_revision, @@url_generator).render! - diff(rendered_previous_revision, display_content) + + previous_content = "
\n" + WikiContent.new(previous_revision, @@url_generator).render!.to_s + "\n
" + current_content = "
\n" + display_content.to_s + "\n
" + diff_doc = REXML::Document.new + diff_doc << (div = REXML::Element.new 'div') + hd = XHTMLDiff.new(div) + + parsed_previous_revision = REXML::HashableElementDelegator.new( + REXML::XPath.first(REXML::Document.new(previous_content), '/div')) + parsed_display_content = REXML::HashableElementDelegator.new( + REXML::XPath.first(REXML::Document.new(current_content), '/div')) + Diff::LCS.traverse_balanced(parsed_previous_revision, parsed_display_content, hd) + + diffs = '' + diff_doc.write(diffs, -1, true, true) + diffs else display_content end diff --git a/lib/xhtmldiff.rb b/lib/xhtmldiff.rb new file mode 100644 index 00000000..c576b197 --- /dev/null +++ b/lib/xhtmldiff.rb @@ -0,0 +1,179 @@ +#!/usr/bin/env ruby +# Author: Aredridel +# Website: http://theinternetco.net/projects/ruby/xhtmldiff.html +# Licence: same as Ruby +# Version: 1.22 +# +# Tweaks by Jacques Distler +# -- add classnames to and elements added by XHTMLDiff, +# for better CSS styling + +require 'diff/lcs' +require 'rexml/document' +require 'delegate' + +def Math.max(a, b) + a > b ? a : b +end + +module REXML + + class Text + def deep_clone + clone + end + end + + class HashableElementDelegator < DelegateClass(Element) + def initialize(sub) + super sub + end + def == other + res = other.to_s.strip == self.to_s.strip + res + end + + def eql? other + self == other + end + + def[](k) + r = super + if r.kind_of? __getobj__.class + self.class.new(r) + else + r + end + end + + def hash + r = __getobj__.to_s.hash + r + end + end + +end + +class XHTMLDiff + include REXML + attr_accessor :output + + class << self + BLOCK_CONTAINERS = ['div', 'ul', 'li'] + def diff(a, b) + if a == b + return a.deep_clone + end + if REXML::HashableElementDelegator === a and REXML::HashableElementDelegator === b + o = REXML::Element.new(a.name) + o.add_attributes a.attributes + hd = self.new(o) + Diff::LCS.traverse_balanced(a, b, hd) + o + elsif REXML::Text === a and REXML::Text === b + o = REXML::Element.new('span') + aa = a.value.split(/\s/) + ba = b.value.split(/\s/) + hd = XHTMLTextDiff.new(o) + Diff::LCS.traverse_balanced(aa, ba, hd) + o + else + raise ArgumentError.new("both arguments must be equal or both be elements. a is #{a.class.name} and b is #{b.class.name}") + end + end + end + + def diff(a, b) + self.class.diff(a,b) + end + + def initialize(output) + @output = output + end + + # This will be called with both elements are the same + def match(event) + @output << event.old_element.deep_clone if event.old_element + end + + # This will be called when there is an element in A that isn't in B + def discard_a(event) + @output << wrap(event.old_element, 'del', 'diffdel') + end + + def change(event) + begin + sd = diff(event.old_element, event.new_element) + rescue ArgumentError + sd = nil + end + if sd and (ratio = (Float(rs = sd.to_s.gsub(%r{<(ins|del)>.*}, '').size) / bs = Math.max(event.old_element.to_s.size, event.new_element.to_s.size))) > 0.5 + @output << sd + else + @output << wrap(event.old_element, 'del', 'diffmod') + @output << wrap(event.new_element, 'ins', 'diffmod') + end + end + + # This will be called when there is an element in B that isn't in A + def discard_b(event) + @output << wrap(event.new_element, 'ins', 'diffins') + end + + def choose_event(event, element, tag) + end + + def wrap(element, tag = nil, class_name = nil) + if tag + el = Element.new tag + el << element.deep_clone + else + el = element.deep_clone + end + if class_name + el.add_attribute('class', class_name) + end + el + end + + class XHTMLTextDiff < XHTMLDiff + def change(event) + @output << wrap(event.old_element, 'del', 'diffmod') + @output << wrap(event.new_element, 'ins', 'diffmod') + end + + # This will be called with both elements are the same + def match(event) + @output << wrap(event.old_element, nil, nil) if event.old_element + end + + # This will be called when there is an element in A that isn't in B + def discard_a(event) + @output << wrap(event.old_element, 'del', 'diffdel') + end + + # This will be called when there is an element in B that isn't in A + def discard_b(event) + @output << wrap(event.new_element, 'ins', 'diffins') + end + + def wrap(element, tag = nil, class_name = nil) + element = REXML::Text.new(" " << element) if String === element + return element unless tag + wrapper_element = REXML::Element.new(tag) + wrapper_element.add_text element + if class_name + wrapper_element.add_attribute('class', class_name) + end + wrapper_element + end + end + +end + +if $0 == __FILE__ + + $stderr.puts "No tests available yet" + exit(1) + +end diff --git a/test/unit/diff_test.rb b/test/unit/diff_test.rb index 67a8a71b..ccb2abcc 100755 --- a/test/unit/diff_test.rb +++ b/test/unit/diff_test.rb @@ -1,110 +1,94 @@ #!/usr/bin/env ruby require File.expand_path(File.dirname(__FILE__) + '/../test_helper') -require 'diff' +require 'xhtmldiff' class DiffTest < Test::Unit::TestCase - include HTMLDiff - def setup - @builder = DiffBuilder.new('old', 'new') + end - def test_start_of_tag - assert @builder.start_of_tag?('<') - assert(!@builder.start_of_tag?('>')) - assert(!@builder.start_of_tag?('a')) - end - - def test_end_of_tag - assert @builder.end_of_tag?('>') - assert(!@builder.end_of_tag?('<')) - assert(!@builder.end_of_tag?('a')) - end - - def test_whitespace - assert @builder.whitespace?(" ") - assert @builder.whitespace?("\n") - assert @builder.whitespace?("\r") - assert(!@builder.whitespace?("a")) - end - - def test_convert_html_to_list_of_words_simple - assert_equal( - ['the', ' ', 'original', ' ', 'text'], - @builder.convert_html_to_list_of_words('the original text')) - end - - def test_convert_html_to_list_of_words_should_separate_endlines - assert_equal( - ['a', "\n", 'b', "\r", 'c'], - @builder.convert_html_to_list_of_words("a\nb\rc")) - end - - def test_convert_html_to_list_of_words_should_not_compress_whitespace - assert_equal( - ['a', ' ', 'b', ' ', 'c', "\r \n ", 'd'], - @builder.convert_html_to_list_of_words("a b c\r \n d")) - end - - def test_convert_html_to_list_of_words_should_handle_tags_well - assert_equal( - ['

', 'foo', ' ', 'bar', '

'], - @builder.convert_html_to_list_of_words("

foo bar

")) - end - - def test_convert_html_to_list_of_words_interesting - assert_equal( - ['

', 'this', ' ', 'is', '

', "\r\n", '

', 'the', ' ', 'new', ' ', 'string', - '

', "\r\n", '

', 'around', ' ', 'the', ' ', 'world', '

'], - @builder.convert_html_to_list_of_words( - "

this is

\r\n

the new string

\r\n

around the world

")) + def diff(a,b) + diff_doc = REXML::Document.new + diff_doc << (div = REXML::Element.new 'div' ) + hd = XHTMLDiff.new(div) + parsed_a = REXML::HashableElementDelegator.new( + REXML::XPath.first(REXML::Document.new("
"+a+"
"), '/div')) + parsed_b = REXML::HashableElementDelegator.new( + REXML::XPath.first(REXML::Document.new("
"+b+"
"), '/div')) + Diff::LCS.traverse_balanced(parsed_a, parsed_b, hd) + diffs = '' + diff_doc.write(diffs, -1, true, true) + diffs end def test_html_diff_simple a = 'this was the original string' b = 'this is the new string' - assert_equal('this wasis the ' + - 'originalnew string', - diff(a, b)) + assert_equal("
this was is the" + + " original new string
", + diff(a, b)) end def test_html_diff_with_multiple_paragraphs a = "

this was the original string

" - b = "

this is

\r\n

the new string

\r\n

around the world

" - - # Some of this expected result is accidental to implementation. - # At least it's well-formed and more or less correct. + b = "

this is

\n

the new string

\n

around the world

" assert_equal( - "

this wasis

"+ - "\r\n

the " + - "originalnew" + - " string

\r\n" + - "

around the world

", + "

this was is" + + " the original string

" + + "\n

the new string

" + + "\n

around the world

", diff(a, b)) end + def test_split_paragraph_into_two + a = "

foo bar

" + b = "

foo

bar

" + assert_equal( + "

foo bar

" + + "

bar

", + diff(a,b)) + end + + def test_join_two_paragraphs_into_one + a = "

foo

bar

" + b = "

foo bar

" + assert_equal( + "

foo bar

" + + "

bar

", + diff(a,b)) + end + + def test_add_inline_element + a = "

foo bar

" + b = "

foo bar

" + assert_equal( + "

foo bar" + + "bar

", + diff(a,b)) + end + # FIXME this test fails (ticket #67, http://dev.instiki.org/ticket/67) def test_html_diff_preserves_endlines_in_pre - a = "
\na\nb\nc\n
" - b = "
\n
" + a = "
a\nb\nc\n
" + b = "
a\n
" assert_equal( - "
\na\nb\nc\n
", + "
 a\nb\nc\n
", diff(a, b)) end def test_html_diff_with_tags a = "" b = "
foo
" - assert_equal '
foo
', diff(a, b) + assert_equal "
foo
", diff(a, b) end def test_diff_for_tag_change a = "x" b = "x" # FIXME sad, but true - this case produces an invalid XML. If handle this you can, strong your foo is. - assert_equal 'x', diff(a, b) + assert_equal "
xx
", diff(a, b) end end