Refactoring and deleting unused code from lib/diff.rb until I can understand what it says. Also fixes #256. The build is still broken.

2006-01-22 21:40:20 +00:00 · 2006-01-22 21:40:20 +00:00 · 3ea1ef881f
commit 3ea1ef881f
parent 60c07ca1a2
4 changed files with 298 additions and 509 deletions
--- a/3
+++ b/3
@ -50,7 +50,8 @@ to install (until somebody sends a patch to properly package Instiki for all tho
 * Internationalization: Wiki words in any latin, greek, cyrillian, or armenian characters
 * Color diffs: Track changes through revisions
 * Definitely can run on SQLite and MySQL
-* May be able to run on Postgres, Oracle, DB2 and SqlServer (if you try this )
+* May be able to run on Postgres, Oracle, DB2 and SqlServer. If you try this, and it works 
+  (or, it doesn't, but you make it work) please write about it on Instiki.org.

 ===Command-line options:
 * Run "ruby instiki --help"
--- a/lib/diff.rb
+++ b/lib/diff.rb
@ -1,69 +1,199 @@
-# heavily based off difflib.py - see that file for documentation
-# ported from Python by Bill Atkins
+module HTMLDiff

-# This does not support all features offered by difflib; it 
-# implements only the subset of features necessary
-# to support a Ruby version of HTML Differ.  You're welcome to finish this off.
+  Match = Struct.new(:start_in_old, :start_in_new, :size)
+  class Match
+    def end_in_old
+      self.start_in_old + self.size
+    end
    
-# By default, String#each iterates by line.  This isn't really appropriate
-# for diff, so often a string will be split by // to get an array of one-
-# character strings.
-
-# Some methods in Diff are untested and are not guaranteed to work.  The
-# methods in HTMLDiff and any methods it calls should work quite well.
-
-# changes by DenisMertz
-# * main change:
-# ** get the tag soup away
-#    the tag soup problem was first reported with <p> tags, but it appeared also with
-#    <li>, <ul> etc... tags 
-#   this version should mostly fix these problems
-# ** added a Builder class to manage the creation of the final htmldiff
-# * minor changes:
-# ** use symbols instead of string to represent opcodes 
-# ** small fix to html2list
-#
-
-module Enumerable
-  def reduce(init)
-    result = init
-    each { |item| result = yield(result, item) }
-    result
+    def end_in_new
+      self.start_in_new + self.size
+    end
  end
-end
  
-class Object
-  def nil_or_empty?
-    nil? or empty?
-  end
-end
+  Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new)

-module Diff
+  class DiffBuilder
+
+    def initialize(old_version, new_version)
+      @old_version, @new_version = old_version, new_version
+      @content = []
+    end
+
+    def build
+      split_inputs_to_words
+      index_new_words
+      operations.each {|opcode| perform_operation(opcode)}
+      return @content.join
+    end
+
+    def split_inputs_to_words
+      @old_words = convert_html_to_list_of_words(explode(@old_version))
+      @new_words = convert_html_to_list_of_words(explode(@new_version))
+    end
+
+    def index_new_words
+      @word_indices = {}
+      @new_words.each_with_index { |word, i| (@word_indices[word] ||= []) << i }
+    end
+
+    def operations
+      position_in_old = position_in_new = 0
+      operations = []
+      matching_blocks.each do |match|
+        match_starts_at_current_position_in_old = (position_in_old == match.start_in_old)
+        match_starts_at_current_position_in_new = (position_in_new == match.start_in_new)
+        
+        action_upto_match_positions = 
+          case [match_starts_at_current_position_in_old, match_starts_at_current_position_in_new]
+          when [false, false]
+            :replace
+          when [true, false]
+            :insert
+          when [false, true]
+            :delete
+          else
+            # this happens if the first few words are same in both versions
+            :none
+          end
+
+        if action_upto_match_positions != :none
+          operation_upto_match_positions = 
+              Operation.new(action_upto_match_positions, 
+                  position_in_old, match.start_in_old, 
+                  position_in_new, match.start_in_new)
+          operations << operation_upto_match_positions
+        end
+        match_operation = Operation.new(:equal, 
+            match.start_in_old, match.end_in_old, 
+            match.start_in_new, match.end_in_new)
+        operations << match_operation
+
+        position_in_old = match.end_in_old
+        position_in_new = match.end_in_new
+      end
+      operations
+    end
+
+    def matching_blocks
+      matching_blocks = []
+      recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, matching_blocks)
+      matching_blocks
+    end
+    
+    def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
+      match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
+      if match
+        if start_in_old < match.start_in_old and start_in_new < match.start_in_new
+          recursively_find_matching_blocks(
+              start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks) 
+        end
+        matching_blocks << match
+        if match.end_in_old < end_in_old and match.end_in_new < end_in_new
+          recursively_find_matching_blocks(
+              match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
+        end
+      end
+    end
+
+    def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
+      besti, bestj, bestsize = start_in_old, start_in_new, 0
+      
+      j2len = {}
+      
+      (start_in_old..end_in_old).step do |i|
+        newj2len = {}
+        (@word_indices[@old_words[i]] || []).each do |j|
+          next  if j < start_in_new
+          break if j >= end_in_new
+          
+          k = newj2len[j] = (j2len[j - 1] || 0) + 1
+          if k > bestsize
+            besti, bestj, bestsize = i - k + 1, j - k + 1, k
+          end
+        end
+        j2len = newj2len
+      end
+      
+      while besti > start_in_old and bestj > start_in_new and @old_words[besti - 1] == @new_words[bestj - 1]
+        besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
+      end
+      
+      while besti + bestsize < end_in_old and bestj + bestsize < end_in_new and
+          @old_words[besti + bestsize] == @new_words[bestj + bestsize]
+        bestsize += 1
+      end
+      
+      if bestsize == 0 
+        return nil
+      else 
+        return Match.new(besti, bestj, bestsize)
+      end
+    end
+    
+    VALID_METHODS = [:replace, :insert, :delete, :equal]
+    def perform_operation(operation)
+      @operation = operation
+      self.send operation.action, operation
+    end
+
+    def replace(operation)
+      delete(operation, 'diffmod')
+      insert(operation, 'diffmod')
+    end
+    
+    def insert(operation, tagclass = 'diffins')
+      insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new])
+    end
+    
+    def delete(operation, tagclass = 'diffdel')
+       insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old])
+    end
+    
+    def equal(operation)
+      # no tags to insert, simply copy the matching words from one of the versions
+      @content += @new_words[operation.start_in_new...operation.end_in_new]
+    end
+  
+    def opening_tag?(item)
+      item =~ %r!^\s*<[^>]+>\s*$!
+    end
+
+    def closing_tag?(item)
+      item =~ %r!^\s*</[^>]+>\s*$!
+    end
+
+    def tag?(item)
+      opening_tag?(item) or closing_tag?(item)
+    end
+
+    # Tries to enclose diff tags (<ins> or <del>) within <p> tags
+    # As a result the diff tags should be the "most inside" possible.
+    def insert_tag(tagname, cssclass, words)
+      unless words.any? { |word| tag?(word) }
+        @content << wrap_text(words.join, tagname, cssclass)
+      else
+        loop do
+          break if words.empty?
+          @content << words and break if words.all? { |word| tag?(word) }
+          # We are outside of a diff tag
+          @content << words.shift while not words.empty? and tag?(words.first)
+          @content << %(<#{tagname} class="#{cssclass}">) 
+          # We are inside a diff tag
+          @content << words.shift until words.empty? or tag?(words.first)
+          @content << %(</#{tagname}>)
+        end
+      end
+    end
+
+    def wrap_text(text, tagname, cssclass)
+      %(<#{tagname} class="#{cssclass}">#{text}</#{tagname}>)
+    end

-  module Utilities
    def explode(sequence)
      sequence.is_a?(String) ? sequence.split(//) : sequence
    end
  
-    def newline?(char)
-      %W(\n \r).include? char
-    end
-
-    def tab?(char)
-      "\t" == char
-    end
-
-    # XXX Could be more robust but unrecognized tags cause an infinite loop so
-    # better to be permissive
-    def open_tag?(char)
-      char =~ /\A<[^>]+>/
-    end
-
-    # See comment for open_tag?
-    def close_tag?(char)
-      char =~ %r!\A</[^>]+>!
-    end
-
    def end_of_tag?(char)
      char == '>'
    end
@ -72,408 +202,66 @@ module Diff
      char == '<'
    end
    
-    def html2list(x, use_brackets = false)
+    def whitespace?(char)
+      char =~ /\s/
+    end
+  
+    def convert_html_to_list_of_words(x, use_brackets = false)
      mode = :char
-      cur  = ''
-      out  = []
+      current_word  = ''
+      words = []
      
      explode(x).each do |char|
        case mode
        when :tag
          if end_of_tag? char
-            cur += use_brackets ? ']' : '>'
-            case cur
-            when '<pre>' then mode = :pre
-            when '<code>' then mode = :code
+            current_word << (use_brackets ? ']' : '>')
+            words << current_word
+            current_word = ''
+            if whitespace?(char) 
+              mode = :whitespace 
            else
-              out.push cur
-              cur, mode  = '', :char
+              mode = :char
            end
          else
-            cur += char
+            current_word << char
          end
        when :char
          if start_of_tag? char
-            out.push cur
-            cur  = use_brackets ? '[' : '<'
+            words << current_word unless current_word.empty?
+            current_word = (use_brackets ? '[' : '<')
            mode = :tag
          elsif /\s/.match char
-            out.push cur + char
-            cur = ''
+            words << current_word unless current_word.empty?
+            current_word = char
+            mode = :whitespace
          else
-            cur += char
+            current_word << char
          end
-        when :pre
-          if end_of_tag?(char) and cur =~ %r!</pre$!
-            cur += '>'
-            out.push cur
-            cur, mode  = '', :char
+        when :whitespace
+          if start_of_tag? char
+            words << current_word unless current_word.empty?
+            current_word = (use_brackets ? '[' : '<')
+            mode = :tag
+          elsif /\s/.match char
+            current_word << char
          else
-            cur += char
-          end
-        when :code
-          if end_of_tag?(char) and cur =~ %r!</code$!
-            cur += '>'
-            out.push cur
-            cur, mode  = '', :char
-          else
-            cur += char
-          end
-        end
-      end
-      
-      out.push(cur)
-      out.delete '' 
-      out.map {|elt| newline?(elt) ? elt : elt.chomp}
-    end
-  end
-
-  class SequenceMatcher
-    include Utilities
-
-    def initialize(a = [''], b = [''], isjunk = nil, byline = false)
-      a, b = explode(a), explode(b) unless byline 
-      @isjunk = isjunk || Proc.new {}
-      set_sequences a, b
-    end
-
-    def set_sequences(a, b)
-      set_sequence_a a
-      set_sequence_b b
-    end
-
-    def set_sequence_a(a)
-      @a = a
-      @matching_blocks = @opcodes = nil
-    end
-
-    def set_sequence_b(b)
-      @b = b
-      @matching_blocks = @opcodes = nil
-      chain_b
-    end
-
-    def chain_b
-      @fullbcount = nil
-      @b2j     = {}
-      pophash  = {}
-      junkdict = {}
-      
-      @b.each_with_index do |elt, idx|
-        if @b2j.has_key? elt
-          indices = @b2j[elt]
-          if @b.length >= 200 and indices.length * 100 > @b.length
-            pophash[elt] = 1
-            indices.clear
-          else
-            indices.push idx
+            words << current_word unless current_word.empty?
+            current_word = char
+            mode = :char
          end
        else 
-          @b2j[elt] = [idx]
+          raise "Unknown mode #{mode.inspect}"
        end
      end
-        
-      pophash.each_key { |elt| @b2j.delete elt }
-      
-      unless @isjunk.nil?
-        [pophash, @b2j].each do |d|
-          d.each_key do |elt|
-            if @isjunk.call(elt)
-              junkdict[elt] = 1
-              d.delete elt
-            end
-          end
-        end
-      end
-      
-      @isbjunk    = junkdict.method(:has_key?)
-      @isbpopular = junkdict.method(:has_key?)
+      words << current_word unless current_word.empty?
+      words
    end

-    def find_longest_match(a_low, a_high, b_low, b_high)
-      besti, bestj, bestsize = a_low, b_low, 0
+  end # of class Diff Builder
  
-      j2len = {}
-      
-      (a_low..a_high).step do |i|
-        newj2len = {}
-        (@b2j[@a[i]] || []).each do |j|
-          next  if j < b_low
-          break if j >= b_high
-          
-          k = newj2len[j] = (j2len[j - 1] || 0) + 1
-          if k > bestsize
-            besti, bestj, bestsize = i - k + 1, j - k + 1, k
-          end
-        end
-        j2len = newj2len
-      end
-      
-      while besti > a_low and bestj > b_low and not @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1]
-        besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
-      end
-      
-      while besti + bestsize < a_high and bestj + bestsize < b_high and
-          not @isbjunk.call(@b[bestj + bestsize]) and
-          @a[besti + bestsize] == @b[bestj + bestsize]
-        bestsize += 1
-      end
-      
-      while besti > a_low and bestj > b_low and @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1]
-        besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
-      end
-      
-      while besti + bestsize < a_high and bestj + bestsize < b_high and @isbjunk.call(@b[bestj+bestsize]) and
-          @a[besti+bestsize] == @b[bestj+bestsize]
-        bestsize += 1
-      end
-      
-      [besti, bestj, bestsize]
-    end
-    
-    def get_matching_blocks
-      return @matching_blocks unless @matching_blocks.nil_or_empty?
-      
-      @matching_blocks = []
-      size_of_a, size_of_b = @a.size, @b.size
-      match_block_helper(0, size_of_a, 0, size_of_b, @matching_blocks)
-      @matching_blocks.push [size_of_a, size_of_b, 0]
-    end
-    
-    def match_block_helper(a_low, a_high, b_low, b_high, answer)
-      i, j, k = x = find_longest_match(a_low, a_high, b_low, b_high)
-      unless k.zero?
-        match_block_helper(a_low, i, b_low, j, answer) if a_low < i and b_low < j
-        answer.push x
-        if i + k < a_high and j + k < b_high
-          match_block_helper(i + k, a_high, j + k, b_high, answer)
-        end
-      end
-    end
-    
-    def get_opcodes
-      return @opcodes unless @opcodes.nil_or_empty?
-
-      i = j = 0
-      @opcodes = answer = []
-      get_matching_blocks.each do |ai, bj, size|
-        tag = if i < ai and j < bj
-                :replace
-              elsif i < ai
-                :delete
-              elsif j < bj 
-                :insert
-              end
-
-        answer.push [tag, i, ai, j, bj] if tag
-        i, j = ai + size, bj + size
-        answer.push [:equal, ai, i, bj, j] unless size.zero?
-      end
-      answer
-    end
-
-    # XXX: untested
-    def get_grouped_opcodes(n = 3)
-      codes = get_opcodes
-      if codes.first.first == :equal
-        tag, i1, i2, j1, j2 = codes.first
-        codes[0] = tag, [i1, i2 - n].max, i2, [j1, j2-n].max, j2
-      end
-      
-      if codes.last.first == :equal
-        tag, i1, i2, j1, j2 = codes.last
-        codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
-      end
-
-      nn = n + n
-      group = []
-      codes.each do |tag, i1, i2, j1, j2|
-        if tag == :equal and i2 - i1 > nn
-          group.push [tag, i1, [i2, i1 + n].min, j1, [j2, j1 + n].min]
-          yield group
-          group = []
-          i1, j1 = [i1, i2-n].max, [j1, j2-n].max
-          group.push [tag, i1, i2, j1 ,j2]
-        end
-      end
-      yield group if group and group.size != 1 and group.first.first == :equal
-    end
-
-    def ratio
-      matches = get_matching_blocks.reduce(0) do |sum, triple|
-        sum + triple.last
-      end
-      Diff.calculate_ratio(matches, @a.size + @b.size)
-    end
-
-    def quick_ratio
-      if @fullbcount.nil_or_empty?
-        @fullbcount = {}
-        @b.each do |elt|
-          @fullbcount[elt] = (@fullbcount[elt] || 0) + 1
-        end
-      end
-      
-      avail   = {}
-      matches = 0
-      @a.each do |elt|
-        numb       = avail.has_key?(elt) ? avail[elt] : (@fullbcount[elt] || 0)
-        avail[elt] = numb - 1
-        matches   += 1 if numb > 0
-      end
-      Diff.calculate_ratio matches, @a.size + @b.size
-    end
-
-    def real_quick_ratio
-      size_of_a, size_of_b = @a.size, @b.size
-      Diff.calculate_ratio([size_of_a, size_of_b].min, size_of_a + size_of_b)
-    end
-
-    protected :chain_b, :match_block_helper
-  end # end class SequenceMatcher
-
-  class << self
-    def calculate_ratio(matches, length)
-      return 1.0 if length.zero?
-      2.0 * matches / length
-    end
-
-    # XXX: untested
-    def get_close_matches(word, possibilities, n = 3, cutoff = 0.6)
-      raise "n must be > 0: #{n}" unless n > 0
-      raise "cutoff must be in (0.0..1.0): #{cutoff}" unless cutoff.between 0.0..1.0
-
-      result = []
-      sequence_matcher = Diff::SequenceMatcher.new
-      sequence_matcher.set_sequence_b word
-      possibilities.each do |possibility|
-        sequence_matcher.set_sequence_a possibility
-        if sequence_matcher.real_quick_ratio >= cutoff and
-           sequence_matcher.quick_ratio >= cutoff      and
-           sequence_matcher.ratio >= cutoff
-          result.push [sequence_matcher.ratio, possibility]
-        end
-      end
-      
-      unless result.nil_or_empty?
-        result.sort
-        result.reverse!
-        result = result[-n..-1]
-      end
-      result.map {|score, x| x }
-    end
-
-    def count_leading(line, ch)
-      count, size = 0, line.size
-      count += 1 while count < size and line[count].chr == ch
-      count
-    end
-  end
-end
-
-module HTMLDiff
-  include Diff
-  class Builder
-    VALID_METHODS = [:replace, :insert, :delete, :equal]
-    def initialize(a, b)
-      @a, @b   = a, b
-      @content = []
-    end
-
-    def do_op(opcode)
-      @opcode = opcode
-      op = @opcode.first
-      raise NameError, "Invalid opcode '#{op}'" unless VALID_METHODS.include? op
-      send op
-    end
-
-    def result
-      @content.join
-    end
-
-    # These methods have to be called via do_op(opcode) so that @opcode is set properly
-    private
-
-      def replace
-        delete('diffmod')
-        insert('diffmod')
-      end
-      
-      def insert(tagclass = 'diffins')
-        op_helper('ins', tagclass, @b[@opcode[3]...@opcode[4]])
-      end
-      
-      def delete(tagclass = 'diffdel')
-         op_helper('del', tagclass, @a[@opcode[1]...@opcode[2]])
-      end
-      
-      def equal
-        @content += @b[@opcode[3]...@opcode[4]]
-      end
-    
-      # Using this as op_helper would be equivalent to the first version of diff.rb by Bill Atkins
-      def op_helper_simple(tagname, tagclass, to_add)
-        @content << %(<#{tagname} class="#{tagclass}">) << to_add << %(</#{tagname}>)
-      end
-      
-      # Tries to put <p> tags or newline chars before the opening diff tags (<ins> or <del>)
-      # or after the ending diff tags.
-      # As a result the diff tags should be the "most inside" possible.
-      def op_helper(tagname, tagclass, to_add)
-        predicate_methods = [:tab?, :newline?, :close_tag?, :open_tag?]
-        content_to_skip   = Proc.new do |item| 
-          predicate_methods.any? {|predicate| HTMLDiff.send(predicate, item)}
-        end
-
-        unless to_add.any? {|element| content_to_skip.call element}
-          @content << wrap_text(to_add, tagname, tagclass)
-        else
-          loop do
-            @content << to_add and break if to_add.all? {|element| content_to_skip.call element}
-            # We are outside of a diff tag
-            @content << to_add.shift while content_to_skip.call to_add.first 
-            @content << %(<#{tagname} class="#{tagclass}">) 
-            # We are inside a diff tag
-            @content << to_add.shift until content_to_skip.call to_add.first
-            @content << %(</#{tagname}>)
-          end
-        end
-        #remove_empty_diff(tagname, tagclass)
-      end
-
-      def wrap_text(text, tagname, tagclass)
-        %(<#{tagname} class="#{tagclass}">#{text}</#{tagname}>)
-      end
-
-      def remove_empty_diff(tagname, tagclass)
-        @content = @content[0...-2] if last_elements_empty_diff?(@content, tagname, tagclass)
-      end
-
-      def last_elements_empty_diff?(content, tagname, tagclass)
-        content[-2] == %(<#{tagname} class="#{tagclass}">) and content.last == %(</#{tagname}>)
-      end
-  end
-  
-  class << self
-    include Diff::Utilities
-
-    def diff(a, b)
-      a = html2list(explode(a))
-      b = html2list(explode(b))
-      
-      out = Builder.new(a, b)
-
-      sequence_matcher = Diff::SequenceMatcher.new(a, b)
-      sequence_matcher.get_opcodes.each {|opcode| out.do_op(opcode)}
-      out.result
-    end
-  end 
-end
-
-if __FILE__ == $0                                                               
-  if ARGV.size == 2                                                             
-    puts HTMLDiff.diff(IO.read(ARGV.pop), IO.read(ARGV.pop))                    
-  else                                                                          
-    puts "Usage: html_diff file1 file2"                                         
+  def diff(a, b)
+    DiffBuilder.new(a, b).build
  end
+  
 end
--- a/test/unit/diff_test.rb
+++ b/test/unit/diff_test.rb
@ -3,90 +3,91 @@
 require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
 require 'diff'

-include Diff
-
 class DiffTest < Test::Unit::TestCase
-  def test_init
-    assert_nothing_raised {
-      s = SequenceMatcher.new('private Thread currentThread;',
-            'private volatile Thread currentThread;') { |x| x == ' ' }
-    }
+
+  include HTMLDiff
+
+  def setup
+    @builder = DiffBuilder.new('old', 'new')
  end

-  def test_matching_blocks
-    s = SequenceMatcher.new 'abxcd', 'abcd'
-    assert_equal [[0, 0, 2], [3, 2, 2], [5, 4, 0]], s.get_matching_blocks
+  def test_start_of_tag
+    assert @builder.start_of_tag?('<')
+    assert(!@builder.start_of_tag?('>'))
+    assert(!@builder.start_of_tag?('a'))
  end

-  def test_ratio
-    s = SequenceMatcher.new 'abcd', 'bcde'
-    assert_equal 0.75, s.ratio, 0.001
-    assert_equal 0.75, s.quick_ratio, 0.001
-    assert_equal 1.0, s.real_quick_ratio, 0.001
+  def test_end_of_tag
+    assert @builder.end_of_tag?('>')
+    assert(!@builder.end_of_tag?('<'))
+    assert(!@builder.end_of_tag?('a'))
  end

-  def test_longest_match
-    s = SequenceMatcher.new(' abcd', 'abcd abcd')
-    assert_equal [0, 4, 5], s.find_longest_match(0, 5, 0, 9)
+  def test_whitespace
+    assert @builder.whitespace?(" ")
+    assert @builder.whitespace?("\n")
+    assert @builder.whitespace?("\r")
+    assert(!@builder.whitespace?("a"))
  end

-  def test_opcodes
-    s = SequenceMatcher.new('qabxcd', 'abycdf')
+  def test_convert_html_to_list_of_words_simple
    assert_equal(
-      [
-        [:delete, 0, 1, 0, 0],
-        [:equal, 1, 3, 0, 2],
-        [:replace, 3, 4, 2, 3],
-        [:equal, 4, 6, 3, 5],
-        [:insert, 6, 6, 5, 6]
-      ],
-      s.get_opcodes)
+        ['the', ' ', 'original', ' ', 'text'],
+        @builder.convert_html_to_list_of_words('the original text'))
  end

-
-  def test_count_leading
-    assert_equal 3, Diff.count_leading('   abc', ' ')
-  end
-
-  def test_html2list
-    a = "here is the original text"
+  def test_convert_html_to_list_of_words_should_separate_endlines
    assert_equal(
-        ['here ', 'is ', 'the ', 'original ', 'text'],
-        HTMLDiff.html2list(a))
+        ['a', "\n", 'b', "\r", 'c'],
+        @builder.convert_html_to_list_of_words("a\nb\rc"))
  end

-  def test_html_diff
+  def test_convert_html_to_list_of_words_should_not_compress_whitespace
+    assert_equal(
+        ['a', ' ', 'b', '  ', 'c', "\r \n ", 'd'],
+        @builder.convert_html_to_list_of_words("a b  c\r \n d"))
+  end
+
+  def test_convert_html_to_list_of_words_should_handle_tags_well
+    assert_equal(
+        ['<p>', 'foo', ' ', 'bar', '</p>'],
+        @builder.convert_html_to_list_of_words("<p>foo bar</p>"))
+  end
+  
+  def test_convert_html_to_list_of_words_interesting
+    assert_equal(
+        ['<p>', 'this', ' ', 'is', '</p>', "\r\n", '<p>', 'the', ' ', 'new', ' ', 'string', 
+         '</p>', "\r\n", '<p>', 'around', ' ', 'the', ' ', 'world', '</p>'],
+        @builder.convert_html_to_list_of_words(
+            "<p>this is</p>\r\n<p>the new string</p>\r\n<p>around the world</p>"))
+  end
+
+  def test_html_diff_simple
    a = 'this was the original string'
-    b = 'this is the super string'
-    assert_equal('this <del class="diffmod">was </del>' + 
-           '<ins class="diffmod">is </ins>the ' +
-           '<del class="diffmod">original </del>' + 
-           '<ins class="diffmod">super </ins>string',
-           HTMLDiff.diff(a, b))
+    b = 'this is the new string'
+    assert_equal('this <del class="diffmod">was</del><ins class="diffmod">is</ins> the ' +
+           '<del class="diffmod">original</del><ins class="diffmod">new</ins> string',
+           diff(a, b))
  end

  def test_html_diff_with_multiple_paragraphs
    a = "<p>this was the original string</p>"
-    b = "<p>this is</p>\r\n<p>the super string</p>\r\n<p>around the world</p>"
-
+    b = "<p>this is</p>\r\n<p>the new string</p>\r\n<p>around the world</p>"
    assert_equal(
-      "<p>this <del class=\"diffmod\">was </del>" + 
-      "<ins class=\"diffmod\">is</ins></p>\r\n<p>the " +
-      "<del class=\"diffmod\">original </del>" + 
-      "<ins class=\"diffmod\">super </ins>string</p>\r\n" +
-      "<p><ins class=\"diffins\">around the world</ins></p>",
-      HTMLDiff.diff(a, b)
-    )
+        "<p>this <del class=\"diffmod\">was</del><ins class=\"diffmod\">is</ins></p>\r\n<p> the " +
+        "<del class=\"diffmod\">original </del>" + 
+        "<ins class=\"diffmod\">new </ins>string</p>\r\n" +
+        "<p><ins class=\"diffins\">around the world</ins></p>",
+        diff(a, b))
  end

  # FIXME this test fails (ticket #67, http://dev.instiki.org/ticket/67)
  def test_html_diff_preserves_endlines_in_pre
    a = "<pre>\na\nb\nc\n</pre>"
-    b = ''
-
+    b = "<pre>\n</pre>"
    assert_equal(
        "<pre>\n<del class=\"diffdel\">a\nb\nc\n</del></pre>",
-        HTMLDiff.diff(a, b))
+        diff(a, b))
  end
  
 end
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@ -265,9 +265,8 @@ class PageRendererTest < Test::Unit::TestCase
    Revision.create(:page => @page, :content => 'What a red and lovely morning today', 
        :author => Author.new('DavidHeinemeierHansson'), :revised_at => Time.now)

-    assert_equal "<p>What a <del class=\"diffmod\">blue </del><ins class=\"diffmod\">red " +
-        "</ins>and lovely <del class=\"diffmod\">morning</del><ins class=\"diffmod\">morning " +
-        "today</ins></p>", test_renderer(@page.revisions.last).display_diff
+    assert_equal "<p>What a <del class=\"diffmod\">blue</del><ins class=\"diffmod\">red" +
+        "</ins> and lovely morning<ins class=\"diffins\"> today</ins></p>", test_renderer(@page.revisions.last).display_diff
  end
  
  def test_link_to_file