diff --git a/README b/README index a80d0914..0868a3ec 100755 --- a/README +++ b/README @@ -50,7 +50,8 @@ to install (until somebody sends a patch to properly package Instiki for all tho * Internationalization: Wiki words in any latin, greek, cyrillian, or armenian characters * Color diffs: Track changes through revisions * Definitely can run on SQLite and MySQL -* May be able to run on Postgres, Oracle, DB2 and SqlServer (if you try this ) +* May be able to run on Postgres, Oracle, DB2 and SqlServer. If you try this, and it works + (or, it doesn't, but you make it work) please write about it on Instiki.org. ===Command-line options: * Run "ruby instiki --help" diff --git a/lib/diff.rb b/lib/diff.rb index 4a4e80f0..acc82189 100644 --- a/lib/diff.rb +++ b/lib/diff.rb @@ -1,205 +1,111 @@ -# heavily based off difflib.py - see that file for documentation -# ported from Python by Bill Atkins +module HTMLDiff -# This does not support all features offered by difflib; it -# implements only the subset of features necessary -# to support a Ruby version of HTML Differ. You're welcome to finish this off. - -# By default, String#each iterates by line. This isn't really appropriate -# for diff, so often a string will be split by // to get an array of one- -# character strings. - -# Some methods in Diff are untested and are not guaranteed to work. The -# methods in HTMLDiff and any methods it calls should work quite well. - -# changes by DenisMertz -# * main change: -# ** get the tag soup away -# the tag soup problem was first reported with
tags, but it appeared also with -#
' then mode = :pre
- when '' then mode = :code
- else
- out.push cur
- cur, mode = '', :char
- end
- else
- cur += char
- end
- when :char
- if start_of_tag? char
- out.push cur
- cur = use_brackets ? '[' : '<'
- mode = :tag
- elsif /\s/.match char
- out.push cur + char
- cur = ''
- else
- cur += char
- end
- when :pre
- if end_of_tag?(char) and cur =~ %r!
'
- out.push cur
- cur, mode = '', :char
- else
- cur += char
- end
- when :code
- if end_of_tag?(char) and cur =~ %r!'
- out.push cur
- cur, mode = '', :char
- else
- cur += char
- end
- end
- end
-
- out.push(cur)
- out.delete ''
- out.map {|elt| newline?(elt) ? elt : elt.chomp}
- end
- end
-
- class SequenceMatcher
- include Utilities
-
- def initialize(a = [''], b = [''], isjunk = nil, byline = false)
- a, b = explode(a), explode(b) unless byline
- @isjunk = isjunk || Proc.new {}
- set_sequences a, b
- end
-
- def set_sequences(a, b)
- set_sequence_a a
- set_sequence_b b
- end
-
- def set_sequence_a(a)
- @a = a
- @matching_blocks = @opcodes = nil
- end
-
- def set_sequence_b(b)
- @b = b
- @matching_blocks = @opcodes = nil
- chain_b
- end
-
- def chain_b
- @fullbcount = nil
- @b2j = {}
- pophash = {}
- junkdict = {}
-
- @b.each_with_index do |elt, idx|
- if @b2j.has_key? elt
- indices = @b2j[elt]
- if @b.length >= 200 and indices.length * 100 > @b.length
- pophash[elt] = 1
- indices.clear
- else
- indices.push idx
- end
- else
- @b2j[elt] = [idx]
- end
- end
-
- pophash.each_key { |elt| @b2j.delete elt }
-
- unless @isjunk.nil?
- [pophash, @b2j].each do |d|
- d.each_key do |elt|
- if @isjunk.call(elt)
- junkdict[elt] = 1
- d.delete elt
- end
- end
- end
- end
-
- @isbjunk = junkdict.method(:has_key?)
- @isbpopular = junkdict.method(:has_key?)
+ Match = Struct.new(:start_in_old, :start_in_new, :size)
+ class Match
+ def end_in_old
+ self.start_in_old + self.size
end
- def find_longest_match(a_low, a_high, b_low, b_high)
- besti, bestj, bestsize = a_low, b_low, 0
+ def end_in_new
+ self.start_in_new + self.size
+ end
+ end
+
+ Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new)
+
+ class DiffBuilder
+
+ def initialize(old_version, new_version)
+ @old_version, @new_version = old_version, new_version
+ @content = []
+ end
+
+ def build
+ split_inputs_to_words
+ index_new_words
+ operations.each {|opcode| perform_operation(opcode)}
+ return @content.join
+ end
+
+ def split_inputs_to_words
+ @old_words = convert_html_to_list_of_words(explode(@old_version))
+ @new_words = convert_html_to_list_of_words(explode(@new_version))
+ end
+
+ def index_new_words
+ @word_indices = {}
+ @new_words.each_with_index { |word, i| (@word_indices[word] ||= []) << i }
+ end
+
+ def operations
+ position_in_old = position_in_new = 0
+ operations = []
+ matching_blocks.each do |match|
+ match_starts_at_current_position_in_old = (position_in_old == match.start_in_old)
+ match_starts_at_current_position_in_new = (position_in_new == match.start_in_new)
+
+ action_upto_match_positions =
+ case [match_starts_at_current_position_in_old, match_starts_at_current_position_in_new]
+ when [false, false]
+ :replace
+ when [true, false]
+ :insert
+ when [false, true]
+ :delete
+ else
+ # this happens if the first few words are the same in both versions
+ :none
+ end
+
+ if action_upto_match_positions != :none
+ operation_upto_match_positions =
+ Operation.new(action_upto_match_positions,
+ position_in_old, match.start_in_old,
+ position_in_new, match.start_in_new)
+ operations << operation_upto_match_positions
+ end
+ match_operation = Operation.new(:equal,
+ match.start_in_old, match.end_in_old,
+ match.start_in_new, match.end_in_new)
+ operations << match_operation
+
+ position_in_old = match.end_in_old
+ position_in_new = match.end_in_new
+ end
+ operations
+ end
+
+ def matching_blocks
+ matching_blocks = []
+ recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, matching_blocks)
+ matching_blocks
+ end
+
+ def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
+ match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
+ if match
+ if start_in_old < match.start_in_old and start_in_new < match.start_in_new
+ recursively_find_matching_blocks(
+ start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks)
+ end
+ matching_blocks << match
+ if match.end_in_old < end_in_old and match.end_in_new < end_in_new
+ recursively_find_matching_blocks(
+ match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
+ end
+ end
+ end
+
+ def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
+ besti, bestj, bestsize = start_in_old, start_in_new, 0
j2len = {}
- (a_low..a_high).step do |i|
+ (start_in_old..end_in_old).step do |i|
newj2len = {}
- (@b2j[@a[i]] || []).each do |j|
- next if j < b_low
- break if j >= b_high
+ (@word_indices[@old_words[i]] || []).each do |j|
+ next if j < start_in_new
+ break if j >= end_in_new
k = newj2len[j] = (j2len[j - 1] || 0) + 1
if k > bestsize
@@ -209,271 +115,153 @@ module Diff
j2len = newj2len
end
- while besti > a_low and bestj > b_low and not @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1]
+ while besti > start_in_old and bestj > start_in_new and @old_words[besti - 1] == @new_words[bestj - 1]
besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
end
- while besti + bestsize < a_high and bestj + bestsize < b_high and
- not @isbjunk.call(@b[bestj + bestsize]) and
- @a[besti + bestsize] == @b[bestj + bestsize]
+ while besti + bestsize < end_in_old and bestj + bestsize < end_in_new and
+ @old_words[besti + bestsize] == @new_words[bestj + bestsize]
bestsize += 1
end
- while besti > a_low and bestj > b_low and @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1]
- besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
- end
-
- while besti + bestsize < a_high and bestj + bestsize < b_high and @isbjunk.call(@b[bestj+bestsize]) and
- @a[besti+bestsize] == @b[bestj+bestsize]
- bestsize += 1
- end
-
- [besti, bestj, bestsize]
- end
-
- def get_matching_blocks
- return @matching_blocks unless @matching_blocks.nil_or_empty?
-
- @matching_blocks = []
- size_of_a, size_of_b = @a.size, @b.size
- match_block_helper(0, size_of_a, 0, size_of_b, @matching_blocks)
- @matching_blocks.push [size_of_a, size_of_b, 0]
- end
-
- def match_block_helper(a_low, a_high, b_low, b_high, answer)
- i, j, k = x = find_longest_match(a_low, a_high, b_low, b_high)
- unless k.zero?
- match_block_helper(a_low, i, b_low, j, answer) if a_low < i and b_low < j
- answer.push x
- if i + k < a_high and j + k < b_high
- match_block_helper(i + k, a_high, j + k, b_high, answer)
- end
+ if bestsize == 0
+ return nil
+ else
+ return Match.new(besti, bestj, bestsize)
end
end
- def get_opcodes
- return @opcodes unless @opcodes.nil_or_empty?
-
- i = j = 0
- @opcodes = answer = []
- get_matching_blocks.each do |ai, bj, size|
- tag = if i < ai and j < bj
- :replace
- elsif i < ai
- :delete
- elsif j < bj
- :insert
- end
-
- answer.push [tag, i, ai, j, bj] if tag
- i, j = ai + size, bj + size
- answer.push [:equal, ai, i, bj, j] unless size.zero?
- end
- answer
- end
-
- # XXX: untested
- def get_grouped_opcodes(n = 3)
- codes = get_opcodes
- if codes.first.first == :equal
- tag, i1, i2, j1, j2 = codes.first
- codes[0] = tag, [i1, i2 - n].max, i2, [j1, j2-n].max, j2
- end
-
- if codes.last.first == :equal
- tag, i1, i2, j1, j2 = codes.last
- codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
- end
-
- nn = n + n
- group = []
- codes.each do |tag, i1, i2, j1, j2|
- if tag == :equal and i2 - i1 > nn
- group.push [tag, i1, [i2, i1 + n].min, j1, [j2, j1 + n].min]
- yield group
- group = []
- i1, j1 = [i1, i2-n].max, [j1, j2-n].max
- group.push [tag, i1, i2, j1 ,j2]
- end
- end
- yield group if group and group.size != 1 and group.first.first == :equal
- end
-
- def ratio
- matches = get_matching_blocks.reduce(0) do |sum, triple|
- sum + triple.last
- end
- Diff.calculate_ratio(matches, @a.size + @b.size)
- end
-
- def quick_ratio
- if @fullbcount.nil_or_empty?
- @fullbcount = {}
- @b.each do |elt|
- @fullbcount[elt] = (@fullbcount[elt] || 0) + 1
- end
- end
-
- avail = {}
- matches = 0
- @a.each do |elt|
- numb = avail.has_key?(elt) ? avail[elt] : (@fullbcount[elt] || 0)
- avail[elt] = numb - 1
- matches += 1 if numb > 0
- end
- Diff.calculate_ratio matches, @a.size + @b.size
- end
-
- def real_quick_ratio
- size_of_a, size_of_b = @a.size, @b.size
- Diff.calculate_ratio([size_of_a, size_of_b].min, size_of_a + size_of_b)
- end
-
- protected :chain_b, :match_block_helper
- end # end class SequenceMatcher
-
- class << self
- def calculate_ratio(matches, length)
- return 1.0 if length.zero?
- 2.0 * matches / length
- end
-
- # XXX: untested
- def get_close_matches(word, possibilities, n = 3, cutoff = 0.6)
- raise "n must be > 0: #{n}" unless n > 0
- raise "cutoff must be in (0.0..1.0): #{cutoff}" unless cutoff.between 0.0..1.0
-
- result = []
- sequence_matcher = Diff::SequenceMatcher.new
- sequence_matcher.set_sequence_b word
- possibilities.each do |possibility|
- sequence_matcher.set_sequence_a possibility
- if sequence_matcher.real_quick_ratio >= cutoff and
- sequence_matcher.quick_ratio >= cutoff and
- sequence_matcher.ratio >= cutoff
- result.push [sequence_matcher.ratio, possibility]
- end
- end
-
- unless result.nil_or_empty?
- result.sort
- result.reverse!
- result = result[-n..-1]
- end
- result.map {|score, x| x }
- end
-
- def count_leading(line, ch)
- count, size = 0, line.size
- count += 1 while count < size and line[count].chr == ch
- count
- end
- end
-end
-
-module HTMLDiff
- include Diff
- class Builder
VALID_METHODS = [:replace, :insert, :delete, :equal]
- def initialize(a, b)
- @a, @b = a, b
- @content = []
+ def perform_operation(operation)
+ @operation = operation
+ self.send operation.action, operation
end
- def do_op(opcode)
- @opcode = opcode
- op = @opcode.first
- raise NameError, "Invalid opcode '#{op}'" unless VALID_METHODS.include? op
- send op
+ def replace(operation)
+ delete(operation, 'diffmod')
+ insert(operation, 'diffmod')
end
-
- def result
- @content.join
- end
-
- # These methods have to be called via do_op(opcode) so that @opcode is set properly
- private
-
- def replace
- delete('diffmod')
- insert('diffmod')
- end
-
- def insert(tagclass = 'diffins')
- op_helper('ins', tagclass, @b[@opcode[3]...@opcode[4]])
- end
-
- def delete(tagclass = 'diffdel')
- op_helper('del', tagclass, @a[@opcode[1]...@opcode[2]])
- end
-
- def equal
- @content += @b[@opcode[3]...@opcode[4]]
- end
- # Using this as op_helper would be equivalent to the first version of diff.rb by Bill Atkins
- def op_helper_simple(tagname, tagclass, to_add)
- @content << %(<#{tagname} class="#{tagclass}">) << to_add << %(#{tagname}>)
+ def insert(operation, tagclass = 'diffins')
+ insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new])
+ end
+
+ def delete(operation, tagclass = 'diffdel')
+ insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old])
+ end
+
+ def equal(operation)
+ # no tags to insert, simply copy the matching words from one of the versions
+ @content += @new_words[operation.start_in_new...operation.end_in_new]
+ end
+
+ def opening_tag?(item)
+ item =~ %r!^\s*<[^>]+>\s*$!
+ end
+
+ def closing_tag?(item)
+ item =~ %r!^\s*[^>]+>\s*$!
+ end
+
+ def tag?(item)
+ opening_tag?(item) or closing_tag?(item)
+ end
+
+ # Tries to enclose diff tags ( or tags + # As a result the diff tags should be the "most inside" possible. + def insert_tag(tagname, cssclass, words) + unless words.any? { |word| tag?(word) } + @content << wrap_text(words.join, tagname, cssclass) + else + loop do + break if words.empty? + @content << words and break if words.all? { |word| tag?(word) } + # We are outside of a diff tag + @content << words.shift while not words.empty? and tag?(words.first) + @content << %(<#{tagname} class="#{cssclass}">) + # We are inside a diff tag + @content << words.shift until words.empty? or tag?(words.first) + @content << %(#{tagname}>) + end end + end + + def wrap_text(text, tagname, cssclass) + %(<#{tagname} class="#{cssclass}">#{text}#{tagname}>) + end + + def explode(sequence) + sequence.is_a?(String) ? sequence.split(//) : sequence + end + + def end_of_tag?(char) + char == '>' + end + + def start_of_tag?(char) + char == '<' + end + + def whitespace?(char) + char =~ /\s/ + end + + def convert_html_to_list_of_words(x, use_brackets = false) + mode = :char + current_word = '' + words = [] - # Tries to put
tags or newline chars before the opening diff tags ( or ', 'foo', ' ', 'bar', ' foo bar ', 'this', ' ', 'is', ' ', 'the', ' ', 'new', ' ', 'string',
+ ' ', 'around', ' ', 'the', ' ', 'world', ' this is the new string around the world this was the original string this is the super string around the world this is the new string around the world this the " +
- " around the world this the " +
+ " around the world What a What a )
- # or after the ending diff tags.
- # As a result the diff tags should be the "most inside" possible.
- def op_helper(tagname, tagclass, to_add)
- predicate_methods = [:tab?, :newline?, :close_tag?, :open_tag?]
- content_to_skip = Proc.new do |item|
- predicate_methods.any? {|predicate| HTMLDiff.send(predicate, item)}
- end
-
- unless to_add.any? {|element| content_to_skip.call element}
- @content << wrap_text(to_add, tagname, tagclass)
- else
- loop do
- @content << to_add and break if to_add.all? {|element| content_to_skip.call element}
- # We are outside of a diff tag
- @content << to_add.shift while content_to_skip.call to_add.first
- @content << %(<#{tagname} class="#{tagclass}">)
- # We are inside a diff tag
- @content << to_add.shift until content_to_skip.call to_add.first
- @content << %(#{tagname}>)
+ explode(x).each do |char|
+ case mode
+ when :tag
+ if end_of_tag? char
+ current_word << (use_brackets ? ']' : '>')
+ words << current_word
+ current_word = ''
+ if whitespace?(char)
+ mode = :whitespace
+ else
+ mode = :char
+ end
+ else
+ current_word << char
end
+ when :char
+ if start_of_tag? char
+ words << current_word unless current_word.empty?
+ current_word = (use_brackets ? '[' : '<')
+ mode = :tag
+ elsif /\s/.match char
+ words << current_word unless current_word.empty?
+ current_word = char
+ mode = :whitespace
+ else
+ current_word << char
+ end
+ when :whitespace
+ if start_of_tag? char
+ words << current_word unless current_word.empty?
+ current_word = (use_brackets ? '[' : '<')
+ mode = :tag
+ elsif /\s/.match char
+ current_word << char
+ else
+ words << current_word unless current_word.empty?
+ current_word = char
+ mode = :char
+ end
+ else
+ raise "Unknown mode #{mode.inspect}"
end
- #remove_empty_diff(tagname, tagclass)
end
+ words << current_word unless current_word.empty?
+ words
+ end
- def wrap_text(text, tagname, tagclass)
- %(<#{tagname} class="#{tagclass}">#{text}#{tagname}>)
- end
-
- def remove_empty_diff(tagname, tagclass)
- @content = @content[0...-2] if last_elements_empty_diff?(@content, tagname, tagclass)
- end
-
- def last_elements_empty_diff?(content, tagname, tagclass)
- content[-2] == %(<#{tagname} class="#{tagclass}">) and content.last == %(#{tagname}>)
- end
+ end # of class DiffBuilder
+
+ def diff(a, b)
+ DiffBuilder.new(a, b).build
end
- class << self
- include Diff::Utilities
-
- def diff(a, b)
- a = html2list(explode(a))
- b = html2list(explode(b))
-
- out = Builder.new(a, b)
-
- sequence_matcher = Diff::SequenceMatcher.new(a, b)
- sequence_matcher.get_opcodes.each {|opcode| out.do_op(opcode)}
- out.result
- end
- end
end
-
-if __FILE__ == $0
- if ARGV.size == 2
- puts HTMLDiff.diff(IO.read(ARGV.pop), IO.read(ARGV.pop))
- else
- puts "Usage: html_diff file1 file2"
- end
-end
diff --git a/test/unit/diff_test.rb b/test/unit/diff_test.rb
index 9981305e..fadb4508 100755
--- a/test/unit/diff_test.rb
+++ b/test/unit/diff_test.rb
@@ -3,90 +3,91 @@
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
require 'diff'
-include Diff
-
class DiffTest < Test::Unit::TestCase
- def test_init
- assert_nothing_raised {
- s = SequenceMatcher.new('private Thread currentThread;',
- 'private volatile Thread currentThread;') { |x| x == ' ' }
- }
- end
-
- def test_matching_blocks
- s = SequenceMatcher.new 'abxcd', 'abcd'
- assert_equal [[0, 0, 2], [3, 2, 2], [5, 4, 0]], s.get_matching_blocks
- end
-
- def test_ratio
- s = SequenceMatcher.new 'abcd', 'bcde'
- assert_equal 0.75, s.ratio, 0.001
- assert_equal 0.75, s.quick_ratio, 0.001
- assert_equal 1.0, s.real_quick_ratio, 0.001
- end
-
- def test_longest_match
- s = SequenceMatcher.new(' abcd', 'abcd abcd')
- assert_equal [0, 4, 5], s.find_longest_match(0, 5, 0, 9)
- end
-
- def test_opcodes
- s = SequenceMatcher.new('qabxcd', 'abycdf')
- assert_equal(
- [
- [:delete, 0, 1, 0, 0],
- [:equal, 1, 3, 0, 2],
- [:replace, 3, 4, 2, 3],
- [:equal, 4, 6, 3, 5],
- [:insert, 6, 6, 5, 6]
- ],
- s.get_opcodes)
+
+ include HTMLDiff
+
+ def setup
+ @builder = DiffBuilder.new('old', 'new')
end
-
- def test_count_leading
- assert_equal 3, Diff.count_leading(' abc', ' ')
+ def test_start_of_tag
+ assert @builder.start_of_tag?('<')
+ assert(!@builder.start_of_tag?('>'))
+ assert(!@builder.start_of_tag?('a'))
end
- def test_html2list
- a = "here is the original text"
+ def test_end_of_tag
+ assert @builder.end_of_tag?('>')
+ assert(!@builder.end_of_tag?('<'))
+ assert(!@builder.end_of_tag?('a'))
+ end
+
+ def test_whitespace
+ assert @builder.whitespace?(" ")
+ assert @builder.whitespace?("\n")
+ assert @builder.whitespace?("\r")
+ assert(!@builder.whitespace?("a"))
+ end
+
+ def test_convert_html_to_list_of_words_simple
assert_equal(
- ['here ', 'is ', 'the ', 'original ', 'text'],
- HTMLDiff.html2list(a))
+ ['the', ' ', 'original', ' ', 'text'],
+ @builder.convert_html_to_list_of_words('the original text'))
end
- def test_html_diff
- a = 'this was the original string'
- b = 'this is the super string'
- assert_equal('this was ' +
- 'is the ' +
- 'original ' +
- 'super string',
- HTMLDiff.diff(a, b))
+ def test_convert_html_to_list_of_words_should_separate_endlines
+ assert_equal(
+ ['a', "\n", 'b', "\r", 'c'],
+ @builder.convert_html_to_list_of_words("a\nb\rc"))
+ end
+
+ def test_convert_html_to_list_of_words_should_not_compress_whitespace
+ assert_equal(
+ ['a', ' ', 'b', ' ', 'c', "\r \n ", 'd'],
+ @builder.convert_html_to_list_of_words("a b c\r \n d"))
+ end
+
+ def test_convert_html_to_list_of_words_should_handle_tags_well
+ assert_equal(
+ ['wasis the ' +
+ 'originalnew string',
+ diff(a, b))
+ end
+
def test_html_diff_with_multiple_paragraphs
a = "was " +
- "isoriginal " +
- "super stringwasisoriginal " +
+ "new string\na\nb\nc\n
"
- b = ''
-
+ b = "\n
"
assert_equal(
"\n
",
- HTMLDiff.diff(a, b))
+ diff(a, b))
end
end
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index aabaccf3..85cff5ca 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -265,9 +265,8 @@ class PageRendererTest < Test::Unit::TestCase
Revision.create(:page => @page, :content => 'What a red and lovely morning today',
:author => Author.new('DavidHeinemeierHansson'), :revised_at => Time.now)
- assert_equal "a\nb\nc\nblue red " +
- "and lovely morningmorning " +
- "todaybluered" +
+ " and lovely morning today