2006-01-22 22:40:20 +01:00
|
|
|
module HTMLDiff
|
2005-01-24 19:52:04 +01:00
|
|
|
|
2006-01-22 22:40:20 +01:00
|
|
|
# A run of +size+ consecutive words that is identical in both versions,
# beginning at index +start_in_old+ of the old word list and at index
# +start_in_new+ of the new word list.
Match = Struct.new(:start_in_old, :start_in_new, :size)

class Match
  # Index just past the last matched word in the old word list (exclusive end).
  def end_in_old
    start_in_old + size
  end

  # Index just past the last matched word in the new word list (exclusive end).
  def end_in_new
    start_in_new + size
  end
end
|
|
|
|
|
|
|
|
# One step of the diff: +action+ names what to do with the half-open word
# ranges start_in_old...end_in_old (old word list) and
# start_in_new...end_in_new (new word list).
Operation = Struct.new(
  :action,
  :start_in_old, :end_in_old,
  :start_in_new, :end_in_new
)
|
2005-11-14 11:13:18 +01:00
|
|
|
|
2006-01-22 22:40:20 +01:00
|
|
|
# Builds an HTML rendering of the differences between two versions of a text.
#
# Both versions are split into "words": a whole HTML tag, a run of whitespace,
# or a run of other characters. Runs of words common to both versions are
# copied through unchanged; the rest is wrapped in <ins>/<del> elements.
class DiffBuilder

  # Action names carried by the operations that #operations produces.
  # #perform_operation dispatches to the instance method of the same name.
  VALID_METHODS = [:replace, :insert, :delete, :equal]

  # old_version, new_version: the two texts to compare. Each is either a String
  # or an already split sequence of one-character strings (see #explode).
  def initialize(old_version, new_version)
    @old_version, @new_version = old_version, new_version
    @content = []
  end

  # Runs the whole diff and returns the marked-up result as a single String.
  def build
    # Start from an empty buffer so that calling build more than once does not
    # append a second copy of the output to the first.
    @content = []
    split_inputs_to_words
    index_new_words
    operations.each { |op| perform_operation(op) }
    @content.join
  end

  # Populates @old_words and @new_words with the word lists of both versions.
  def split_inputs_to_words
    @old_words = convert_html_to_list_of_words(explode(@old_version))
    @new_words = convert_html_to_list_of_words(explode(@new_version))
  end

  # Builds @word_indices: word => ascending list of its positions in @new_words.
  def index_new_words
    @word_indices = {}
    @new_words.each_with_index { |word, i| (@word_indices[word] ||= []) << i }
  end

  # Converts the matching blocks into an ordered list of Operation objects that
  # covers both word lists from start to end.
  def operations
    position_in_old = position_in_new = 0
    ops = []

    matches = matching_blocks
    # An empty match at the very end forces the loop below to emit an operation
    # for whatever unmatched tail is left in either version.
    matches << Match.new(@old_words.length, @new_words.length, 0)

    matches.each do |match|
      match_starts_at_current_position_in_old = (position_in_old == match.start_in_old)
      match_starts_at_current_position_in_new = (position_in_new == match.start_in_new)

      # Decide what happened to the words between the current positions and
      # the start of this match.
      action_upto_match_positions =
        if match_starts_at_current_position_in_old && match_starts_at_current_position_in_new
          # nothing lies between the previous match and this one
          :none
        elsif match_starts_at_current_position_in_old
          :insert
        elsif match_starts_at_current_position_in_new
          :delete
        else
          :replace
        end

      if action_upto_match_positions != :none
        ops << Operation.new(action_upto_match_positions,
                             position_in_old, match.start_in_old,
                             position_in_new, match.start_in_new)
      end

      ops << Operation.new(:equal,
                           match.start_in_old, match.end_in_old,
                           match.start_in_new, match.end_in_new)

      position_in_old = match.end_in_old
      position_in_new = match.end_in_new
    end

    ops
  end

  # Returns the list of Match objects found between the two word lists, in the
  # order in which they occur.
  def matching_blocks
    matching_blocks = []
    recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, matching_blocks)
    matching_blocks
  end

  # Finds the longest match inside the given windows (both ends exclusive),
  # then repeats the search in the windows to the left and to the right of it.
  # Matches are appended to +matching_blocks+ in left-to-right order.
  def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
    match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
    return unless match

    if start_in_old < match.start_in_old && start_in_new < match.start_in_new
      recursively_find_matching_blocks(
        start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks)
    end

    matching_blocks << match

    if match.end_in_old < end_in_old && match.end_in_new < end_in_new
      recursively_find_matching_blocks(
        match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
    end
  end

  # Returns the longest run of identical words with old indices inside
  # start_in_old...end_in_old and new indices inside start_in_new...end_in_new,
  # as a Match, or nil when the two windows have no word in common.
  # Requires @word_indices (see #index_new_words).
  def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
    besti, bestj, bestsize = start_in_old, start_in_new, 0

    # j2len[j] is the length of the common run that ends at the previously
    # visited old word and at new word j.
    j2len = {}

    # The range is exclusive: the word at end_in_old lies outside the window
    # and must not become part of a match found here, otherwise the match could
    # overlap a neighbouring, already recorded one.
    (start_in_old...end_in_old).each do |i|
      newj2len = {}
      (@word_indices[@old_words[i]] || []).each do |j|
        next if j < start_in_new
        break if j >= end_in_new # indices are ascending, nothing further can qualify

        k = newj2len[j] = (j2len[j - 1] || 0) + 1
        if k > bestsize
          besti, bestj, bestsize = i - k + 1, j - k + 1, k
        end
      end
      j2len = newj2len
    end

    # Grow the match backwards while the preceding words are equal...
    while besti > start_in_old && bestj > start_in_new && @old_words[besti - 1] == @new_words[bestj - 1]
      besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
    end

    # ...and forwards while the following words are equal.
    while besti + bestsize < end_in_old && bestj + bestsize < end_in_new &&
          @old_words[besti + bestsize] == @new_words[bestj + bestsize]
      bestsize += 1
    end

    bestsize == 0 ? nil : Match.new(besti, bestj, bestsize)
  end

  # Remembers +operation+ in @operation and invokes the method named by its action.
  def perform_operation(operation)
    @operation = operation
    send(operation.action, operation)
  end

  # A replacement is rendered as a deletion followed by an insertion, both
  # carrying the 'diffmod' CSS class.
  def replace(operation)
    delete(operation, 'diffmod')
    insert(operation, 'diffmod')
  end

  # Emits the operation's range of new words wrapped in <ins>.
  def insert(operation, tagclass = 'diffins')
    insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new])
  end

  # Emits the operation's range of old words wrapped in <del>.
  def delete(operation, tagclass = 'diffdel')
    insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old])
  end

  # No tags to insert: simply copy the matching words from one of the versions.
  def equal(operation)
    @content.concat(@new_words[operation.start_in_new...operation.end_in_new])
  end

  # Truthy when +item+ consists of a single <...> tag, optionally surrounded by
  # whitespace. Closing tags satisfy this pattern as well.
  def opening_tag?(item)
    item =~ %r!^\s*<[^>]+>\s*$!
  end

  # Truthy when +item+ consists of a single </...> tag, optionally surrounded
  # by whitespace.
  def closing_tag?(item)
    item =~ %r!^\s*</[^>]+>\s*$!
  end

  # Truthy when +item+ is an opening or a closing tag.
  def tag?(item)
    opening_tag?(item) || closing_tag?(item)
  end

  # Removes from the front of +words+ the longest prefix whose members all
  # satisfy the block, and returns that prefix. +words+ is modified in place.
  def extract_consecutive_words(words, &condition)
    index_of_first_mismatch = words.index { |word| !condition.call(word) }
    words.slice!(0, index_of_first_mismatch || words.length)
  end

  # This method encloses words within a specified tag (ins or del) and adds the
  # result to @content, with a twist: if the words contain HTML tags, it creates
  # several ins or del elements, so that none of them encloses an HTML tag; the
  # tags themselves are copied to @content unwrapped. This handles cases like
  #   old: '<p>a</p>'
  #   new: '<p>ab</p><p>c</p>'
  # where the inserted '<p>', 'c', '</p>' are emitted as
  #   '<p><ins class="diffins">c</ins></p>'
  # This still doesn't guarantee valid HTML (hint: think about diffing a text
  # containing ins or del tags), but handles more cases correctly than wrapping
  # everything in one element would.
  #
  # +words+ is consumed (emptied) by this method.
  #
  # PS: Spare a thought for people who write HTML browsers. They live in this ... every day.
  def insert_tag(tagname, cssclass, words)
    loop do
      break if words.empty?
      non_tags = extract_consecutive_words(words) { |word| !tag?(word) }
      @content << wrap_text(non_tags.join, tagname, cssclass) unless non_tags.empty?

      break if words.empty?
      @content.concat(extract_consecutive_words(words) { |word| tag?(word) })
    end
  end

  # Returns +text+ enclosed in a +tagname+ element with the given CSS class.
  def wrap_text(text, tagname, cssclass)
    %(<#{tagname} class="#{cssclass}">#{text}</#{tagname}>)
  end

  # Splits a String into an array of one-character strings; any other object
  # is returned as it is.
  def explode(sequence)
    sequence.is_a?(String) ? sequence.split(//) : sequence
  end

  def end_of_tag?(char)
    char == '>'
  end

  def start_of_tag?(char)
    char == '<'
  end

  # Truthy when +char+ contains a whitespace character.
  def whitespace?(char)
    char =~ /\s/
  end

  # Splits +x+ (a String or a sequence of characters) into a list of words.
  # A word is a complete tag ('<' up to and including the next '>'), a run of
  # whitespace, or a run of any other characters. With +use_brackets+ the
  # delimiters of every tag are written as '[' and ']' instead of '<' and '>'.
  def convert_html_to_list_of_words(x, use_brackets = false)
    mode = :char
    current_word = ''
    words = []

    explode(x).each do |char|
      case mode
      when :tag
        if end_of_tag? char
          current_word << (use_brackets ? ']' : '>')
          words << current_word
          current_word = ''
          mode = :char
        else
          current_word << char
        end
      when :char
        if start_of_tag? char
          words << current_word unless current_word.empty?
          current_word = (use_brackets ? '[' : '<')
          mode = :tag
        elsif whitespace?(char)
          words << current_word unless current_word.empty?
          # dup: the word is appended to below, which must not alter the
          # element of the input sequence
          current_word = char.dup
          mode = :whitespace
        else
          current_word << char
        end
      when :whitespace
        if start_of_tag? char
          words << current_word unless current_word.empty?
          current_word = (use_brackets ? '[' : '<')
          mode = :tag
        elsif whitespace?(char)
          current_word << char
        else
          words << current_word unless current_word.empty?
          current_word = char.dup
          mode = :char
        end
      else
        raise "Unknown mode #{mode.inspect}"
      end
    end

    words << current_word unless current_word.empty?
    words
  end

end # of class Diff Builder
|
|
|
|
|
|
|
|
# Returns a String containing version +b+ marked up with its differences from
# version +a+, as produced by DiffBuilder#build.
def diff(a, b)
  DiffBuilder.new(a, b).build
end
|
2005-11-14 11:13:18 +01:00
|
|
|
|
2005-01-24 19:52:04 +01:00
|
|
|
end
|