diff --git a/lib/chunks/nowiki.rb b/lib/chunks/nowiki.rb index f3cee86f..732dd511 100644 --- a/lib/chunks/nowiki.rb +++ b/lib/chunks/nowiki.rb @@ -1,6 +1,5 @@ require 'chunks/chunk' require 'sanitize' -require 'rexml/document' # This chunks allows certain parts of a wiki page to be hidden from the # rest of the rendering pipeline. It should be run at the beginning @@ -27,15 +26,7 @@ class NoWiki < Chunk::Abstract def initialize(match_data, content) super - begin - sanitized = sanitize_xhtml(match_data[1]) - doc = REXML::Document.new("
#{sanitized}
") - sanitized = doc.to_s.gsub(/\A
(.*)<\/div>\Z/m, '\1') - rescue REXML::ParseException - sanitized = %{
HTML parse error:
-#{sanitized.escapeHTML}
} - end - @plain_text = @unmask_text = sanitized + @plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1]) end end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 001aefff..10e481d0 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -9,6 +9,8 @@ # sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML # sanitize_html() is a case-insensitive sanitizer suitable for HTML # sanitize_rexml() sanitizes a REXML tree, returning a string +# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML +# by running the output of sanitize_xhtml() through REXML # # == Files # @@ -69,6 +71,25 @@ module Sanitize return parsed if @to_tree return parsed.to_s end + +# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to +# ensure well-formedness. +# +# :call-seq: +# safe_sanitize_xhtml(string) -> string +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# +# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. +# (REXML trees are always utf-8 encoded.) + def safe_sanitize_xhtml(html, options = {}) + options[:to_tree] = false + sanitized = sanitize_xhtml(html, options) + doc = REXML::Document.new("
#{sanitized}
") + sanitized = doc.to_s.gsub(/\A
(.*)<\/div>\Z/m, '\1') + rescue REXML::ParseException + sanitized = sanitized.escapeHTML + end # Sanitize a string, parsed using HTML parsing rules. # diff --git a/test/unit/chunks/nowiki_test.rb b/test/unit/chunks/nowiki_test.rb index 8c068b63..f1f8e75b 100755 --- a/test/unit/chunks/nowiki_test.rb +++ b/test/unit/chunks/nowiki_test.rb @@ -26,7 +26,7 @@ class NoWikiTest < Test::Unit::TestCase def test_sanitize_nowiki_ill_formed match(NoWiki, "", - :plain_text => "
HTML parse error:\n<animateColor xlink:href='#foo'></animateColor>
" + :plain_text => "<animateColor xlink:href='#foo'></animateColor>" ) end