Better

Put the "safe" XHTML sanitization in lib/sanitize.rb, rather than in lib/chunks/nowiki.rb. D'oh!
2008-12-01 10:29:46 -06:00 · 2008-12-01 10:29:46 -06:00 · 513b2b16c1
commit 513b2b16c1
parent 758325923f
3 changed files with 23 additions and 11 deletions
--- a/lib/chunks/nowiki.rb
+++ b/lib/chunks/nowiki.rb
@ -1,6 +1,5 @@
 require 'chunks/chunk'
 require 'sanitize'
-require 'rexml/document'

 # This chunk allows certain parts of a wiki page to be hidden from the
 # rest of the rendering pipeline. It should be run at the beginning
@ -27,15 +26,7 @@ class NoWiki < Chunk::Abstract

  def initialize(match_data, content)
    super
-    begin
-      sanitized = sanitize_xhtml(match_data[1])
-      doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
-      sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
-    rescue REXML::ParseException
-      sanitized = %{<pre class='markdown-html-error' style='border: solid 3px red; background-color: pink;'>HTML parse error:
-#{sanitized.escapeHTML}</pre>}
-    end
-    @plain_text = @unmask_text = sanitized
+    @plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1])
  end

 end
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -9,6 +9,8 @@
 #  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
 #  sanitize_html() is a case-insensitive sanitizer suitable for HTML
 #  sanitize_rexml() sanitizes a REXML tree, returning a string
+#  safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
+#                        by running the output of sanitize_xhtml() through REXML
 #
 # == Files
 #
@ -69,6 +71,25 @@ module Sanitize
    return parsed if @to_tree
    return parsed.to_s
  end
+  
+# Sanitize a string, parsed using XHTML parsing rules. Reparse the result with
+#    REXML to ensure well-formedness; on a parse error, return it HTML-escaped.
+#
+# :call-seq:
+#    safe_sanitize_xhtml(string)                    -> string
+#
+# Unless otherwise specified, the string is assumed to be utf-8 encoded.
+#
+# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
+# (REXML trees are always utf-8 encoded.)
+  def safe_sanitize_xhtml(html, options = {})
+    options[:to_tree] = false
+    sanitized = sanitize_xhtml(html, options)
+    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
+    sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+    rescue REXML::ParseException
+      sanitized = sanitized.escapeHTML
+  end 

 # Sanitize a string, parsed using HTML parsing rules.
 #
--- a/test/unit/chunks/nowiki_test.rb
+++ b/test/unit/chunks/nowiki_test.rb
@ -26,7 +26,7 @@ class NoWikiTest < Test::Unit::TestCase

  def test_sanitize_nowiki_ill_formed
    match(NoWiki, "<nowiki><animateColor xlink:href='#foo'/></nowiki>",
-                :plain_text => "<pre class='markdown-html-error' style='border: solid 3px red; background-color: pink;'>HTML parse error:\n&lt;animateColor xlink:href=&#39;#foo&#39;&gt;&lt;/animateColor&gt;</pre>"
+                :plain_text => "&lt;animateColor xlink:href=&#39;#foo&#39;&gt;&lt;/animateColor&gt;"
    )
  end