New Sanitizer Goes Live

The new sanitizer seems to work well (it cuts the time required to produce the Instiki Atom feed in half). Our strategy is to use HTML5lib for <nowiki> content, but to use the new sanitizer for content that has been processed by Maruku (and hence is well-formed). The one unit test that now fails won't affect us, since it deals with very malformed HTML (which should never reach the new sanitizer).
2008-05-21 02:06:31 -05:00 · 2008-05-21 02:06:31 -05:00 · 45405fc97e
commit 45405fc97e
parent 800880f382
8 changed files with 24 additions and 16 deletions
--- a/lib/chunks/nowiki.rb
+++ b/lib/chunks/nowiki.rb
@@ -16,6 +16,9 @@ require 'chunks/chunk'

 class NoWiki < Chunk::Abstract

+  require 'sanitize'
+  include Sanitize
+  
  NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
  def self.pattern() NOWIKI_PATTERN end

@@ -23,7 +26,7 @@ class NoWiki < Chunk::Abstract

  def initialize(match_data, content)
    super
-    @plain_text = @unmask_text = match_data[1]
+    @plain_text = @unmask_text = sanitize_xhtml(match_data[1])
  end

 end
--- a/lib/sanitizer.rb
+++ b/lib/sanitizer.rb
@@ -120,7 +120,7 @@ module Sanitizer
      #    => &lt;script> do_nasty_stuff() &lt;/script>
      #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
      #    => <a>Click here for $100</a>
-      def sanitize_xhtml(html)
+      def xhtml_sanitize(html)
        if html.index("<")
          tokenizer = HTML::Tokenizer.new(html.to_utf8)
          new_text = ""
@@ -149,7 +149,7 @@ module Sanitizer
                    end
                    node.attributes.each do |attr,val|
                      if String === val
-                         node.attributes[attr] = CGI.escapeHTML(val.unescapeHTML)
+                         node.attributes[attr] = CGI.escapeHTML(CGI.unescapeHTML(val))
                      else
                        node.attributes.delete attr
                      end
@@ -160,7 +160,7 @@ module Sanitizer
                  node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
                end
              else
-                CGI.escapeHTML(node.to_s.unescapeHTML)
+                node.to_s.unescapeHTML.escapeHTML
            end
          end

--- a/lib/stringsupport.rb
+++ b/lib/stringsupport.rb
@@ -2211,12 +2211,17 @@ class String

 #:stopdoc:

+    def escapeHTML
+            self.gsub( /&/, "&amp;" ).
+             gsub( /</, "&lt;" ).
+             gsub( />/, "&gt;" )
+    end
+    
    def unescapeHTML
    self.gsub(/&(.*?);/n) do
      match = $1.dup
      case match
      when /\Aamp\z/ni           then '&'
-      when /\Aquot\z/ni          then '"'
      when /\Agt\z/ni            then '>'
      when /\Alt\z/ni            then '<'
      when /\A#0*(\d+)\z/n       then
--- a/lib/wiki_content.rb
+++ b/lib/wiki_content.rb
@@ -5,7 +5,6 @@ require_dependency 'chunks/include'
 require_dependency 'chunks/wiki'
 require_dependency 'chunks/literal'
 require 'chunks/nowiki'
-require 'sanitize'

 # Wiki content is just a string that can process itself with a chain of
 # actions. The actions can modify wiki content so that certain parts of
@@ -113,8 +112,9 @@ end

 class WikiContent < String

+  require 'sanitizer'
  include ChunkManager
-  include Sanitize
+  include Sanitizer

  DEFAULT_OPTS = {
    :active_chunks       => ACTIVE_CHUNKS,
@@ -193,7 +193,7 @@ class WikiContent < String
        chunk.unmask_text
      end
    end
-    self.replace sanitize_xhtml(self)
+    self.replace xhtml_sanitize(self)
  end

  def page_name
--- a/test/sanitizer.dat
+++ b/test/sanitizer.dat
@@ -359,7 +359,7 @@
    "name": "should_sanitize_script_tag_with_multiple_open_brackets",
    "input": "<<script>alert(\"XSS\");//<</script>",
    "output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;",
-    "xhtml": "&lt;&lt;script&gt;alert(&quot;XSS&quot;);//&lt;&lt;/script&gt;",
+    "xhtml": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },

@@ -375,7 +375,7 @@
    "name": "should_sanitize_tag_broken_up_by_null",
    "input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
    "output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;",
-    "xhtml": "&lt;scr&gt;alert(&quot;XSS&quot;)&lt;/scr&gt;",
+    "xhtml": "&lt;scr&gt;alert(\"XSS\")&lt;/scr&gt;",
    "rexml": "Ill-formed XHTML!"
  },

--- a/test/unit/chunks/nowiki_test.rb
+++ b/test/unit/chunks/nowiki_test.rb
@@ -18,9 +18,9 @@ class NoWikiTest < Test::Unit::TestCase
 	)
  end

-  def test_no_sanitize_nowiki
+  def test_sanitize_nowiki
 	match(NoWiki, 'This sentence contains <nowiki>[[test]]&<a href="a&b">shebang</a> <script>alert("xss!");</script> *foo*</nowiki>. Do not touch!',
-		:plain_text => '[[test]]&<a href="a&b">shebang</a> <script>alert("xss!");</script> *foo*'
+		:plain_text => "[[test]]&amp;<a href='a&amp;b'>shebang</a> &lt;script&gt;alert(\"xss!\");&lt;/script&gt; *foo*"
 	)
  end

--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -85,8 +85,8 @@ class PageRendererTest < Test::Unit::TestCase
        %{xmlns='http://www.w3.org/1998/Math/MathML'><mi>sin</mi><mo stretchy='false'>} +
        %{(</mo><mi>x</mi><mo stretchy='false'>)</mo><semantics><annotation-xml encoding='SVG1.1'>} +
        %{<svg/></annotation-xml></semantics></math><div class='maruku-eq-tex'><code style='display: none;'>} +
-        %{\\sin(x) \\begin{svg}<svg></svg>\\end{svg}</code></div></div>},
-        "$$\\sin(x) \\begin{svg}<svg></svg>\\end{svg}$$")
+        %{\\sin(x) \\begin{svg}<svg/>\\end{svg}</code></div></div>},
+        "$$\\sin(x) \\begin{svg}<svg/>\\end{svg}$$")
  
    code_block = [ 
      'This is a code block:',
@@ -264,7 +264,7 @@ class PageRendererTest < Test::Unit::TestCase
       
    # currently, upper case HTML elements are not allowed
    assert_markup_parsed_as( 
-      "<p>This &lt;IMG SRC=\"http://hobix.com/sample.jpg\" alt=\"\"/&gt; is an inline image link.</p>", 
+      "<p>This &lt;IMG SRC='http://hobix.com/sample.jpg' alt=''/&gt; is an inline image link.</p>", 
      'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
  end
  
--- a/test/unit/sanitizer_test.rb
+++ b/test/unit/sanitizer_test.rb
@@ -14,7 +14,7 @@ class SanitizerTest < Test::Unit::TestCase
  end

  def do_sanitize_xhtml stream
-    sanitize_xhtml(stream.to_utf8)
+    xhtml_sanitize(stream)
  end

  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)