From 41346bf8bd918de83a562e35758189b5d6a21138 Mon Sep 17 00:00:00 2001
From: Jacques Distler <distler@golem.ph.utexas.edu>
Date: Sat, 17 May 2008 01:43:11 -0500
Subject: [PATCH] Efficiency: Entity handling Previously, used a regexp to find
 and convert named entities in the content. Now use a more efficient
 algorithm. Similar tweak for converting NCRs before checking whether text is
 valid utf-8.

---
 app/controllers/wiki_controller.rb |  2 --
 lib/chunks/engines.rb              | 13 +++++----
 lib/chunks/nowiki.rb               |  5 +---
 lib/sanitize.rb                    | 44 ++++++++++++++++++++++--------
 lib/wiki_content.rb                |  2 ++
 test/unit/chunks/nowiki_test.rb    |  6 ----
 test/unit/page_renderer_test.rb    |  7 +++++
 7 files changed, 50 insertions(+), 29 deletions(-)
diff --git a/app/controllers/wiki_controller.rb b/app/controllers/wiki_controller.rb
index 6d69b554..80bd4a8e 100644
--- a/app/controllers/wiki_controller.rb
+++ b/app/controllers/wiki_controller.rb
@@ -13,8 +13,6 @@ class WikiController < ApplicationController
 
   layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
 
-  include Sanitize
-
   def index
     if @web_name
       redirect_home
diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb
index 2576ad14..0f5300c6 100644
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -1,6 +1,8 @@
 $: << File.dirname(__FILE__) + "../../lib"
 
 require_dependency 'chunks/chunk'
+require 'sanitize'
+
 
 # The markup engines are Chunks that call the one of RedCloth
 # or RDoc to convert text. This markup occurs when the chunk is required
@@ -40,13 +42,13 @@ module Engines
 
       # If the request is for S5, call Maruku accordingly (without math)
       if @content.options[:mode] == :s5
-        my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false,
-                            :content_only => true,
+        my_content = Maruku.new(@content.delete("\r").to_utf8,
+                           {:math_enabled => false, :content_only => true,
                             :author => @content.options[:engine_opts][:author],
                             :title => @content.options[:engine_opts][:title]})
         @content.options[:renderer].s5_theme = my_content.s5_theme
       else
-        html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html
+        html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html
         html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
       end
 
@@ -60,7 +62,8 @@ module Engines
 
       # If the request is for S5, call Maruku accordingly
       if @content.options[:mode] == :s5
-        my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true,
+        my_content = Maruku.new(@content.delete("\r").to_utf8,
+                           {:math_enabled => true,
                             :math_numbered => ['\\[','\\begin{equation}'],
                             :content_only => true,
                             :author => @content.options[:engine_opts][:author],
@@ -68,7 +71,7 @@ module Engines
         @content.options[:renderer].s5_theme = my_content.s5_theme
         my_content.to_s5
       else
-        html = Maruku.new(@content.delete("\r"),
+        html = Maruku.new(@content.delete("\r").to_utf8,
              {:math_enabled => true,
               :math_numbered => ['\\[','\\begin{equation}']}).to_html
         html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
diff --git a/lib/chunks/nowiki.rb b/lib/chunks/nowiki.rb
index dc1f1109..db67c847 100644
--- a/lib/chunks/nowiki.rb
+++ b/lib/chunks/nowiki.rb
@@ -14,9 +14,6 @@ require 'chunks/chunk'
 # Author: Mark Reid <mark at threewordslong dot com>
 # Created: 8th June 2004
 
-require 'sanitize'
-include Sanitize
-
 class NoWiki < Chunk::Abstract
 
   NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
@@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract
 
   def initialize(match_data, content)
     super
-    @plain_text = @unmask_text = sanitize_xhtml(match_data[1])
+    @plain_text = @unmask_text = match_data[1]
   end
 
 end
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index 76da1779..8d2e0595 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -131,8 +131,12 @@ class String
 #--
    def is_utf8?
      #expand NCRs to utf-8
-     text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
-     text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
+     pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
+     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
+     text = pieces.join
+     pieces = text.split(/&#(\d+);/)
+     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
+     text = pieces.join     
      #ensure the resulting string of bytes is valid utf-8
      text =~  /\A(
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
@@ -2280,7 +2284,9 @@ class String
 #     string.to_ncr  -> string
 #
     def to_ncr
-       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
+       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
+       pieces.join
     end
 
 # Converts XHTML+MathML named entities in string to Numeric Character References
@@ -2291,7 +2297,9 @@ class String
 # Substitution is done in-place.
 #
     def to_ncr!
-       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
+       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
+       self.replace pieces.join
     end
 
 # Converts XHTML+MathML named entities in string to UTF-8
@@ -2300,7 +2308,9 @@ class String
 #     string.to_utf8  -> string
 #
     def to_utf8
-       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
+       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
+       pieces.join
     end
 
 # Converts XHTML+MathML named entities in string to UTF-8
@@ -2311,21 +2321,31 @@ class String
 # Substitution is done in-place.
 #
     def to_utf8!
-       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
+       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
+       self.replace pieces.join
     end
 
   protected
 
     def convert_to_ncr #:nodoc:
-      self =~ /^&([a-zA-Z0-9]+);$/
-      name = $1
-      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";"
+      if self =~ /^(lt|gt|amp|quot|apos)$/
+        self.replace "&" + self + ";"
+      elsif MATHML_ENTITIES.has_key?(self)
+        self.replace MATHML_ENTITIES[self]
+      else
+        self.replace "&amp;" + self + ";"
+      end
     end
 
     def convert_to_utf8 #:nodoc:
-      self =~ /^&([a-zA-Z0-9]+);$/
-      name = $1
-      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&amp;" + name + ";"
+      if self =~ /^(lt|gt|amp|quot|apos)$/
+        self.replace "&" + self + ";"
+      elsif MATHML_ENTITIES.has_key?(self)         
+        self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*')
+      else
+        self.replace "&amp;" + self + ";"
+      end
     end
 
 
diff --git a/lib/wiki_content.rb b/lib/wiki_content.rb
index c3e9de1e..75a846e5 100644
--- a/lib/wiki_content.rb
+++ b/lib/wiki_content.rb
@@ -7,6 +7,8 @@ require_dependency 'chunks/literal'
 require 'chunks/nowiki'
 require 'sanitize'
 
+include Sanitize
+
 # Wiki content is just a string that can process itself with a chain of
 # actions. The actions can modify wiki content so that certain parts of
 # it are protected from being rendered by later actions.
diff --git a/test/unit/chunks/nowiki_test.rb b/test/unit/chunks/nowiki_test.rb
index c1010922..8af5a645 100755
--- a/test/unit/chunks/nowiki_test.rb
+++ b/test/unit/chunks/nowiki_test.rb
@@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase
 	)
   end
 
-  def test_sanitized_nowiki
-	match(NoWiki, 'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");</script></nowiki>. Do not touch!',
-		:plain_text => '<span>a &amp; b</span> &lt;script&gt;alert("XSS!");&lt;/script&gt;'
-	)
-  end
-
 end
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index 943f698d..feef67e0 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase
         "</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
   end
   
+  def test_nowiki_sanitization
+    assert_markup_parsed_as('<p>This sentence contains <span>a &amp; b</span> ' +
+     '&lt;script&gt;alert("XSS!");&lt;/script&gt;. Do not touch!</p>',
+      'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");' +
+      '</script></nowiki>. Do not touch!')
+  end
+  
   def test_link_to_file
     assert_markup_parsed_as( 
       "<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",