From 198d7847bd5ef4bac96ca2a851bc318c800216f8 Mon Sep 17 00:00:00 2001
From: Jacques Distler <distler@golem.ph.utexas.edu>
Date: Sat, 13 Oct 2007 16:32:04 -0500
Subject: [PATCH] Performance: My REXML::Element.to_ncr (and
 REXML::Element.to_utf8) is horribly slow. For long documents, it proves more
 efficient to serialize to a string, apply String.to_ncr (or String.to_utf8)
 and then Sanitize the string.

---
 lib/chunks/engines.rb                   |  4 +-
 lib/sanitize.rb                         | 66 +++++++++++++++++++++++--
 test/functional/wiki_controller_test.rb | 21 +++++++-
 test/unit/page_renderer_test.rb         | 14 +++---
 test/unit/sanitize_test.rb              | 32 ++++++++++++
 5 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 test/unit/sanitize_test.rb
diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb
index 7d82a18d..84a08a89 100644
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -77,9 +77,9 @@ module Engines
         @content.options[:renderer].s5_theme = my_content.s5_theme
         sanitize_xhtml(my_content.to_s5)
       else
-        html = sanitize_rexml(Maruku.new(@content.delete("\r"),
+        html = sanitize_xhtml(Maruku.new(@content.delete("\r"),
              {:math_enabled => true,
-              :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
+              :math_numbered => ['\\[','\\begin{equation}']}).to_html)
         html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
       end
 
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index b9fa2449..92945de6 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -57,8 +57,15 @@ module Sanitize
         instance_variable_set("@#{name}", value)
       end
     end
-    parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
-      :encoding => @encoding, :tree => @treebuilder })
+    if @encoding == 'utf-8'
+      parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+        :lowercase_element_name => false, :lowercase_attr_name => false,
+        :encoding => @encoding, :tree => @treebuilder })
+    else
+      parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+        :lowercase_element_name => false, :lowercase_attr_name => false,
+        :encoding => @encoding, :tree => @treebuilder })
+    end      
     return parsed if @to_tree
     return parsed.to_s
   end
@@ -86,8 +93,13 @@ module Sanitize
         instance_variable_set("@#{name}", value)
       end
     end
-    parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
-      :encoding => @encoding, :tree => @treebuilder })
+    if @encoding == 'utf-8'
+      parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+        :encoding => @encoding, :tree => @treebuilder })
+    else
+      parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+        :encoding => @encoding, :tree => @treebuilder })
+    end 
     return parsed if @to_tree
     return parsed.to_s
   end
@@ -98,7 +110,7 @@ module Sanitize
 #    sanitize_rexml(tree)                    -> string
 #
   def sanitize_rexml(tree)
-    tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
+    tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8)
     XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
       :space_before_trailing_solidus => true,
       :inject_meta_charset => false,
@@ -2273,6 +2285,25 @@ class String
        self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
     end
 
+# Converts XHTML+MathML named entities to UTF-8
+#
+#  :call-seq:
+#     string.to_utf8  -> string
+#
+    def to_utf8
+       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+    end
+
+# Converts XHTML+MathML named entities to UTF-8
+#
+#  :call-seq:
+#     string.to_utf8!  -> str or nil
+#
+# Substitution is done in-place.
+    def to_utf8!
+       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+    end
+
   protected
 
     def convert_to_ncr #:nodoc:
@@ -2281,6 +2312,13 @@ class String
       return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";"
     end
 
+    def convert_to_utf8 #:nodoc:
+      self =~ /^&([a-zA-Z0-9]+);$/
+      name = $1
+      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&amp;" + name + ";"
+    end
+
+
 end
 
 require 'rexml/element'
@@ -2305,5 +2343,23 @@ module REXML #:nodoc:
       }
       return self
     end
+    
+# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
+#
+#  :call-seq:
+#     tree.to_utf8  -> REXML::Element
+#
+    def to_utf8
+      XPath.each(self, '//*') { |el|
+        el.texts.each_index  {|i|
+          el.texts[i].value = el.texts[i].to_s.to_utf8
+        }
+        el.attributes.each { |name,val|
+          el.attributes[name] = val.to_utf8
+        }
+      }
+      return self
+    end
+
   end
 end
diff --git a/test/functional/wiki_controller_test.rb b/test/functional/wiki_controller_test.rb
index 5dd3d218..2ba37c92 100755
--- a/test/functional/wiki_controller_test.rb
+++ b/test/functional/wiki_controller_test.rb
@@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase
 %
 % Unresolved issues:
 %
-%  \binom{}{}
-%
 %  \righttoleftarrow
 %  \lefttorightarrow
+%
+%  \color{} with HTML colorspec
+%  \bgcolor
+%  \array
+
+% Of the standard HTML named colors, white, black, red, green, blue and yellow
+% are predefined in the color package. Here are the rest.
+\definecolor{aqua}{rgb}{0, 1.0, 1.0}
+\definecolor{fuschia}{rgb}{1.0, 0, 1.0}
+\definecolor{gray}{rgb}{0.502, 0.502, 0.502}
+\definecolor{lime}{rgb}{0, 1.0, 0}
+\definecolor{maroon}{rgb}{0.502, 0, 0}
+\definecolor{navy}{rgb}{0, 0, 0.502}
+\definecolor{olive}{rgb}{0.502, 0.502, 0}
+\definecolor{purple}{rgb}{0.502, 0, 0.502}
+\definecolor{silver}{rgb}{0.753, 0.753, 0.753}
+\definecolor{teal}{rgb}{0, 0.502, 0.502}
 
 % Because of conflicts, \space and \mathop are converted to
 % \itexspace and \operatorname during preprocessing.
@@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase
 \renewcommand{\scriptsize}{\scriptstyle}
 \newcommand{\scriptscriptsize}{\scriptscriptstyle}
 \newcommand{\mathfr}{\mathfrak}
+\newcommand{\statusline}[2]{#2}
+\newcommand{\toggle}[2]{#1}
 
 %-------------------------------------------------------------------
 
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index 471c4c07..48d836ee 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
   # wikiwords are invalid as styles, must be in "name: value" form
   def test_content_with_wikiword_in_style_tag
     assert_markup_parsed_as(
-        "<p>That is some <em style=\"\">Stylish Emphasis</em></p>", 
+        "<p>That is some <em style=''>Stylish Emphasis</em></p>", 
 	    'That is some <em style="WikiWord">Stylish Emphasis</em>')
   end
  
   # validates format of style..
   def test_content_with_valid_style_in_style_tag
     assert_markup_parsed_as(
-        "<p>That is some <em style=\"text-align: right;\">Stylish Emphasis</em></p>", 
+        "<p>That is some <em style='text-align: right;'>Stylish Emphasis</em></p>", 
 	    'That is some <em style="text-align: right">Stylish Emphasis</em>')
   end
   
@@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase
   
   def test_content_with_link_in_parentheses
     assert_markup_parsed_as(
-      "<p>(<a href=\"http://wiki.org/wiki.cgi?WhatIsWiki\">What is a wiki?</a>)</p>",
+      "<p>(<a href='http://wiki.org/wiki.cgi?WhatIsWiki'>What is a wiki?</a>)</p>",
       '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
   end
   
   def test_content_with_image_link
     assert_markup_parsed_as( 
-      "<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is a Markdown image link.</p>", 
+      "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is a Markdown image link.</p>", 
       'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
   end
   
   def test_content_with_inlined_img_tag
     assert_markup_parsed_as( 
-      "<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is an inline image link.</p>", 
+      "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is an inline image link.</p>", 
       'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
        
     # currently, upper case HTML elements are not allowed
     assert_markup_parsed_as( 
-      '<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""&gt;&lt;/IMG&gt; is an inline image link.</p>', 
+      '<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""/&gt; is an inline image link.</p>', 
       'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
   end
   
@@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase
     EOL
   
     assert_markup_parsed_as(
-        "<ul>\n<li><a href=\"~b\">a</a></li>\n\n<li>c~ d</li>\n</ul>",
+        "<ul>\n<li><a href='~b'>a</a></li>\n\n<li>c~ d</li>\n</ul>",
         list_with_tildas)
   end
   
diff --git a/test/unit/sanitize_test.rb b/test/unit/sanitize_test.rb
new file mode 100644
index 00000000..ced2276f
--- /dev/null
+++ b/test/unit/sanitize_test.rb
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
+require 'sanitize'
+
+class SanitizeTest < Test::Unit::TestCase
+
+  def setup
+
+  end
+
+  def rexml_doc(string)
+    REXML::Document.new(
+      "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>")
+  end
+  
+  def my_rex(string)
+    sanitize_rexml(rexml_doc(string)).gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
+  end
+
+  def test_sanitize_named_entities
+    input = '<p>Greek &phi;, double-struck &Aopf;, numeric &#x1D538; &#8279;</p>'
+    output = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
+    output2 = "<p>Greek \317\225, double-struck \360\235\224\270, numeric &#x1D538; &#8279;</p>"
+    assert_equal(output, sanitize_xhtml(input))
+    assert_equal(output, sanitize_html(input))
+    assert_equal(output, my_rex(input))
+    assert_equal(output2, input.to_utf8)
+  end
+
+
+end