Performance

OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb.
2007-10-14 21:07:46 -05:00 · 2007-10-14 21:07:46 -05:00 · 1911d18f65
commit 1911d18f65
parent 198d7847bd
3 changed files with 81 additions and 12 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -77,9 +77,9 @@ module Engines
        @content.options[:renderer].s5_theme = my_content.s5_theme
        sanitize_xhtml(my_content.to_s5)
      else
-        html = sanitize_xhtml(Maruku.new(@content.delete("\r"),
+        html = sanitize_rexml(Maruku.new(@content.delete("\r"),
             {:math_enabled => true,
-              :math_numbered => ['\\[','\\begin{equation}']}).to_html)
+              :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
        html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
      end

--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -110,7 +110,7 @@ module Sanitize
 #    sanitize_rexml(tree)                    -> string
 #
  def sanitize_rexml(tree)
-    tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8)
+    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
@@ -2333,13 +2333,14 @@ module REXML #:nodoc:
 # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
 # access the resulting REXML document.
    def to_ncr
-      XPath.each(self, '//*') { |el|
+      self.each_element { |el|
        el.texts.each_index  {|i|
          el.texts[i].value = el.texts[i].to_s.to_ncr
        }
        el.attributes.each { |name,val|
          el.attributes[name] = val.to_ncr
        }
+        el.to_ncr if el.has_elements?
      }
      return self
    end
@@ -2350,16 +2351,84 @@ module REXML #:nodoc:
 #     tree.to_utf8  -> REXML::Element
 #
    def to_utf8
-      XPath.each(self, '//*') { |el|
+      self.each_element { |el|
        el.texts.each_index  {|i|
          el.texts[i].value = el.texts[i].to_s.to_utf8
        }
        el.attributes.each { |name,val|
          el.attributes[name] = val.to_utf8
        }
+        el.to_utf8 if el.has_elements?
      }
      return self
    end

  end
 end
+
+module HTML5 #:nodoc:
+  module TreeWalkers
+
+    private
+
+    class << self
+      def [](name)
+        case name.to_s.downcase
+        when 'rexml'
+          require 'html5/treewalkers/rexml'
+          REXML::TreeWalker
+        when 'rexml2'
+          REXML2::TreeWalker
+        else
+          raise "Unknown TreeWalker #{name}"
+        end
+      end
+
+      alias :get_tree_walker :[]
+    end
+
+    module REXML2
+      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
+
+        private
+
+        def node_details(node)
+          case node
+          when ::REXML::Document
+            [:DOCUMENT]
+          when ::REXML::Element
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value.to_utf8]},
+                node.has_elements? || node.has_text?]
+            end
+          when ::REXML::Text
+            [:TEXT, node.value.to_utf8]
+          when ::REXML::Comment
+            [:COMMENT, node.string]
+          when ::REXML::DocType
+            [:DOCTYPE, node.name, node.public, node.system]
+          when ::REXML::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_sibling
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
+  end
+end
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
  # wikiwords are invalid as styles, must be in "name: value" form
  def test_content_with_wikiword_in_style_tag
    assert_markup_parsed_as(
-        "<p>That is some <em style=''>Stylish Emphasis</em></p>", 
+        '<p>That is some <em style="">Stylish Emphasis</em></p>', 
 	    'That is some <em style="WikiWord">Stylish Emphasis</em>')
  end
 
  # validates format of style..
  def test_content_with_valid_style_in_style_tag
    assert_markup_parsed_as(
-        "<p>That is some <em style='text-align: right;'>Stylish Emphasis</em></p>", 
+        '<p>That is some <em style="text-align: right;">Stylish Emphasis</em></p>', 
 	    'That is some <em style="text-align: right">Stylish Emphasis</em>')
  end
  
@@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase
  
  def test_content_with_link_in_parentheses
    assert_markup_parsed_as(
-      "<p>(<a href='http://wiki.org/wiki.cgi?WhatIsWiki'>What is a wiki?</a>)</p>",
+      '<p>(<a href="http://wiki.org/wiki.cgi?WhatIsWiki">What is a wiki?</a>)</p>',
      '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
  end
  
  def test_content_with_image_link
    assert_markup_parsed_as( 
-      "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is a Markdown image link.</p>", 
+      '<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is a Markdown image link.</p>', 
      'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
  end
  
  def test_content_with_inlined_img_tag
    assert_markup_parsed_as( 
-      "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is an inline image link.</p>", 
+      '<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is an inline image link.</p>', 
      'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
       
    # currently, upper case HTML elements are not allowed
    assert_markup_parsed_as( 
-      '<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""/&gt; is an inline image link.</p>', 
+      '<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""&gt;&lt;/IMG&gt; is an inline image link.</p>', 
      'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
  end
  
@@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase
    EOL
  
    assert_markup_parsed_as(
-        "<ul>\n<li><a href='~b'>a</a></li>\n\n<li>c~ d</li>\n</ul>",
+        "<ul>\n<li><a href=\"~b\">a</a></li>\n\n<li>c~ d</li>\n</ul>",
        list_with_tildas)
  end