From 1911d18f659280e59f291a63eda52ad34169b922 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Sun, 14 Oct 2007 21:07:46 -0500 Subject: [PATCH] Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. --- lib/chunks/engines.rb | 4 +- lib/sanitize.rb | 75 +++++++++++++++++++++++++++++++-- test/unit/page_renderer_test.rb | 14 +++--- 3 files changed, 81 insertions(+), 12 deletions(-) diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb index 84a08a89..7d82a18d 100644 --- a/lib/chunks/engines.rb +++ b/lib/chunks/engines.rb @@ -77,9 +77,9 @@ module Engines @content.options[:renderer].s5_theme = my_content.s5_theme sanitize_xhtml(my_content.to_s5) else - html = sanitize_xhtml(Maruku.new(@content.delete("\r"), + html = sanitize_rexml(Maruku.new(@content.delete("\r"), {:math_enabled => true, - :math_numbered => ['\\[','\\begin{equation}']}).to_html) + :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree) html.gsub(/\A
\n?(.*?)\n?<\/div>\Z/m, '\1') end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 92945de6..ff750459 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -110,7 +110,7 @@ module Sanitize # sanitize_rexml(tree) -> string # def sanitize_rexml(tree) - tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8) + tokens = TreeWalkers.get_tree_walker('rexml2').new(tree) XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :space_before_trailing_solidus => true, :inject_meta_charset => false, @@ -2333,13 +2333,14 @@ module REXML #:nodoc: # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you # access the resulting REXML document. def to_ncr - XPath.each(self, '//*') { |el| + self.each_element { |el| el.texts.each_index {|i| el.texts[i].value = el.texts[i].to_s.to_ncr } el.attributes.each { |name,val| el.attributes[name] = val.to_ncr } + el.to_ncr if el.has_elements? } return self end @@ -2350,16 +2351,84 @@ module REXML #:nodoc: # tree.to_utf8 -> REXML::Element # def to_utf8 - XPath.each(self, '//*') { |el| + self.each_element { |el| el.texts.each_index {|i| el.texts[i].value = el.texts[i].to_s.to_utf8 } el.attributes.each { |name,val| el.attributes[name] = val.to_utf8 } + el.to_utf8 if el.has_elements? } return self end end end + +module HTML5 #:nodoc: + module TreeWalkers + + private + + class << self + def [](name) + case name.to_s.downcase + when 'rexml' + require 'html5/treewalkers/rexml' + REXML::TreeWalker + when 'rexml2' + REXML2::TreeWalker + else + raise "Unknown TreeWalker #{name}" + end + end + + alias :get_tree_walker :[] + end + + module REXML2 + class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker + + private + + def node_details(node) + case node + when ::REXML::Document + [:DOCUMENT] + when ::REXML::Element + if !node.name + [:DOCUMENT_FRAGMENT] + else + [:ELEMENT, node.name, + node.attributes.map {|name,value| [name,value.to_utf8]}, + node.has_elements? || node.has_text?] + end + when ::REXML::Text + [:TEXT, node.value.to_utf8] + when ::REXML::Comment + [:COMMENT, node.string] + when ::REXML::DocType + [:DOCTYPE, node.name, node.public, node.system] + when ::REXML::XMLDecl + [nil] + else + [:UNKNOWN, node.class.inspect] + end + end + + def first_child(node) + node.children.first + end + + def next_sibling(node) + node.next_sibling + end + + def parent(node) + node.parent + end + end + end + end +end diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb index 48d836ee..6dfc5fde 100644 --- a/test/unit/page_renderer_test.rb +++ b/test/unit/page_renderer_test.rb @@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase # wikiwords are invalid as styles, must be in "name: value" form def test_content_with_wikiword_in_style_tag assert_markup_parsed_as( - "

That is some Stylish Emphasis

", + '

That is some Stylish Emphasis

', 'That is some Stylish Emphasis') end # validates format of style.. def test_content_with_valid_style_in_style_tag assert_markup_parsed_as( - "

That is some Stylish Emphasis

", + '

That is some Stylish Emphasis

', 'That is some Stylish Emphasis') end @@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase def test_content_with_link_in_parentheses assert_markup_parsed_as( - "

(What is a wiki?)

", + '

(What is a wiki?)

', '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))') end def test_content_with_image_link assert_markup_parsed_as( - "

This is a Markdown image link.

", + '

This is a Markdown image link.

', 'This ![](http://hobix.com/sample.jpg) is a Markdown image link.') end def test_content_with_inlined_img_tag assert_markup_parsed_as( - "

This is an inline image link.

", + '

This is an inline image link.

', 'This is an inline image link.') # currently, upper case HTML elements are not allowed assert_markup_parsed_as( - '

This <IMG SRC="http://hobix.com/sample.jpg" alt=""/> is an inline image link.

', + '

This <IMG SRC="http://hobix.com/sample.jpg" alt=""></IMG> is an inline image link.

', 'This is an inline image link.') end @@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase EOL assert_markup_parsed_as( - "", + "", list_with_tildas) end