Performance

My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string.
This commit is contained in:
Jacques Distler 2007-10-13 16:32:04 -05:00
parent 0eb1ab56b0
commit 198d7847bd
5 changed files with 121 additions and 16 deletions

View file

@ -77,9 +77,9 @@ module Engines
@content.options[:renderer].s5_theme = my_content.s5_theme @content.options[:renderer].s5_theme = my_content.s5_theme
sanitize_xhtml(my_content.to_s5) sanitize_xhtml(my_content.to_s5)
else else
html = sanitize_rexml(Maruku.new(@content.delete("\r"), html = sanitize_xhtml(Maruku.new(@content.delete("\r"),
{:math_enabled => true, {:math_enabled => true,
:math_numbered => ['\\[','\\begin{equation}']}).to_html_tree) :math_numbered => ['\\[','\\begin{equation}']}).to_html)
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1') html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
end end

View file

@ -57,8 +57,15 @@ module Sanitize
instance_variable_set("@#{name}", value) instance_variable_set("@#{name}", value)
end end
end end
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, if @encoding == 'utf-8'
:encoding => @encoding, :tree => @treebuilder }) parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
:lowercase_element_name => false, :lowercase_attr_name => false,
:encoding => @encoding, :tree => @treebuilder })
else
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:lowercase_element_name => false, :lowercase_attr_name => false,
:encoding => @encoding, :tree => @treebuilder })
end
return parsed if @to_tree return parsed if @to_tree
return parsed.to_s return parsed.to_s
end end
@ -86,8 +93,13 @@ module Sanitize
instance_variable_set("@#{name}", value) instance_variable_set("@#{name}", value)
end end
end end
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, if @encoding == 'utf-8'
:encoding => @encoding, :tree => @treebuilder }) parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
else
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
end
return parsed if @to_tree return parsed if @to_tree
return parsed.to_s return parsed.to_s
end end
@ -98,7 +110,7 @@ module Sanitize
# sanitize_rexml(tree) -> string # sanitize_rexml(tree) -> string
# #
def sanitize_rexml(tree) def sanitize_rexml(tree)
tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr) tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8)
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8', XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:space_before_trailing_solidus => true, :space_before_trailing_solidus => true,
:inject_meta_charset => false, :inject_meta_charset => false,
@ -2273,6 +2285,25 @@ class String
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr} self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
end end
# Converts XHTML+MathML named entities to UTF-8
#
# :call-seq:
# string.to_utf8 -> string
#
def to_utf8
  # The five XML-predefined entities are captured by group 1 and passed
  # through untouched; any other named entity is handed to convert_to_utf8.
  gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
    if Regexp.last_match(1)
      entity
    else
      entity.convert_to_utf8
    end
  end
end
# Converts XHTML+MathML named entities to UTF-8
#
# :call-seq:
# string.to_utf8! -> str or nil
#
# Substitution is done in-place.
def to_utf8!
  # In-place variant: gsub! hands back the receiver when at least one entity
  # matched and nil when nothing matched. The five XML-predefined entities
  # (group 1) are substituted with themselves.
  gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
    if Regexp.last_match(1)
      entity
    else
      entity.convert_to_utf8
    end
  end
end
protected protected
def convert_to_ncr #:nodoc: def convert_to_ncr #:nodoc:
@ -2281,6 +2312,13 @@ class String
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";" return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";"
end end
def convert_to_utf8 #:nodoc:
  # Pull the bare entity name out of a string of the form "&name;".
  self =~ /^&([a-zA-Z0-9]+);$/
  entity = $1
  # Names missing from the table come back with the ampersand escaped.
  return "&amp;" + entity + ";" unless MATHML_ENTITIES.has_key?(entity)

  # Each ';'-separated piece of the table value has its "&#x" prefix removed
  # and is read as a hexadecimal code point; the code points are then packed
  # into a single UTF-8 string.
  codepoints = MATHML_ENTITIES[entity].split(';').map do |ncr|
    ncr.gsub(/^&#x([A-F0-9]+)$/, '\1').hex
  end
  codepoints.pack('U*')
end
end end
require 'rexml/element' require 'rexml/element'
@ -2305,5 +2343,23 @@ module REXML #:nodoc:
} }
return self return self
end end
# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
#
# :call-seq:
# tree.to_utf8 -> REXML::Element
#
def to_utf8
  # Visit every element selected by the XPath '//*'.
  XPath.each(self, '//*') do |node|
    # Replace each text child's value with the entity-converted form of its
    # string representation.
    node.texts.each_index do |idx|
      text = node.texts[idx]
      text.value = text.to_s.to_utf8
    end
    # Re-assign every attribute with its converted value.
    node.attributes.each do |name, val|
      node.attributes[name] = val.to_utf8
    end
  end
  self
end
end end
end end

View file

@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase
% %
% Unresolved issues: % Unresolved issues:
% %
% \binom{}{}
%
% \righttoleftarrow % \righttoleftarrow
% \lefttorightarrow % \lefttorightarrow
%
% \color{} with HTML colorspec
% \bgcolor
% \array
% Of the standard HTML named colors, white, black, red, green, blue and yellow
% are predefined in the color package. Here are the rest.
\definecolor{aqua}{rgb}{0, 1.0, 1.0}
\definecolor{fuschia}{rgb}{1.0, 0, 1.0}
\definecolor{gray}{rgb}{0.502, 0.502, 0.502}
\definecolor{lime}{rgb}{0, 1.0, 0}
\definecolor{maroon}{rgb}{0.502, 0, 0}
\definecolor{navy}{rgb}{0, 0, 0.502}
\definecolor{olive}{rgb}{0.502, 0.502, 0}
\definecolor{purple}{rgb}{0.502, 0, 0.502}
\definecolor{silver}{rgb}{0.753, 0.753, 0.753}
\definecolor{teal}{rgb}{0, 0.502, 0.502}
% Because of conflicts, \space and \mathop are converted to % Because of conflicts, \space and \mathop are converted to
% \itexspace and \operatorname during preprocessing. % \itexspace and \operatorname during preprocessing.
@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase
\renewcommand{\scriptsize}{\scriptstyle} \renewcommand{\scriptsize}{\scriptstyle}
\newcommand{\scriptscriptsize}{\scriptscriptstyle} \newcommand{\scriptscriptsize}{\scriptscriptstyle}
\newcommand{\mathfr}{\mathfrak} \newcommand{\mathfr}{\mathfrak}
\newcommand{\statusline}[2]{#2}
\newcommand{\toggle}[2]{#1}
%------------------------------------------------------------------- %-------------------------------------------------------------------

View file

@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
# wikiwords are invalid as styles, must be in "name: value" form # wikiwords are invalid as styles, must be in "name: value" form
def test_content_with_wikiword_in_style_tag def test_content_with_wikiword_in_style_tag
assert_markup_parsed_as( assert_markup_parsed_as(
"<p>That is some <em style=\"\">Stylish Emphasis</em></p>", "<p>That is some <em style=''>Stylish Emphasis</em></p>",
'That is some <em style="WikiWord">Stylish Emphasis</em>') 'That is some <em style="WikiWord">Stylish Emphasis</em>')
end end
# validates format of style.. # validates format of style..
def test_content_with_valid_style_in_style_tag def test_content_with_valid_style_in_style_tag
assert_markup_parsed_as( assert_markup_parsed_as(
"<p>That is some <em style=\"text-align: right;\">Stylish Emphasis</em></p>", "<p>That is some <em style='text-align: right;'>Stylish Emphasis</em></p>",
'That is some <em style="text-align: right">Stylish Emphasis</em>') 'That is some <em style="text-align: right">Stylish Emphasis</em>')
end end
@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase
def test_content_with_link_in_parentheses def test_content_with_link_in_parentheses
assert_markup_parsed_as( assert_markup_parsed_as(
"<p>(<a href=\"http://wiki.org/wiki.cgi?WhatIsWiki\">What is a wiki?</a>)</p>", "<p>(<a href='http://wiki.org/wiki.cgi?WhatIsWiki'>What is a wiki?</a>)</p>",
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))') '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
end end
def test_content_with_image_link def test_content_with_image_link
assert_markup_parsed_as( assert_markup_parsed_as(
"<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is a Markdown image link.</p>", "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is a Markdown image link.</p>",
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.') 'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
end end
def test_content_with_inlined_img_tag def test_content_with_inlined_img_tag
assert_markup_parsed_as( assert_markup_parsed_as(
"<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is an inline image link.</p>", "<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is an inline image link.</p>",
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.') 'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
# currently, upper case HTML elements are not allowed # currently, upper case HTML elements are not allowed
assert_markup_parsed_as( assert_markup_parsed_as(
'<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""&gt;&lt;/IMG&gt; is an inline image link.</p>', '<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""/&gt; is an inline image link.</p>',
'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.') 'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
end end
@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase
EOL EOL
assert_markup_parsed_as( assert_markup_parsed_as(
"<ul>\n<li><a href=\"~b\">a</a></li>\n\n<li>c~ d</li>\n</ul>", "<ul>\n<li><a href='~b'>a</a></li>\n\n<li>c~ d</li>\n</ul>",
list_with_tildas) list_with_tildas)
end end

View file

@ -0,0 +1,32 @@
#!/usr/bin/env ruby
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
require 'sanitize'
class SanitizeTest < Test::Unit::TestCase
  # No per-test fixtures are required.
  def setup
  end

  # Wraps +string+ in an XHTML-namespaced <div> and parses it with REXML.
  def rexml_doc(string)
    wrapped = "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>"
    REXML::Document.new(wrapped)
  end

  # Runs +string+ through sanitize_rexml and strips the wrapper <div> from
  # the serialized result.
  def my_rex(string)
    serialized = sanitize_rexml(rexml_doc(string))
    serialized.gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
  end

  # The three sanitizers are expected to turn both named entities and numeric
  # references into raw UTF-8 bytes, while String#to_utf8 on its own is
  # expected to leave numeric references untouched.
  def test_sanitize_named_entities
    source = '<p>Greek &phi;, double-struck &Aopf;, numeric &#x1D538; &#8279;</p>'
    fully_converted = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
    named_only = "<p>Greek \317\225, double-struck \360\235\224\270, numeric &#x1D538; &#8279;</p>"

    assert_equal(fully_converted, sanitize_xhtml(source))
    assert_equal(fully_converted, sanitize_html(source))
    assert_equal(fully_converted, my_rex(source))
    assert_equal(named_only, source.to_utf8)
  end
end