From 198d7847bd5ef4bac96ca2a851bc318c800216f8 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Sat, 13 Oct 2007 16:32:04 -0500 Subject: [PATCH] Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. --- lib/chunks/engines.rb | 4 +- lib/sanitize.rb | 66 +++++++++++++++++++++++-- test/functional/wiki_controller_test.rb | 21 +++++++- test/unit/page_renderer_test.rb | 14 +++--- test/unit/sanitize_test.rb | 32 ++++++++++++ 5 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 test/unit/sanitize_test.rb diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb index 7d82a18d..84a08a89 100644 --- a/lib/chunks/engines.rb +++ b/lib/chunks/engines.rb @@ -77,9 +77,9 @@ module Engines @content.options[:renderer].s5_theme = my_content.s5_theme sanitize_xhtml(my_content.to_s5) else - html = sanitize_rexml(Maruku.new(@content.delete("\r"), + html = sanitize_xhtml(Maruku.new(@content.delete("\r"), {:math_enabled => true, - :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree) + :math_numbered => ['\\[','\\begin{equation}']}).to_html) html.gsub(/\A
\n?(.*?)\n?<\/div>\Z/m, '\1') end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index b9fa2449..92945de6 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -57,8 +57,15 @@ module Sanitize instance_variable_set("@#{name}", value) end end - parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, - :encoding => @encoding, :tree => @treebuilder }) + if @encoding == 'utf-8' + parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer, + :lowercase_element_name => false, :lowercase_attr_name => false, + :encoding => @encoding, :tree => @treebuilder }) + else + parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :lowercase_element_name => false, :lowercase_attr_name => false, + :encoding => @encoding, :tree => @treebuilder }) + end return parsed if @to_tree return parsed.to_s end @@ -86,8 +93,13 @@ module Sanitize instance_variable_set("@#{name}", value) end end - parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, - :encoding => @encoding, :tree => @treebuilder }) + if @encoding == 'utf-8' + parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + else + parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + end return parsed if @to_tree return parsed.to_s end @@ -98,7 +110,7 @@ module Sanitize # sanitize_rexml(tree) -> string # def sanitize_rexml(tree) - tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr) + tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8) XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :space_before_trailing_solidus => true, :inject_meta_charset => false, @@ -2273,6 +2285,25 @@ class String self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? 
s : s.convert_to_ncr} end +# Converts XHTML+MathML named entities to UTF-8 +# +# :call-seq: +# string.to_utf8 -> string +# + def to_utf8 + self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} + end + +# Converts XHTML+MathML named entities to UTF-8 +# +# :call-seq: +# string.to_utf8! -> str or nil +# +# Substitution is done in-place. + def to_utf8! + self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} + end + protected def convert_to_ncr #:nodoc: @@ -2281,6 +2312,13 @@ class String return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";" end + def convert_to_utf8 #:nodoc: + self =~ /^&([a-zA-Z0-9]+);$/ + name = $1 + return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&" + name + ";" + end + + end require 'rexml/element' @@ -2305,5 +2343,23 @@ module REXML #:nodoc: } return self end + +# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8 +# +# :call-seq: +# tree.to_utf8 -> REXML::Element +# + def to_utf8 + XPath.each(self, '//*') { |el| + el.texts.each_index {|i| + el.texts[i].value = el.texts[i].to_s.to_utf8 + } + el.attributes.each { |name,val| + el.attributes[name] = val.to_utf8 + } + } + return self + end + end end diff --git a/test/functional/wiki_controller_test.rb b/test/functional/wiki_controller_test.rb index 5dd3d218..2ba37c92 100755 --- a/test/functional/wiki_controller_test.rb +++ b/test/functional/wiki_controller_test.rb @@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase % % Unresolved issues: % -% \binom{}{} -% % \righttoleftarrow % \lefttorightarrow +% +% \color{} with HTML colorspec +% \bgcolor +% \array + +% Of the standard HTML named colors, white, black, red, green, blue and yellow +% are predefined in the color package. Here are the rest. 
+\definecolor{aqua}{rgb}{0, 1.0, 1.0} +\definecolor{fuschia}{rgb}{1.0, 0, 1.0} +\definecolor{gray}{rgb}{0.502, 0.502, 0.502} +\definecolor{lime}{rgb}{0, 1.0, 0} +\definecolor{maroon}{rgb}{0.502, 0, 0} +\definecolor{navy}{rgb}{0, 0, 0.502} +\definecolor{olive}{rgb}{0.502, 0.502, 0} +\definecolor{purple}{rgb}{0.502, 0, 0.502} +\definecolor{silver}{rgb}{0.753, 0.753, 0.753} +\definecolor{teal}{rgb}{0, 0.502, 0.502} % Because of conflicts, \space and \mathop are converted to % \itexspace and \operatorname during preprocessing. @@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase \renewcommand{\scriptsize}{\scriptstyle} \newcommand{\scriptscriptsize}{\scriptscriptstyle} \newcommand{\mathfr}{\mathfrak} +\newcommand{\statusline}[2]{#2} +\newcommand{\toggle}[2]{#1} %------------------------------------------------------------------- diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb index 471c4c07..48d836ee 100644 --- a/test/unit/page_renderer_test.rb +++ b/test/unit/page_renderer_test.rb @@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase # wikiwords are invalid as styles, must be in "name: value" form def test_content_with_wikiword_in_style_tag assert_markup_parsed_as( - "

That is some Stylish Emphasis

", + "

That is some Stylish Emphasis

", 'That is some Stylish Emphasis') end # validates format of style.. def test_content_with_valid_style_in_style_tag assert_markup_parsed_as( - "

That is some Stylish Emphasis

", + "

That is some Stylish Emphasis

", 'That is some Stylish Emphasis') end @@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase def test_content_with_link_in_parentheses assert_markup_parsed_as( - "

(What is a wiki?)

", + "

(What is a wiki?)

", '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))') end def test_content_with_image_link assert_markup_parsed_as( - "

This \"\" is a Markdown image link.

", + "

This is a Markdown image link.

", 'This ![](http://hobix.com/sample.jpg) is a Markdown image link.') end def test_content_with_inlined_img_tag assert_markup_parsed_as( - "

This \"\" is an inline image link.

", + "

This is an inline image link.

", 'This is an inline image link.') # currently, upper case HTML elements are not allowed assert_markup_parsed_as( - '

This <IMG SRC="http://hobix.com/sample.jpg" alt=""></IMG> is an inline image link.

', + '

This <IMG SRC="http://hobix.com/sample.jpg" alt=""/> is an inline image link.

', 'This is an inline image link.') end @@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase EOL assert_markup_parsed_as( - "", + "", list_with_tildas) end diff --git a/test/unit/sanitize_test.rb b/test/unit/sanitize_test.rb new file mode 100644 index 00000000..ced2276f --- /dev/null +++ b/test/unit/sanitize_test.rb @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require File.expand_path(File.dirname(__FILE__) + '/../test_helper') +require 'sanitize' + +class SanitizeTest < Test::Unit::TestCase + + def setup + + end + + def rexml_doc(string) + REXML::Document.new( + "
#{string}
") + end + + def my_rex(string) + sanitize_rexml(rexml_doc(string)).gsub(/\A
(.*)<\/div>\Z/m, '\1') + end + + def test_sanitize_named_entities + input = '

Greek φ, double-struck 𝔸, numeric 𝔸 ⁗

' + output = "

Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227

" + output2 = "

Greek \317\225, double-struck \360\235\224\270, numeric 𝔸 ⁗

" + assert_equal(output, sanitize_xhtml(input)) + assert_equal(output, sanitize_html(input)) + assert_equal(output, my_rex(input)) + assert_equal(output2, input.to_utf8) + end + + +end