Performance
OK. This is a better way: define a custom TreeWalker that converts named entities to UTF-8 as it goes. This avoids an extra tree traversal in sanitize_rexml, and it also avoids the trainwreck that is html5/inputstream.rb.
This commit is contained in:
parent
198d7847bd
commit
1911d18f65
|
@ -77,9 +77,9 @@ module Engines
|
||||||
@content.options[:renderer].s5_theme = my_content.s5_theme
|
@content.options[:renderer].s5_theme = my_content.s5_theme
|
||||||
sanitize_xhtml(my_content.to_s5)
|
sanitize_xhtml(my_content.to_s5)
|
||||||
else
|
else
|
||||||
html = sanitize_xhtml(Maruku.new(@content.delete("\r"),
|
html = sanitize_rexml(Maruku.new(@content.delete("\r"),
|
||||||
{:math_enabled => true,
|
{:math_enabled => true,
|
||||||
:math_numbered => ['\\[','\\begin{equation}']}).to_html)
|
:math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
|
||||||
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -110,7 +110,7 @@ module Sanitize
|
||||||
# sanitize_rexml(tree) -> string
|
# sanitize_rexml(tree) -> string
|
||||||
#
|
#
|
||||||
def sanitize_rexml(tree)
|
def sanitize_rexml(tree)
|
||||||
tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8)
|
tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
|
||||||
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
||||||
:space_before_trailing_solidus => true,
|
:space_before_trailing_solidus => true,
|
||||||
:inject_meta_charset => false,
|
:inject_meta_charset => false,
|
||||||
|
@ -2333,13 +2333,14 @@ module REXML #:nodoc:
|
||||||
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||||
# access the resulting REXML document.
|
# access the resulting REXML document.
|
||||||
def to_ncr
|
def to_ncr
|
||||||
XPath.each(self, '//*') { |el|
|
self.each_element { |el|
|
||||||
el.texts.each_index {|i|
|
el.texts.each_index {|i|
|
||||||
el.texts[i].value = el.texts[i].to_s.to_ncr
|
el.texts[i].value = el.texts[i].to_s.to_ncr
|
||||||
}
|
}
|
||||||
el.attributes.each { |name,val|
|
el.attributes.each { |name,val|
|
||||||
el.attributes[name] = val.to_ncr
|
el.attributes[name] = val.to_ncr
|
||||||
}
|
}
|
||||||
|
el.to_ncr if el.has_elements?
|
||||||
}
|
}
|
||||||
return self
|
return self
|
||||||
end
|
end
|
||||||
|
@ -2350,16 +2351,84 @@ module REXML #:nodoc:
|
||||||
# tree.to_utf8 -> REXML::Element
|
# tree.to_utf8 -> REXML::Element
|
||||||
#
|
#
|
||||||
def to_utf8
|
def to_utf8
|
||||||
XPath.each(self, '//*') { |el|
|
self.each_element { |el|
|
||||||
el.texts.each_index {|i|
|
el.texts.each_index {|i|
|
||||||
el.texts[i].value = el.texts[i].to_s.to_utf8
|
el.texts[i].value = el.texts[i].to_s.to_utf8
|
||||||
}
|
}
|
||||||
el.attributes.each { |name,val|
|
el.attributes.each { |name,val|
|
||||||
el.attributes[name] = val.to_utf8
|
el.attributes[name] = val.to_utf8
|
||||||
}
|
}
|
||||||
|
el.to_utf8 if el.has_elements?
|
||||||
}
|
}
|
||||||
return self
|
return self
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
module HTML5 #:nodoc:
|
||||||
|
module TreeWalkers
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
class << self
|
||||||
|
def [](name)
|
||||||
|
case name.to_s.downcase
|
||||||
|
when 'rexml'
|
||||||
|
require 'html5/treewalkers/rexml'
|
||||||
|
REXML::TreeWalker
|
||||||
|
when 'rexml2'
|
||||||
|
REXML2::TreeWalker
|
||||||
|
else
|
||||||
|
raise "Unknown TreeWalker #{name}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
alias :get_tree_walker :[]
|
||||||
|
end
|
||||||
|
|
||||||
|
module REXML2
|
||||||
|
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def node_details(node)
|
||||||
|
case node
|
||||||
|
when ::REXML::Document
|
||||||
|
[:DOCUMENT]
|
||||||
|
when ::REXML::Element
|
||||||
|
if !node.name
|
||||||
|
[:DOCUMENT_FRAGMENT]
|
||||||
|
else
|
||||||
|
[:ELEMENT, node.name,
|
||||||
|
node.attributes.map {|name,value| [name,value.to_utf8]},
|
||||||
|
node.has_elements? || node.has_text?]
|
||||||
|
end
|
||||||
|
when ::REXML::Text
|
||||||
|
[:TEXT, node.value.to_utf8]
|
||||||
|
when ::REXML::Comment
|
||||||
|
[:COMMENT, node.string]
|
||||||
|
when ::REXML::DocType
|
||||||
|
[:DOCTYPE, node.name, node.public, node.system]
|
||||||
|
when ::REXML::XMLDecl
|
||||||
|
[nil]
|
||||||
|
else
|
||||||
|
[:UNKNOWN, node.class.inspect]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def first_child(node)
|
||||||
|
node.children.first
|
||||||
|
end
|
||||||
|
|
||||||
|
def next_sibling(node)
|
||||||
|
node.next_sibling
|
||||||
|
end
|
||||||
|
|
||||||
|
def parent(node)
|
||||||
|
node.parent
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
|
@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
# wikiwords are invalid as styles, must be in "name: value" form
|
# wikiwords are invalid as styles, must be in "name: value" form
|
||||||
def test_content_with_wikiword_in_style_tag
|
def test_content_with_wikiword_in_style_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>That is some <em style=''>Stylish Emphasis</em></p>",
|
'<p>That is some <em style="">Stylish Emphasis</em></p>',
|
||||||
'That is some <em style="WikiWord">Stylish Emphasis</em>')
|
'That is some <em style="WikiWord">Stylish Emphasis</em>')
|
||||||
end
|
end
|
||||||
|
|
||||||
# validates format of style..
|
# validates format of style..
|
||||||
def test_content_with_valid_style_in_style_tag
|
def test_content_with_valid_style_in_style_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>That is some <em style='text-align: right;'>Stylish Emphasis</em></p>",
|
'<p>That is some <em style="text-align: right;">Stylish Emphasis</em></p>',
|
||||||
'That is some <em style="text-align: right">Stylish Emphasis</em>')
|
'That is some <em style="text-align: right">Stylish Emphasis</em>')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
|
|
||||||
def test_content_with_link_in_parentheses
|
def test_content_with_link_in_parentheses
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>(<a href='http://wiki.org/wiki.cgi?WhatIsWiki'>What is a wiki?</a>)</p>",
|
'<p>(<a href="http://wiki.org/wiki.cgi?WhatIsWiki">What is a wiki?</a>)</p>',
|
||||||
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
|
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_content_with_image_link
|
def test_content_with_image_link
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is a Markdown image link.</p>",
|
'<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is a Markdown image link.</p>',
|
||||||
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
|
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_content_with_inlined_img_tag
|
def test_content_with_inlined_img_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is an inline image link.</p>",
|
'<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is an inline image link.</p>',
|
||||||
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
||||||
|
|
||||||
# currently, upper case HTML elements are not allowed
|
# currently, upper case HTML elements are not allowed
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
'<p>This <IMG SRC="http://hobix.com/sample.jpg" alt=""/> is an inline image link.</p>',
|
'<p>This <IMG SRC="http://hobix.com/sample.jpg" alt=""></IMG> is an inline image link.</p>',
|
||||||
'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
'This <IMG SRC="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
EOL
|
EOL
|
||||||
|
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<ul>\n<li><a href='~b'>a</a></li>\n\n<li>c~ d</li>\n</ul>",
|
"<ul>\n<li><a href=\"~b\">a</a></li>\n\n<li>c~ d</li>\n</ul>",
|
||||||
list_with_tildas)
|
list_with_tildas)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue