Sync with trunk
This commit is contained in:
commit
7521a073b2
152
lib/sanitize.rb
152
lib/sanitize.rb
|
@ -57,8 +57,15 @@ module Sanitize
|
||||||
instance_variable_set("@#{name}", value)
|
instance_variable_set("@#{name}", value)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
if @encoding == 'utf-8'
|
||||||
:encoding => @encoding, :tree => @treebuilder })
|
parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||||
|
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||||
|
:encoding => @encoding, :tree => @treebuilder })
|
||||||
|
else
|
||||||
|
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||||
|
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||||
|
:encoding => @encoding, :tree => @treebuilder })
|
||||||
|
end
|
||||||
return parsed if @to_tree
|
return parsed if @to_tree
|
||||||
return parsed.to_s
|
return parsed.to_s
|
||||||
end
|
end
|
||||||
|
@ -86,8 +93,13 @@ module Sanitize
|
||||||
instance_variable_set("@#{name}", value)
|
instance_variable_set("@#{name}", value)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
if @encoding == 'utf-8'
|
||||||
:encoding => @encoding, :tree => @treebuilder })
|
parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||||
|
:encoding => @encoding, :tree => @treebuilder })
|
||||||
|
else
|
||||||
|
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||||
|
:encoding => @encoding, :tree => @treebuilder })
|
||||||
|
end
|
||||||
return parsed if @to_tree
|
return parsed if @to_tree
|
||||||
return parsed.to_s
|
return parsed.to_s
|
||||||
end
|
end
|
||||||
|
@ -98,7 +110,7 @@ module Sanitize
|
||||||
# sanitize_rexml(tree) -> string
|
# sanitize_rexml(tree) -> string
|
||||||
#
|
#
|
||||||
def sanitize_rexml(tree)
|
def sanitize_rexml(tree)
|
||||||
tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
|
tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
|
||||||
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
||||||
:space_before_trailing_solidus => true,
|
:space_before_trailing_solidus => true,
|
||||||
:inject_meta_charset => false,
|
:inject_meta_charset => false,
|
||||||
|
@ -2254,7 +2266,7 @@ class String
|
||||||
}
|
}
|
||||||
#:startdoc:
|
#:startdoc:
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities to Numeric Character References
|
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||||
#
|
#
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
# string.to_ncr -> string
|
# string.to_ncr -> string
|
||||||
|
@ -2263,16 +2275,37 @@ class String
|
||||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities to Numeric Character References
|
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||||
#
|
#
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
# string.to_ncr! -> str or nil
|
# string.to_ncr! -> str or nil
|
||||||
#
|
#
|
||||||
# Substitution is done in-place.
|
# Substitution is done in-place.
|
||||||
|
#
|
||||||
def to_ncr!
|
def to_ncr!
|
||||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# string.to_utf8 -> string
|
||||||
|
#
|
||||||
|
def to_utf8
  # The five predefined XML entities populate capture group 1 and are kept
  # verbatim; any other "&name;" match is handed to convert_to_utf8.
  entity_pattern = /&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/
  gsub(entity_pattern) do |entity|
    $1 ? entity : entity.convert_to_utf8
  end
end
|
||||||
|
|
||||||
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# string.to_utf8! -> str or nil
|
||||||
|
#
|
||||||
|
# Substitution is done in-place.
|
||||||
|
#
|
||||||
|
def to_utf8!
  # In-place counterpart of to_utf8: the result is whatever gsub! returns.
  # The five predefined XML entities (capture group 1) are left untouched.
  entity_pattern = /&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/
  gsub!(entity_pattern) do |entity|
    $1 ? entity : entity.convert_to_utf8
  end
end
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
||||||
def convert_to_ncr #:nodoc:
|
def convert_to_ncr #:nodoc:
|
||||||
|
@ -2281,6 +2314,13 @@ class String
|
||||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def convert_to_utf8 #:nodoc:
  # Pull the bare entity name out of a string of the form "&name;".
  self =~ /^&([a-zA-Z0-9]+);$/
  name = $1

  # Names missing from MATHML_ENTITIES are re-emitted as "&name;".
  unless MATHML_ENTITIES.has_key?(name)
    return "&" + name + ";"
  end

  # The table entry is split on ';' and each "&#xHHHH" piece is reduced to
  # its hex digits, giving one codepoint per piece; the codepoints are then
  # packed into a UTF-8 string.
  codepoints = MATHML_ENTITIES[name].split(';').map do |ncr|
    ncr.gsub(/^&#x([A-F0-9]+)$/, '\1').hex
  end
  codepoints.pack('U*')
end
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
require 'rexml/element'
|
require 'rexml/element'
|
||||||
|
@ -2294,16 +2334,112 @@ module REXML #:nodoc:
|
||||||
#
|
#
|
||||||
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||||
# access the resulting REXML document.
|
# access the resulting REXML document.
|
||||||
|
#
|
||||||
|
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
|
||||||
|
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
|
||||||
|
# use String.to_ncr instead.
|
||||||
|
#
|
||||||
def to_ncr
|
def to_ncr
|
||||||
XPath.each(self, '//*') { |el|
|
self.each_element { |el|
|
||||||
el.texts.each_index {|i|
|
el.texts.each_index {|i|
|
||||||
el.texts[i].value = el.texts[i].to_s.to_ncr
|
el.texts[i].value = el.texts[i].to_s.to_ncr
|
||||||
}
|
}
|
||||||
el.attributes.each { |name,val|
|
el.attributes.each { |name,val|
|
||||||
el.attributes[name] = val.to_ncr
|
el.attributes[name] = val.to_ncr
|
||||||
}
|
}
|
||||||
|
el.to_ncr if el.has_elements?
|
||||||
}
|
}
|
||||||
return self
|
return self
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# tree.to_utf8 -> REXML::Element
|
||||||
|
#
|
||||||
|
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
|
||||||
|
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
|
||||||
|
# use String.to_utf8 instead.
|
||||||
|
#
|
||||||
|
def to_utf8
  # Visit each element yielded by each_element; deeper levels are reached
  # through the recursive call at the bottom of the loop.
  # NOTE(review): this loop never rewrites the receiver's own text nodes or
  # attributes, only those of the elements it yields -- confirm that is intended.
  each_element do |child|
    # Rewrite every text node of the child with named entities converted.
    child.texts.each do |text_node|
      text_node.value = text_node.to_s.to_utf8
    end

    # Re-assign each attribute with its converted value.
    child.attributes.each do |attr_name, attr_value|
      child.attributes[attr_name] = attr_value.to_utf8
    end

    child.to_utf8 if child.has_elements?
  end
  self
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
module HTML5 #:nodoc: all
|
||||||
|
module TreeWalkers
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
class << self
  # Returns the tree walker class registered under +name+ (compared
  # case-insensitively after to_s). Raises a RuntimeError for any name
  # other than 'rexml' or 'rexml2'.
  def [](name)
    walker_key = name.to_s.downcase
    if walker_key == 'rexml'
      # The stock REXML walker is loaded on demand.
      require 'html5/treewalkers/rexml'
      REXML::TreeWalker
    elsif walker_key == 'rexml2'
      REXML2::TreeWalker
    else
      raise "Unknown TreeWalker #{name}"
    end
  end

  alias :get_tree_walker :[]
end
|
||||||
|
|
||||||
|
module REXML2
  # Tree walker over REXML nodes that runs attribute values and text through
  # String#to_utf8 before reporting them.
  class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

    private

    # Describes +node+ as an array whose first entry is the node kind.
    # NOTE(review): ::REXML::Document is tested before ::REXML::Element;
    # Document is understood to subclass Element in REXML, so the order of
    # the branches matters -- confirm before reordering.
    def node_details(node)
      case node
      when ::REXML::Document
        [:DOCUMENT]
      when ::REXML::Element
        # An element without a name is reported as a document fragment.
        return [:DOCUMENT_FRAGMENT] unless node.name

        tag_name = node.name
        converted_attrs = node.attributes.map {|name,value| [name,value.to_utf8]}
        has_content = node.has_elements? || node.has_text?
        [:ELEMENT, tag_name, converted_attrs, has_content]
      when ::REXML::Text
        [:TEXT, node.value.to_utf8]
      when ::REXML::Comment
        [:COMMENT, node.string]
      when ::REXML::DocType
        [:DOCTYPE, node.name, node.public, node.system]
      when ::REXML::XMLDecl
        [nil]
      else
        [:UNKNOWN, node.class.inspect]
      end
    end

    # Navigation helpers used by the walker: first child, next sibling, parent.
    def first_child(node)
      node.children.first
    end

    def next_sibling(node)
      node.next_sibling
    end

    def parent(node)
      node.parent
    end
  end
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase
|
||||||
%
|
%
|
||||||
% Unresolved issues:
|
% Unresolved issues:
|
||||||
%
|
%
|
||||||
% \binom{}{}
|
|
||||||
%
|
|
||||||
% \righttoleftarrow
|
% \righttoleftarrow
|
||||||
% \lefttorightarrow
|
% \lefttorightarrow
|
||||||
|
%
|
||||||
|
% \color{} with HTML colorspec
|
||||||
|
% \bgcolor
|
||||||
|
% \array
|
||||||
|
|
||||||
|
% Of the standard HTML named colors, white, black, red, green, blue and yellow
|
||||||
|
% are predefined in the color package. Here are the rest.
|
||||||
|
\definecolor{aqua}{rgb}{0, 1.0, 1.0}
|
||||||
|
\definecolor{fuschia}{rgb}{1.0, 0, 1.0}
|
||||||
|
\definecolor{gray}{rgb}{0.502, 0.502, 0.502}
|
||||||
|
\definecolor{lime}{rgb}{0, 1.0, 0}
|
||||||
|
\definecolor{maroon}{rgb}{0.502, 0, 0}
|
||||||
|
\definecolor{navy}{rgb}{0, 0, 0.502}
|
||||||
|
\definecolor{olive}{rgb}{0.502, 0.502, 0}
|
||||||
|
\definecolor{purple}{rgb}{0.502, 0, 0.502}
|
||||||
|
\definecolor{silver}{rgb}{0.753, 0.753, 0.753}
|
||||||
|
\definecolor{teal}{rgb}{0, 0.502, 0.502}
|
||||||
|
|
||||||
% Because of conflicts, \space and \mathop are converted to
|
% Because of conflicts, \space and \mathop are converted to
|
||||||
% \itexspace and \operatorname during preprocessing.
|
% \itexspace and \operatorname during preprocessing.
|
||||||
|
@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase
|
||||||
\renewcommand{\scriptsize}{\scriptstyle}
|
\renewcommand{\scriptsize}{\scriptstyle}
|
||||||
\newcommand{\scriptscriptsize}{\scriptscriptstyle}
|
\newcommand{\scriptscriptsize}{\scriptscriptstyle}
|
||||||
\newcommand{\mathfr}{\mathfrak}
|
\newcommand{\mathfr}{\mathfrak}
|
||||||
|
\newcommand{\statusline}[2]{#2}
|
||||||
|
\newcommand{\toggle}[2]{#1}
|
||||||
|
|
||||||
%-------------------------------------------------------------------
|
%-------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
# wikiwords are invalid as styles, must be in "name: value" form
|
# wikiwords are invalid as styles, must be in "name: value" form
|
||||||
def test_content_with_wikiword_in_style_tag
|
def test_content_with_wikiword_in_style_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>That is some <em style=\"\">Stylish Emphasis</em></p>",
|
'<p>That is some <em style="">Stylish Emphasis</em></p>',
|
||||||
'That is some <em style="WikiWord">Stylish Emphasis</em>')
|
'That is some <em style="WikiWord">Stylish Emphasis</em>')
|
||||||
end
|
end
|
||||||
|
|
||||||
# validates format of style..
|
# validates format of style..
|
||||||
def test_content_with_valid_style_in_style_tag
|
def test_content_with_valid_style_in_style_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>That is some <em style=\"text-align: right;\">Stylish Emphasis</em></p>",
|
'<p>That is some <em style="text-align: right;">Stylish Emphasis</em></p>',
|
||||||
'That is some <em style="text-align: right">Stylish Emphasis</em>')
|
'That is some <em style="text-align: right">Stylish Emphasis</em>')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -199,19 +199,19 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
|
|
||||||
def test_content_with_link_in_parentheses
|
def test_content_with_link_in_parentheses
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>(<a href=\"http://wiki.org/wiki.cgi?WhatIsWiki\">What is a wiki?</a>)</p>",
|
'<p>(<a href="http://wiki.org/wiki.cgi?WhatIsWiki">What is a wiki?</a>)</p>',
|
||||||
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
|
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_content_with_image_link
|
def test_content_with_image_link
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is a Markdown image link.</p>",
|
'<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is a Markdown image link.</p>',
|
||||||
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
|
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_content_with_inlined_img_tag
|
def test_content_with_inlined_img_tag
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is an inline image link.</p>",
|
'<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is an inline image link.</p>',
|
||||||
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
|
||||||
|
|
||||||
# currently, upper case HTML elements are not allowed
|
# currently, upper case HTML elements are not allowed
|
||||||
|
|
32
test/unit/sanitize_test.rb
Normal file
32
test/unit/sanitize_test.rb
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
|
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
|
||||||
|
require 'sanitize'
|
||||||
|
|
||||||
|
# Checks that named entities come out as UTF-8 from each sanitizer entry point.
# NOTE(review): sanitize_xhtml, sanitize_html and sanitize_rexml are called
# without a visible include in this class -- presumably mixed in elsewhere; confirm.
class SanitizeTest < Test::Unit::TestCase

  def setup

  end

  # Wraps +string+ in an XHTML-namespaced <div> and parses it with REXML.
  def rexml_doc(string)
    REXML::Document.new(
      "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>")
  end

  # Runs +string+ through sanitize_rexml and strips the wrapper <div> that
  # rexml_doc added, so the result is comparable with the other sanitizers.
  def my_rex(string)
    sanitize_rexml(rexml_doc(string)).gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
  end

  def test_sanitize_named_entities
    # The input must contain real entity references (two named, two numeric);
    # with literal characters here the test would not exercise the conversion.
    input = '<p>Greek &phi;, double-struck &Aopf;, numeric &#x1D538; &#x2057;</p>'
    # Expected when both named entities and numeric references end up as UTF-8.
    output = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
    # String#to_utf8 only matches named entities, so numeric references survive.
    output2 = "<p>Greek \317\225, double-struck \360\235\224\270, numeric &#x1D538; &#x2057;</p>"
    assert_equal(output, sanitize_xhtml(input))
    assert_equal(output, sanitize_html(input))
    assert_equal(output, my_rex(input))
    assert_equal(output2, input.to_utf8)
  end


end
|
Loading…
Reference in a new issue