diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb
index 7d82a18d..84a08a89 100644
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -77,9 +77,9 @@ module Engines
@content.options[:renderer].s5_theme = my_content.s5_theme
sanitize_xhtml(my_content.to_s5)
else
- html = sanitize_rexml(Maruku.new(@content.delete("\r"),
+ html = sanitize_xhtml(Maruku.new(@content.delete("\r"),
{:math_enabled => true,
- :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
+ :math_numbered => ['\\[','\\begin{equation}']}).to_html)
html.gsub(/\A
\n?(.*?)\n?<\/div>\Z/m, '\1')
end
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index b9fa2449..92945de6 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -57,8 +57,15 @@ module Sanitize
instance_variable_set("@#{name}", value)
end
end
- parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
- :encoding => @encoding, :tree => @treebuilder })
+ if @encoding == 'utf-8'
+ parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+ :lowercase_element_name => false, :lowercase_attr_name => false,
+ :encoding => @encoding, :tree => @treebuilder })
+ else
+ parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+ :lowercase_element_name => false, :lowercase_attr_name => false,
+ :encoding => @encoding, :tree => @treebuilder })
+ end
return parsed if @to_tree
return parsed.to_s
end
@@ -86,8 +93,13 @@ module Sanitize
instance_variable_set("@#{name}", value)
end
end
- parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
- :encoding => @encoding, :tree => @treebuilder })
+ if @encoding == 'utf-8'
+ parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+ :encoding => @encoding, :tree => @treebuilder })
+ else
+ parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+ :encoding => @encoding, :tree => @treebuilder })
+ end
return parsed if @to_tree
return parsed.to_s
end
@@ -98,7 +110,7 @@ module Sanitize
# sanitize_rexml(tree) -> string
#
def sanitize_rexml(tree)
- tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
+ tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_utf8)
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:space_before_trailing_solidus => true,
:inject_meta_charset => false,
@@ -2273,6 +2285,25 @@ class String
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
end
+# Replaces XHTML+MathML named entities with their UTF-8 characters.
+#
+# :call-seq:
+#    string.to_utf8  -> string
+#
+# The five XML built-ins (lt, gt, amp, quot, apos) are kept as written;
+# every other named entity is handed to convert_to_utf8. A new string is
+# returned and the receiver is left untouched.
+  def to_utf8
+    gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
+      Regexp.last_match(1) ? entity : entity.convert_to_utf8
+    end
+  end
+
+# Converts XHTML+MathML named entities to UTF-8, modifying the receiver.
+#
+# :call-seq:
+#    string.to_utf8!  -> str or nil
+#
+# Substitution is done in-place. The five XML built-ins (lt, gt, amp, quot,
+# apos) are kept as written; every other named entity is handed to
+# convert_to_utf8. Because this delegates to gsub!, the result is nil when
+# the pattern matched nothing.
+  def to_utf8!
+    gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
+      Regexp.last_match(1) ? entity : entity.convert_to_utf8
+    end
+  end
+
protected
def convert_to_ncr #:nodoc:
@@ -2281,6 +2312,13 @@ class String
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
end
+# Translates a single named entity ("&name;") into UTF-8 text.
+#
+# The name is looked up in MATHML_ENTITIES. Each ';'-separated piece of the
+# stored value is read as one hexadecimal code point and the code points are
+# packed into a UTF-8 string. Names missing from the table are returned as
+# "&name;"; a receiver that is not a lone entity is returned unchanged.
+# NOTE(review): the "&#x...;" shape of the table values is inferred from
+# convert_to_ncr returning the same entries verbatim -- confirm against the
+# table itself.
+  def convert_to_utf8 #:nodoc:
+    self =~ /^&([a-zA-Z0-9]+);$/
+    name = $1
+    return self unless name
+    return "&" + name + ";" unless MATHML_ENTITIES.has_key?(name)
+    # Strip the "&#x" lead-in before calling String#hex: hex yields 0 for any
+    # text that does not begin with hex digits, which would pack to U+0000.
+    codepoints = MATHML_ENTITIES[name].split(';').collect { |ref| ref.sub(/\A&#x/, '').hex }
+    codepoints.pack('U*')
+  end
+
+
end
require 'rexml/element'
@@ -2305,5 +2343,23 @@ module REXML #:nodoc:
}
return self
end
+
+# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
+#
+# :call-seq:
+#    tree.to_utf8  -> REXML::Element
+#
+# Rewrites, in place, the text children and attribute values of every
+# element selected by the XPath '//*', passing each through String#to_utf8.
+# Returns the receiver.
+    def to_utf8
+      XPath.each(self, '//*') do |el|
+        # Walk the text children once rather than calling el.texts again for
+        # every index; the same Text objects are updated either way.
+        el.texts.each { |text| text.value = text.to_s.to_utf8 }
+        el.attributes.each { |name, val| el.attributes[name] = val.to_utf8 }
+      end
+      self
+    end
+
end
end
diff --git a/test/functional/wiki_controller_test.rb b/test/functional/wiki_controller_test.rb
index 5dd3d218..2ba37c92 100755
--- a/test/functional/wiki_controller_test.rb
+++ b/test/functional/wiki_controller_test.rb
@@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase
%
% Unresolved issues:
%
-% \binom{}{}
-%
% \righttoleftarrow
% \lefttorightarrow
+%
+% \color{} with HTML colorspec
+% \bgcolor
+% \array
+
+% Of the standard HTML named colors, white, black, red, green, blue and yellow
+% are predefined in the color package. Here are the rest.
+\definecolor{aqua}{rgb}{0, 1.0, 1.0}
+\definecolor{fuschia}{rgb}{1.0, 0, 1.0}
+\definecolor{gray}{rgb}{0.502, 0.502, 0.502}
+\definecolor{lime}{rgb}{0, 1.0, 0}
+\definecolor{maroon}{rgb}{0.502, 0, 0}
+\definecolor{navy}{rgb}{0, 0, 0.502}
+\definecolor{olive}{rgb}{0.502, 0.502, 0}
+\definecolor{purple}{rgb}{0.502, 0, 0.502}
+\definecolor{silver}{rgb}{0.753, 0.753, 0.753}
+\definecolor{teal}{rgb}{0, 0.502, 0.502}
% Because of conflicts, \space and \mathop are converted to
% \itexspace and \operatorname during preprocessing.
@@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase
\renewcommand{\scriptsize}{\scriptstyle}
\newcommand{\scriptscriptsize}{\scriptscriptstyle}
\newcommand{\mathfr}{\mathfrak}
+\newcommand{\statusline}[2]{#2}
+\newcommand{\toggle}[2]{#1}
%-------------------------------------------------------------------
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index 471c4c07..48d836ee 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
# wikiwords are invalid as styles, must be in "name: value" form
def test_content_with_wikiword_in_style_tag
assert_markup_parsed_as(
- "
That is some Stylish Emphasis
",
+ "
That is some Stylish Emphasis
",
'That is some
Stylish Emphasis')
end
# validates format of style..
def test_content_with_valid_style_in_style_tag
assert_markup_parsed_as(
- "
That is some Stylish Emphasis
",
+ "
That is some Stylish Emphasis
",
'That is some
Stylish Emphasis')
end
@@ -199,24 +199,24 @@ class PageRendererTest < Test::Unit::TestCase
def test_content_with_link_in_parentheses
assert_markup_parsed_as(
- "
(What is a wiki?)
",
+ "
(What is a wiki?)
",
'([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
end
def test_content_with_image_link
assert_markup_parsed_as(
- "
This is a Markdown image link.
",
+ "
This is a Markdown image link.
",
'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
end
def test_content_with_inlined_img_tag
assert_markup_parsed_as(
- "
This is an inline image link.
",
+ "
This is an inline image link.
",
'This
is an inline image link.')
# currently, upper case HTML elements are not allowed
assert_markup_parsed_as(
- '
This <IMG SRC="http://hobix.com/sample.jpg" alt=""></IMG> is an inline image link.
',
+ '
This <IMG SRC="http://hobix.com/sample.jpg" alt=""/> is an inline image link.
',
'This
is an inline image link.')
end
@@ -361,7 +361,7 @@ class PageRendererTest < Test::Unit::TestCase
EOL
assert_markup_parsed_as(
- "
",
+ "
",
list_with_tildas)
end
diff --git a/test/unit/sanitize_test.rb b/test/unit/sanitize_test.rb
new file mode 100644
index 00000000..ced2276f
--- /dev/null
+++ b/test/unit/sanitize_test.rb
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
+require 'sanitize'
+
+# Unit tests for named-entity handling in sanitize_xhtml, sanitize_html,
+# sanitize_rexml and String#to_utf8.
+#
+# NOTE(review): several string and regexp literals below continue onto lines
+# that carry no diff '+' marker, which suggests markup was stripped from them
+# when this patch was captured -- compare with upstream before relying on
+# the exact literal contents.
+class SanitizeTest < Test::Unit::TestCase
+
+ # No fixtures are required.
+ def setup
+
+ end
+
+ # Builds a REXML::Document from a string literal that embeds +string+.
+ def rexml_doc(string)
+ REXML::Document.new(
+ "
#{string}
")
+ end
+
+ # Runs sanitize_rexml over rexml_doc(string); when the whole result matches
+ # the pattern (which ends in a closing </div>), only the captured inner
+ # text is returned, otherwise the result is returned as-is.
+ def my_rex(string)
+ sanitize_rexml(rexml_doc(string)).gsub(/\A
(.*)<\/div>\Z/m, '\1')
+ end
+
+ # All three sanitizers are expected to yield +output+, while a bare
+ # String#to_utf8 call is expected to yield +output2+.
+ # The octal escapes are UTF-8 byte sequences: \317\225 is U+03D5,
+ # \360\235\224\270 is U+1D538 and \342\201\227 is U+2057.
+ def test_sanitize_named_entities
+ input = '
Greek φ, double-struck 𝔸, numeric 𝔸 ⁗
'
+ output = "
Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227
"
+ output2 = "
Greek \317\225, double-struck \360\235\224\270, numeric 𝔸 ⁗
"
+ assert_equal(output, sanitize_xhtml(input))
+ assert_equal(output, sanitize_html(input))
+ assert_equal(output, my_rex(input))
+ assert_equal(output2, input.to_utf8)
+ end
+
+
+end