Sync with trunk

2007-10-15 07:16:33 -04:00 · 2007-10-15 07:16:33 -04:00 · 7521a073b2
commit 7521a073b2
parent 1cc2043cf6 de125367b0
4 changed files with 200 additions and 15 deletions
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -57,8 +57,15 @@ module Sanitize
        instance_variable_set("@#{name}", value)
      end
    end
-    parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+    if @encoding == 'utf-8'
+      parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+        :lowercase_element_name => false, :lowercase_attr_name => false,
        :encoding => @encoding, :tree => @treebuilder })
+    else
+      parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
+        :lowercase_element_name => false, :lowercase_attr_name => false,
+        :encoding => @encoding, :tree => @treebuilder })
+    end      
    return parsed if @to_tree
    return parsed.to_s
  end
@ -86,8 +93,13 @@ module Sanitize
        instance_variable_set("@#{name}", value)
      end
    end
+    if @encoding == 'utf-8'
+      parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
+        :encoding => @encoding, :tree => @treebuilder })
+    else
      parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
        :encoding => @encoding, :tree => @treebuilder })
+    end 
    return parsed if @to_tree
    return parsed.to_s
  end
@ -98,7 +110,7 @@ module Sanitize
 #    sanitize_rexml(tree)                    -> string
 #
  def sanitize_rexml(tree)
-    tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
+    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
@ -2254,7 +2266,7 @@ class String
  }
 #:startdoc:

-# Converts XHTML+MathML named entities to Numeric Character References
+# Converts XHTML+MathML named entities in string to Numeric Character References
 #
 #  :call-seq:
 #     string.to_ncr  -> string
@ -2263,16 +2275,37 @@ class String
       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end

-# Converts XHTML+MathML named entities to Numeric Character References
+# Converts XHTML+MathML named entities in string to Numeric Character References
 #
 #  :call-seq:
 #     string.to_ncr!  -> str or nil
 #
 # Substitution is done in-place.
+#
    def to_ncr!
       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end

+# Converts XHTML+MathML named entities in string to UTF-8
+#
+#  :call-seq:
+#     string.to_utf8  -> string
+#
+    def to_utf8
+       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+    end
+
+# Converts XHTML+MathML named entities in string to UTF-8
+#
+#  :call-seq:
+#     string.to_utf8!  -> str or nil
+#
+# Substitution is done in-place.
+#
+    def to_utf8!
+       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+    end
+
  protected

    def convert_to_ncr #:nodoc:
@ -2281,6 +2314,13 @@ class String
      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";"
    end

+    def convert_to_utf8 #:nodoc:
+      self =~ /^&([a-zA-Z0-9]+);$/
+      name = $1
+      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&amp;" + name + ";"
+    end
+
+
 end

 require 'rexml/element'
@ -2294,16 +2334,112 @@ module REXML #:nodoc:
 #
 # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
 # access the resulting REXML document.
+#
+# Note that this method needs to traverse the entire tree, converting text nodes and attributes
+# for each element. This can be SLOW. It will often be faster to serialize to a string and then
+# use String#to_ncr instead.
+#
    def to_ncr
-      XPath.each(self, '//*') { |el|
+      self.each_element { |el|
        el.texts.each_index  {|i|
          el.texts[i].value = el.texts[i].to_s.to_ncr
        }
        el.attributes.each { |name,val|
          el.attributes[name] = val.to_ncr
        }
+        el.to_ncr if el.has_elements?
      }
      return self
    end
+    
+# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
+#
+#  :call-seq:
+#     tree.to_utf8  -> REXML::Element
+#
+# Note that this method needs to traverse the entire tree, converting text nodes and attributes 
+# for each element. This can be SLOW. It will often be faster to serialize to a string and then
+# use String#to_utf8 instead.
+#
+    def to_utf8
+      self.each_element { |el|
+        el.texts.each_index  {|i|
+          el.texts[i].value = el.texts[i].to_s.to_utf8
+        }
+        el.attributes.each { |name,val|
+          el.attributes[name] = val.to_utf8
+        }
+        el.to_utf8 if el.has_elements?
+      }
+      return self
+    end
+
+  end
+end
+
+module HTML5 #:nodoc: all
+  module TreeWalkers
+
+    private
+
+    class << self
+      def [](name)
+        case name.to_s.downcase
+        when 'rexml'
+          require 'html5/treewalkers/rexml'
+          REXML::TreeWalker
+        when 'rexml2'
+          REXML2::TreeWalker
+        else
+          raise "Unknown TreeWalker #{name}"
+        end
+      end
+
+      alias :get_tree_walker :[]
+    end
+
+    module REXML2
+      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
+
+        private
+
+        def node_details(node)
+          case node
+          when ::REXML::Document
+            [:DOCUMENT]
+          when ::REXML::Element
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value.to_utf8]},
+                node.has_elements? || node.has_text?]
+            end
+          when ::REXML::Text
+            [:TEXT, node.value.to_utf8]
+          when ::REXML::Comment
+            [:COMMENT, node.string]
+          when ::REXML::DocType
+            [:DOCTYPE, node.name, node.public, node.system]
+          when ::REXML::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_sibling
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
  end
 end
--- a/test/functional/wiki_controller_test.rb
+++ b/test/functional/wiki_controller_test.rb
@ -680,10 +680,25 @@ class WikiControllerTest < Test::Unit::TestCase
 %
 % Unresolved  issues:
 %
-%   \binom{}{}
-%
 %   \righttoleftarrow
 %   \lefttorightarrow
+%
+%   \color{} with HTML colorspec
+%   \bgcolor
+%   \array
+
+% Of  the standard HTML named colors, white, black, red, green, blue and yellow
+% are  predefined in the color package. Here are the rest.
+\definecolor{aqua}{rgb}{0, 1.0, 1.0}
+\definecolor{fuschia}{rgb}{1.0, 0, 1.0}
+\definecolor{gray}{rgb}{0.502, 0.502, 0.502}
+\definecolor{lime}{rgb}{0, 1.0, 0}
+\definecolor{maroon}{rgb}{0.502, 0, 0}
+\definecolor{navy}{rgb}{0, 0, 0.502}
+\definecolor{olive}{rgb}{0.502, 0.502, 0}
+\definecolor{purple}{rgb}{0.502, 0, 0.502}
+\definecolor{silver}{rgb}{0.753, 0.753, 0.753}
+\definecolor{teal}{rgb}{0, 0.502, 0.502}

 % Because  of conflicts, \space and \mathop are converted to
 % \itexspace  and \operatorname during preprocessing.
@ -842,6 +857,8 @@ class WikiControllerTest < Test::Unit::TestCase
 \renewcommand{\scriptsize}{\scriptstyle}
 \newcommand{\scriptscriptsize}{\scriptscriptstyle}
 \newcommand{\mathfr}{\mathfrak}
+\newcommand{\statusline}[2]{#2}
+\newcommand{\toggle}[2]{#1}

 %-------------------------------------------------------------------

--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@ -164,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
  # wikiwords are invalid as styles, must be in "name: value" form
  def test_content_with_wikiword_in_style_tag
    assert_markup_parsed_as(
-        "<p>That is some <em style=\"\">Stylish Emphasis</em></p>", 
+        '<p>That is some <em style="">Stylish Emphasis</em></p>', 
 	    'That is some <em style="WikiWord">Stylish Emphasis</em>')
  end
 
  # validates format of style..
  def test_content_with_valid_style_in_style_tag
    assert_markup_parsed_as(
-        "<p>That is some <em style=\"text-align: right;\">Stylish Emphasis</em></p>", 
+        '<p>That is some <em style="text-align: right;">Stylish Emphasis</em></p>', 
 	    'That is some <em style="text-align: right">Stylish Emphasis</em>')
  end
  
@ -199,19 +199,19 @@ class PageRendererTest < Test::Unit::TestCase
  
  def test_content_with_link_in_parentheses
    assert_markup_parsed_as(
-      "<p>(<a href=\"http://wiki.org/wiki.cgi?WhatIsWiki\">What is a wiki?</a>)</p>",
+      '<p>(<a href="http://wiki.org/wiki.cgi?WhatIsWiki">What is a wiki?</a>)</p>',
      '([What is a wiki?](http://wiki.org/wiki.cgi?WhatIsWiki))')
  end
  
  def test_content_with_image_link
    assert_markup_parsed_as( 
-      "<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is a Markdown image link.</p>", 
+      '<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is a Markdown image link.</p>', 
      'This ![](http://hobix.com/sample.jpg) is a Markdown image link.')
  end
  
  def test_content_with_inlined_img_tag
    assert_markup_parsed_as( 
-      "<p>This <img alt=\"\" src=\"http://hobix.com/sample.jpg\" /> is an inline image link.</p>", 
+      '<p>This <img alt="" src="http://hobix.com/sample.jpg" /> is an inline image link.</p>', 
      'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
       
    # currently, upper case HTML elements are not allowed
--- a/test/unit/sanitize_test.rb
+++ b/test/unit/sanitize_test.rb
@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
+require 'sanitize'
+
+class SanitizeTest < Test::Unit::TestCase
+
+  def setup
+
+  end
+
+  def rexml_doc(string)
+    REXML::Document.new(
+      "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>")
+  end
+  
+  def my_rex(string)
+    sanitize_rexml(rexml_doc(string)).gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
+  end
+
+  def test_sanitize_named_entities
+    input = '<p>Greek &phi;, double-struck &Aopf;, numeric &#x1D538; &#8279;</p>'
+    output = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
+    output2 = "<p>Greek \317\225, double-struck \360\235\224\270, numeric &#x1D538; &#8279;</p>"
+    assert_equal(output, sanitize_xhtml(input))
+    assert_equal(output, sanitize_html(input))
+    assert_equal(output, my_rex(input))
+    assert_equal(output2, input.to_utf8)
+  end
+
+
+end