Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer. Fixed string handling so that it works under both Ruby 1.8.x and Ruby 1.9.2. Inexplicably, two functional tests still fail, but the rest seems to work quite well.
2009-11-30 16:28:18 -06:00 · 2009-11-30 16:28:18 -06:00 · a6429f8c22
commit a6429f8c22
parent 79c8572053
142 changed files with 519 additions and 843 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -30,6 +30,7 @@ module Engines

  class Textile < AbstractEngine
    def mask
+      @content.as_utf8
      redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
      redcloth.filter_html = false
      redcloth.no_span_caps = false  
@ -39,6 +40,7 @@ module Engines

  class Markdown < AbstractEngine
    def mask
+      @content.as_utf8
      # If the request is for S5, call Maruku accordingly (without math)
      if @content.options[:mode] == :s5
        my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -56,6 +58,7 @@ module Engines

  class MarkdownMML < AbstractEngine
    def mask
+      @content.as_utf8
      # If the request is for S5, call Maruku accordingly
      if @content.options[:mode] == :s5
        my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -77,6 +80,7 @@ module Engines

  class MarkdownPNG < AbstractEngine
    def mask
+      @content.as_utf8
      # If the request is for S5, call Maruku accordingly
      if @content.options[:mode] == :s5
        my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -108,6 +112,7 @@ module Engines

  class Mixed < AbstractEngine
    def mask
+      @content.as_utf8
      redcloth = RedCloth.new(@content, @content.options[:engine_opts])
      redcloth.filter_html = false
      redcloth.no_span_caps = false
@ -117,6 +122,7 @@ module Engines

  class RDoc < AbstractEngine
    def mask
+      @content.as_utf8
      html = RDocSupport::RDocFormatter.new(@content).to_html
    end
  end
--- a/lib/chunks/nowiki.rb
+++ b/lib/chunks/nowiki.rb
@ -1,5 +1,5 @@
 require 'chunks/chunk'
-require 'sanitize'
+require 'sanitizer'

 # This chunk allows certain parts of a wiki page to be hidden from the
 # rest of the rendering pipeline. It should be run at the beginning
@ -17,7 +17,7 @@ require 'sanitize'

 class NoWiki < Chunk::Abstract

-  include Sanitize
+  include Sanitizer
  
  NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
  def self.pattern() NOWIKI_PATTERN end
@ -26,7 +26,7 @@ class NoWiki < Chunk::Abstract

  def initialize(match_data, content)
    super
-    @plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1])
+    @plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
  end

 end
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -1,262 +0,0 @@
-# == Introduction
-#
-# This module provides sanitization of XHTML+MathML+SVG 
-# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
-#
-# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
-# resemble that of browsers.
-#
-#  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
-#  sanitize_html() is a case-insensitive sanitizer suitable for HTML
-#  sanitize_rexml() sanitizes a REXML tree, returning a string
-#  safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
-#                        by running the output of sanitize_xhtml() through REXML
-#
-# == Files
-#
-# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
-# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
-#
-# == Author
-#
-# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
-#
-# ==  License
-#
-# Ruby License
-
-module Sanitize
-
-  require 'html5/html5parser'
-  require 'html5/liberalxmlparser'
-  require 'html5/treewalkers'
-  require 'html5/treebuilders'
-  require 'html5/serializer'
-  require 'html5/sanitizer'
-  require 'stringsupport.rb'
-
-  include HTML5
-
-# Sanitize a string, parsed using XHTML parsing rules.
-#
-# :call-seq:
-#    sanitize_xhtml(string)                    -> string
-#    sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-# By default, the output is a string. But, optionally, you can return a REXML tree.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
-  def sanitize_xhtml(html, options = {})
-    @encoding = 'utf-8'
-    @treebuilder = TreeBuilders::REXML::TreeBuilder
-    @to_tree = false
-    options.each do |name, value|
-      next unless %w(encoding treebuilder to_tree).include? name.to_s
-      if name.to_s == 'treebuilder'
-        @treebuilder =  HTML5lib::TreeBuilders.get_tree_builder(value)
-      else
-        instance_variable_set("@#{name}", value)
-      end
-    end
-    if @encoding == 'utf-8'
-      parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
-        :lowercase_element_name => false, :lowercase_attr_name => false,
-        :encoding => @encoding, :tree => @treebuilder })
-    else
-      parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
-        :lowercase_element_name => false, :lowercase_attr_name => false,
-        :encoding => @encoding, :tree => @treebuilder })
-    end      
-    return parsed if @to_tree
-    return parsed.to_s
-  end
-  
-# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
-#    ensure well-formedness. 
-#
-# :call-seq:
-#    safe_sanitize_xhtml(string)                    -> string
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
-  def safe_sanitize_xhtml(html, options = {})
-    options[:to_tree] = false
-    sanitized = sanitize_xhtml(html, options)
-    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
-    sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
-    rescue REXML::ParseException
-      sanitized = sanitized.escapeHTML
-  end 
-
-# Sanitize a string, parsed using HTML parsing rules.
-#
-# :call-seq:
-#    sanitize_html( string )                    ->  string
-#    sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) ->  REXML::Document
-#
-# Unless otherwise specified, the string is assumed to be utf-8 encoded.
-# By default, the output is a string. But, optionally, you can return a REXML tree.
-#
-# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
-# (REXML trees are always utf-8 encoded.)
-  def sanitize_html(html, options = {})
-    @encoding = 'utf-8'
-    @treebuilder = TreeBuilders::REXML::TreeBuilder
-    @to_tree = false
-    options.each do |name, value|
-      next unless %w(encoding treebuilder to_tree).include? name.to_s
-      if name.to_s == 'treebuilder'
-        @treebuilder =  HTML5lib::TreeBuilders.get_tree_builder(value)
-      else
-        instance_variable_set("@#{name}", value)
-      end
-    end
-    if @encoding == 'utf-8'
-      parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
-        :encoding => @encoding, :tree => @treebuilder })
-    else
-      parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
-        :encoding => @encoding, :tree => @treebuilder })
-    end 
-    return parsed if @to_tree
-    return parsed.to_s
-  end
-
-# Sanitize a REXML tree. The output is a string.
-#
-# :call-seq:
-#    sanitize_rexml(tree)                    -> string
-#
-  def sanitize_rexml(tree)
-    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
-    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
-      :space_before_trailing_solidus => true,
-      :inject_meta_charset => false,
-      :sanitize => true})
-  end
-end
-
-require 'rexml/element'
-module REXML #:nodoc:
-  class Element
-
-# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
-#
-#  :call-seq:
-#     tree.to_ncr  -> REXML::Element
-#
-# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
-# access the resulting REXML document.
-#
-# Note that this method needs to traverse the entire tree, converting text nodes and attributes
-# for each element. This can be SLOW. It will often be faster to serialize to a string and then
-# use String.to_ncr instead.
-#
-    def to_ncr
-      self.each_element { |el|
-        el.texts.each_index  {|i|
-          el.texts[i].value = el.texts[i].to_s.to_ncr
-        }
-        el.attributes.each { |name,val|
-          el.attributes[name] = val.to_ncr
-        }
-        el.to_ncr if el.has_elements?
-      }
-      return self
-    end
-    
-# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
-#
-#  :call-seq:
-#     tree.to_utf8  -> REXML::Element
-#
-# Note that this method needs to traverse the entire tree, converting text nodes and attributes 
-# for each element. This can be SLOW. It will often be faster to serialize to a string and then
-# use String.to_utf8 instead.
-#
-    def to_utf8
-      self.each_element { |el|
-        el.texts.each_index  {|i|
-          el.texts[i].value = el.texts[i].to_s.to_utf8
-        }
-        el.attributes.each { |name,val|
-          el.attributes[name] = val.to_utf8
-        }
-        el.to_utf8 if el.has_elements?
-      }
-      return self
-    end
-
-  end
-end
-
-module HTML5 #:nodoc: all
-  module TreeWalkers
-
-    private
-
-    class << self
-      def [](name)
-        case name.to_s.downcase
-        when 'rexml'
-          require 'html5/treewalkers/rexml'
-          REXML::TreeWalker
-        when 'rexml2'
-          REXML2::TreeWalker
-        else
-          raise "Unknown TreeWalker #{name}"
-        end
-      end
-
-      alias :get_tree_walker :[]
-    end
-
-    module REXML2
-      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
-
-        private
-
-        def node_details(node)
-          case node
-          when ::REXML::Document
-            [:DOCUMENT]
-          when ::REXML::Element
-            if !node.name
-              [:DOCUMENT_FRAGMENT]
-            else
-              [:ELEMENT, node.name,
-                node.attributes.map {|name,value| [name,value.to_utf8]},
-                node.has_elements? || node.has_text?]
-            end
-          when ::REXML::Text
-            [:TEXT, node.value.to_utf8]
-          when ::REXML::Comment
-            [:COMMENT, node.string]
-          when ::REXML::DocType
-            [:DOCTYPE, node.name, node.public, node.system]
-          when ::REXML::XMLDecl
-            [nil]
-          else
-            [:UNKNOWN, node.class.inspect]
-          end
-        end
-
-        def first_child(node)
-          node.children.first
-        end
-
-        def next_sibling(node)
-          node.next_sibling
-        end
-
-        def parent(node)
-          node.parent
-        end
-      end
-    end
-  end
-end
--- a/lib/sanitizer.rb
+++ b/lib/sanitizer.rb
@ -169,7 +169,7 @@ module Sanitizer
          node.attributes.delete attr; next
        end
        if ATTR_VAL_IS_URI.include?(attr)
-          val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
+          val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) 
            node.attributes.delete attr; next
          end                        
@ -206,4 +206,23 @@ module Sanitizer

      clean.join(' ')
    end
+    
+# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
+#    ensure well-formedness. 
+#
+# :call-seq:
+#    safe_xhtml_sanitize(string)                    -> string
+#
+# The string is assumed to be utf-8 encoded; it is passed through String#purify before being sanitized. The options argument is accepted but currently unused.
+#
+# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
+# (REXML trees are always utf-8 encoded.)
+  def safe_xhtml_sanitize(html, options = {})
+    sanitized = xhtml_sanitize(html.purify)
+    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
+    sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+    rescue REXML::ParseException
+      sanitized = sanitized.escapeHTML
+  end 
+
 end
--- a/lib/stringsupport.rb
+++ b/lib/stringsupport.rb
@ -2,6 +2,26 @@

 class String

+# A method to allow byte-oriented operations in both Ruby 1.8 and Ruby 1.9
+#
+# Under 1.8, this is a NOOP. Under 1.9, it sets the receiver's encoding to "ASCII-8BIT" in place. Either way, the receiver itself is returned.
+#--
+  def as_bytes
+    force_encoding("ASCII-8BIT") if self.respond_to?(:force_encoding)
+    self
+  end
+
+#++
+# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
+#
+# Under 1.8, this is a NOOP. Under 1.9, it sets the receiver's encoding to "UTF-8" in place. Either way, the receiver itself is returned.
+#--
+  def as_utf8
+    force_encoding("UTF-8") if self.respond_to?(:force_encoding)
+    self
+  end
+
+#++
 # Take a string, and remove any invalid substrings, returning a valid utf-8 string.
 #
 # :call-seq:
@ -11,12 +31,16 @@ class String
 #--
   def purify
     text = check_ncrs
-     text.split(//u).grep(UTF8_REGEX).join
+     if text.respond_to?(:encoding)
+       text.split(//).collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8
+     else
+       text.split(//u).grep(UTF8_REGEX).join
+     end
   end

  def check_ncrs
-    text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') =~ UTF8_REGEX ? m : '' }
-    text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*') =~ UTF8_REGEX ? m : '' }
+    text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
+    text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
  end

  UTF8_REGEX = /\A(
@ -42,7 +66,7 @@ class String
 #--
   def is_utf8?
     #expand NCRs to utf-8
-     text = self.check_ncrs
+     text = self.check_ncrs.as_bytes
     
     # You might think this is faster, but it isn't
     #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)