# instiki/lib/sanitize.rb

# == Introduction
#
# This module provides sanitization of XHTML+MathML+SVG 
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers.
#
#  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
#  sanitize_html() is a case-insensitive sanitizer suitable for HTML
#  sanitize_rexml() sanitizes a REXML tree, returning a string
#  safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
#                        by running the output of sanitize_xhtml() through REXML
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# ==  License
#
# Ruby License

module Sanitize

  require 'html5/html5parser'
  require 'html5/liberalxmlparser'
  require 'html5/treewalkers'
  require 'html5/treebuilders'
  require 'html5/serializer'
  require 'html5/sanitizer'
  require 'stringsupport.rb'

  include HTML5

# Sanitize a string, parsed using XHTML parsing rules.
#
# :call-seq:
#    sanitize_xhtml(string)                    -> string
#    sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
#
# Recognised options (keys may be strings or symbols; any other key is ignored):
#    :encoding     encoding of the input string (default 'utf-8')
#    :treebuilder  name of a tree builder, looked up with TreeBuilders.get_tree_builder
#                  (default TreeBuilders::REXML::TreeBuilder)
#    :to_tree      when true, return the parsed tree instead of a string (default false)
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
#
# Note that the resolved settings are stored on the receiver in @encoding, @treebuilder
# and @to_tree, and are reset at the start of every call.
  def sanitize_xhtml(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
    @to_tree = false
    options.each do |name, value|
      case name.to_s
      when 'encoding'
        @encoding = value
      when 'to_tree'
        @to_tree = value
      when 'treebuilder'
        # Resolve through the same TreeBuilders namespace that supplies the default above.
        @treebuilder = TreeBuilders.get_tree_builder(value)
      end
    end
    # utf-8 input is passed through String#to_utf8, anything else through String#to_ncr
    # (both are expected to come from stringsupport; they are not defined in this file).
    source = @encoding == 'utf-8' ? html.to_utf8 : html.to_ncr
    # XHTML is case-sensitive, so element and attribute names are not lowercased.
    parsed = XHTMLParser.parse_fragment(source, {:tokenizer => HTMLSanitizer,
      :lowercase_element_name => false, :lowercase_attr_name => false,
      :encoding => @encoding, :tree => @treebuilder })
    @to_tree ? parsed : parsed.to_s
  end
  
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
#    ensure well-formedness.
#
# :call-seq:
#    safe_sanitize_xhtml(string)                    -> string
#    safe_sanitize_xhtml(string, {:encoding => 'iso-8859-1'}) -> string
#
# Accepts the same options as sanitize_xhtml(), except that :to_tree is always
# forced to false: the result is always a string. The caller's options hash is
# not modified.
#
# The sanitized output is wrapped in an XHTML <div>, parsed with REXML and
# serialized again. If REXML rejects it, the HTML-escaped text is returned instead
# (via String#escapeHTML, which is not defined in this file).
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
  def safe_sanitize_xhtml(html, options = {})
    # Force string output without touching the hash the caller handed us.
    string_options = options.reject { |name, _value| name.to_s == 'to_tree' }
    string_options[:to_tree] = false
    sanitized = sanitize_xhtml(html, string_options)
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
    # REXML writes an element without children as a self-closing tag, which the
    # unwrapping pattern below cannot match; empty content is simply the empty string.
    return '' if doc.root.children.empty?
    doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    # sanitized is still nil if the exception came from sanitize_xhtml itself;
    # escaping the raw input is the safe fallback in that case.
    (sanitized || html).escapeHTML
  end

# Sanitize a string, parsed using HTML parsing rules.
#
# :call-seq:
#    sanitize_html( string )                    ->  string
#    sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) ->  REXML::Document
#
# Recognised options (keys may be strings or symbols; any other key is ignored):
#    :encoding     encoding of the input string (default 'utf-8')
#    :treebuilder  name of a tree builder, looked up with TreeBuilders.get_tree_builder
#                  (default TreeBuilders::REXML::TreeBuilder)
#    :to_tree      when true, return the parsed tree instead of a string (default false)
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
#
# Note that the resolved settings are stored on the receiver in @encoding, @treebuilder
# and @to_tree, and are reset at the start of every call.
  def sanitize_html(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
    @to_tree = false
    options.each do |name, value|
      case name.to_s
      when 'encoding'
        @encoding = value
      when 'to_tree'
        @to_tree = value
      when 'treebuilder'
        # Resolve through the same TreeBuilders namespace that supplies the default above.
        @treebuilder = TreeBuilders.get_tree_builder(value)
      end
    end
    # utf-8 input is passed through String#to_utf8, anything else through String#to_ncr
    # (both are expected to come from stringsupport; they are not defined in this file).
    source = @encoding == 'utf-8' ? html.to_utf8 : html.to_ncr
    parsed = HTMLParser.parse_fragment(source, {:tokenizer => HTMLSanitizer,
      :encoding => @encoding, :tree => @treebuilder })
    @to_tree ? parsed : parsed.to_s
  end

# Sanitize a REXML tree. The output is a string.
#
# :call-seq:
#    sanitize_rexml(tree)                    -> string
#
# The tree is walked with the 'rexml2' tree walker, and the resulting token
# stream is passed to XHTMLSerializer with sanitization switched on.
  def sanitize_rexml(tree)
    walker_class = TreeWalkers.get_tree_walker('rexml2')
    serializer_options = {
      :encoding => 'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
      :sanitize => true
    }
    XHTMLSerializer.serialize(walker_class.new(tree), serializer_options)
  end
end

require 'rexml/element'
module REXML #:nodoc:
  class Element

# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
#
#  :call-seq:
#     tree.to_ncr  -> REXML::Element
#
# Every element yielded by each_element has its text nodes and attribute values
# passed through String#to_ncr, and the method then recurses into that element.
# The receiver's own text nodes and attributes are not touched. Returns the receiver.
#
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
# access the resulting REXML document.
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_ncr instead.
#
    def to_ncr
      each_element do |child|
        child.texts.each { |text_node| text_node.value = text_node.to_s.to_ncr }
        attrs = child.attributes
        attrs.each { |key, raw| attrs[key] = raw.to_ncr }
        child.to_ncr if child.has_elements?
      end
      self
    end

# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
#
#  :call-seq:
#     tree.to_utf8  -> REXML::Element
#
# Every element yielded by each_element has its text nodes and attribute values
# passed through String#to_utf8, and the method then recurses into that element.
# The receiver's own text nodes and attributes are not touched. Returns the receiver.
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_utf8 instead.
#
    def to_utf8
      each_element do |child|
        child.texts.each { |text_node| text_node.value = text_node.to_s.to_utf8 }
        attrs = child.attributes
        attrs.each { |key, raw| attrs[key] = raw.to_utf8 }
        child.to_utf8 if child.has_elements?
      end
      self
    end

  end
end

module HTML5 #:nodoc: all
  module TreeWalkers

    private

    class << self
      # Return the tree walker class registered under +name+ (case-insensitive).
      # 'rexml' loads and returns the stock REXML walker; 'rexml2' returns the
      # walker defined below. Any other name raises a RuntimeError.
      def [](name)
        key = name.to_s.downcase
        if key == 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        elsif key == 'rexml2'
          REXML2::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      alias_method :get_tree_walker, :[]
    end

    module REXML2
      # Tree walker over REXML nodes that runs attribute values and text through
      # String#to_utf8 while describing each node.
      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

        private

        # Describe +node+ as an array whose first entry is the node kind
        # (or nil for an XML declaration), followed by kind-specific details.
        def node_details(node)
          # Order matters: REXML::Document is a subclass of REXML::Element.
          case node
          when ::REXML::Document then [:DOCUMENT]
          when ::REXML::Element
            if node.name
              [:ELEMENT,
                node.name,
                node.attributes.map { |attr_name, attr_value| [attr_name, attr_value.to_utf8] },
                node.has_elements? || node.has_text?]
            else
              [:DOCUMENT_FRAGMENT]
            end
          when ::REXML::Text    then [:TEXT, node.value.to_utf8]
          when ::REXML::Comment then [:COMMENT, node.string]
          when ::REXML::DocType then [:DOCTYPE, node.name, node.public, node.system]
          when ::REXML::XMLDecl then [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+, or nil when it has none.
        def first_child(node)
          node.children.first
        end

        # Sibling immediately following +node+.
        def next_sibling(node)
          node.next_sibling
        end

        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# == Introduction`
			`#`
HTML5lib is Back. Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer. 2007-05-30 17:45:52 +02:00			`# This module provides sanitization of XHTML+MathML+SVG`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00			`#`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should`
HTML5lib is Back. Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer. 2007-05-30 17:45:52 +02:00			`# resemble that of browsers.`
			`#`
			`# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML`
			`# sanitize_html() is a case-insensitive sanitizer suitable for HTML`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# sanitize_rexml() sanitizes a REXML tree, returning a string`
Better Put the "safe" XHTML sanitization in lib/santize.rb, rather than in lib/chunks/nowiki.rb. D'oh! 2008-12-01 17:29:46 +01:00			`# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML`
			`# by running the output of sanitize_xhtml() through REXML`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`#`
			`# == Files`
			`#`
			`# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],`
			`# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]`
			`#`
			`# == Author`
			`#`
			`# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]`
			`#`
			`# == License`
			`#`
			`# Ruby License`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`module Sanitize`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`require 'html5/html5parser'`
			`require 'html5/liberalxmlparser'`
			`require 'html5/treewalkers'`
			`require 'html5/treebuilders'`
			`require 'html5/serializer'`
			`require 'html5/sanitizer'`
Sync with Latest itex2MML and MathML::Entities Support the latest changes in http://www.w3.org/TR/2009/WD-xml-entity-names-20091117/ 2009-11-18 19:04:07 +01:00			`require 'stringsupport.rb'`
REXML Trees Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees. 2007-06-05 23:34:49 +02:00
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`include HTML5`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`# Sanitize a string, parsed using XHTML parsing rules.`
			`#`
			`# :call-seq:`
			`# sanitize_xhtml(string) -> string`
			`# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document`
			`#`
			`# Unless otherwise specified, the string is assumed to be utf-8 encoded.`
			`# By default, the output is a string. But, optionally, you can return a REXML tree.`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`#`
Updated to Latest HTML5lib Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer. 2007-06-09 00:26:00 +02:00			`# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.`
			`# (REXML trees are always utf-8 encoded.)`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`def sanitize_xhtml(html, options = {})`
			`@encoding = 'utf-8'`
			`@treebuilder = TreeBuilders::REXML::TreeBuilder`
			`@to_tree = false`
			`options.each do \|name, value\|`
			`next unless %w(encoding treebuilder to_tree).include? name.to_s`
			`if name.to_s == 'treebuilder'`
Sanitizer Fix Whoops! Looks like Ryan changed the API for the HTML5 sanitizer. Bad, bad, bad. Fixed now. 2007-08-30 23:06:20 +02:00			`@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`else`
			`instance_variable_set("@#{name}", value)`
			`end`
			`end`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00			`if @encoding == 'utf-8'`
			`parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,`
			`:lowercase_element_name => false, :lowercase_attr_name => false,`
			`:encoding => @encoding, :tree => @treebuilder })`
			`else`
			`parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,`
			`:lowercase_element_name => false, :lowercase_attr_name => false,`
			`:encoding => @encoding, :tree => @treebuilder })`
			`end`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`return parsed if @to_tree`
			`return parsed.to_s`
HTML5lib is Back. Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer. 2007-05-30 17:45:52 +02:00			`end`
Better Put the "safe" XHTML sanitization in lib/santize.rb, rather than in lib/chunks/nowiki.rb. D'oh! 2008-12-01 17:29:46 +01:00
			`# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to`
			`# ensure well-formedness.`
			`#`
			`# :call-seq:`
			`# safe_sanitize_xhtml(string) -> string`
			`#`
			`# Unless otherwise specified, the string is assumed to be utf-8 encoded.`
			`#`
			`# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.`
			`# (REXML trees are always utf-8 encoded.)`
			`def safe_sanitize_xhtml(html, options = {})`
			`options[:to_tree] = false`
			`sanitized = sanitize_xhtml(html, options)`
			`doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")`
			`sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')`
			`rescue REXML::ParseException`
			`sanitized = sanitized.escapeHTML`
			`end`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`# Sanitize a string, parsed using HTML parsing rules.`
			`#`
			`# :call-seq:`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# sanitize_html( string ) -> string`
			`# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`#`
			`# Unless otherwise specified, the string is assumed to be utf-8 encoded.`
			`# By default, the output is a string. But, optionally, you can return a REXML tree.`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`#`
Updated to Latest HTML5lib Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer. 2007-06-09 00:26:00 +02:00			`# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.`
			`# (REXML trees are always utf-8 encoded.)`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`def sanitize_html(html, options = {})`
			`@encoding = 'utf-8'`
			`@treebuilder = TreeBuilders::REXML::TreeBuilder`
			`@to_tree = false`
			`options.each do \|name, value\|`
			`next unless %w(encoding treebuilder to_tree).include? name.to_s`
			`if name.to_s == 'treebuilder'`
Sanitizer Fix Whoops! Looks like Ryan changed the API for the HTML5 sanitizer. Bad, bad, bad. Fixed now. 2007-08-30 23:06:20 +02:00			`@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`else`
			`instance_variable_set("@#{name}", value)`
			`end`
			`end`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00			`if @encoding == 'utf-8'`
			`parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,`
			`:encoding => @encoding, :tree => @treebuilder })`
			`else`
			`parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,`
			`:encoding => @encoding, :tree => @treebuilder })`
			`end`
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`return parsed if @to_tree`
			`return parsed.to_s`
HTML5lib is Back. Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer. 2007-05-30 17:45:52 +02:00			`end`
Finally! XSS-protection, done right. If you want something done right, ... 2007-02-22 08:06:53 +01:00
Enhancements to sanitize.rb Options, options, ... options. 2007-06-08 08:23:09 +02:00			`# Sanitize a REXML tree. The output is a string.`
			`#`
			`# :call-seq:`
			`# sanitize_rexml(tree) -> string`
			`#`
REXML Trees Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees. 2007-06-05 23:34:49 +02:00			`def sanitize_rexml(tree)`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)`
Use XHTMLSerializer, where appropriate. 2007-07-05 01:53:03 +02:00			`XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',`
Bugfix Me stoopid. 2007-06-06 01:06:26 +02:00			`:space_before_trailing_solidus => true,`
			`:inject_meta_charset => false,`
			`:sanitize => true})`
REXML Trees Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees. 2007-06-05 23:34:49 +02:00			`end`
HTML5lib is Back. Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer. 2007-05-30 17:45:52 +02:00			`end`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00
			`require 'rexml/element'`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`module REXML #:nodoc:`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`class Element`

			`# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References`
			`#`
			`# :call-seq:`
Sanitizer API documentation now online See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/ 2007-06-09 06:51:30 +02:00			`# tree.to_ncr -> REXML::Element`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`#`
			`# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you`
			`# access the resulting REXML document.`
Update RDOC documentation. Update the documentation for sanitize.rb, to match current behaviour. 2007-10-15 05:22:18 +02:00			`#`
			`# Note that this method needs to traverse the entire tree, converting text nodes and attributes`
			`# for each element. This can be SLOW. It will often be faster to serialize to a string and then`
			`# use String.to_ncr instead.`
			`#`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`def to_ncr`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`self.each_element { \|el\|`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`el.texts.each_index {\|i\|`
			`el.texts[i].value = el.texts[i].to_s.to_ncr`
			`}`
			`el.attributes.each { \|name,val\|`
			`el.attributes[name] = val.to_ncr`
			`}`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`el.to_ncr if el.has_elements?`
Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`}`
			`return self`
			`end`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00
			`# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8`
			`#`
			`# :call-seq:`
			`# tree.to_utf8 -> REXML::Element`
Update RDOC documentation. Update the documentation for sanitize.rb, to match current behaviour. 2007-10-15 05:22:18 +02:00			`#`
			`# Note that this method needs to traverse the entire tree, converting text nodes and attributes`
			`# for each element. This can be SLOW. It will often be faster to serialize to a string and then`
			`# use String.to_utf8 instead.`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00			`#`
			`def to_utf8`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`self.each_element { \|el\|`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00			`el.texts.each_index {\|i\|`
			`el.texts[i].value = el.texts[i].to_s.to_utf8`
			`}`
			`el.attributes.each { \|name,val\|`
			`el.attributes[name] = val.to_utf8`
			`}`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`el.to_utf8 if el.has_elements?`
Performance My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string. 2007-10-13 23:32:04 +02:00			`}`
			`return self`
			`end`

Consolidation Shuffled around a couple of files. 2007-06-09 05:39:37 +02:00			`end`
			`end`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00
Update RDOC documentation. Update the documentation for sanitize.rb, to match current behaviour. 2007-10-15 05:22:18 +02:00			`module HTML5 #:nodoc: all`
Performance OK. This is a better way: define a custom TreeWalker which converts named entities to utf-8 as it goes. This avoids having to do an extra tree traversal in sanitize_rexml, AND avoids the trainwreck that is html5/inputstream.rb. 2007-10-15 04:07:46 +02:00			`module TreeWalkers`

			`private`

			`class << self`
			`def [](name)`
			`case name.to_s.downcase`
			`when 'rexml'`
			`require 'html5/treewalkers/rexml'`
			`REXML::TreeWalker`
			`when 'rexml2'`
			`REXML2::TreeWalker`
			`else`
			`raise "Unknown TreeWalker #{name}"`
			`end`
			`end`

			`alias :get_tree_walker :[]`
			`end`

			`module REXML2`
			`class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker`

			`private`

			`def node_details(node)`
			`case node`
			`when ::REXML::Document`
			`[:DOCUMENT]`
			`when ::REXML::Element`
			`if !node.name`
			`[:DOCUMENT_FRAGMENT]`
			`else`
			`[:ELEMENT, node.name,`
			`node.attributes.map {\|name,value\| [name,value.to_utf8]},`
			`node.has_elements? \|\| node.has_text?]`
			`end`
			`when ::REXML::Text`
			`[:TEXT, node.value.to_utf8]`
			`when ::REXML::Comment`
			`[:COMMENT, node.string]`
			`when ::REXML::DocType`
			`[:DOCTYPE, node.name, node.public, node.system]`
			`when ::REXML::XMLDecl`
			`[nil]`
			`else`
			`[:UNKNOWN, node.class.inspect]`
			`end`
			`end`

			`def first_child(node)`
			`node.children.first`
			`end`

			`def next_sibling(node)`
			`node.next_sibling`
			`end`

			`def parent(node)`
			`node.parent`
			`end`
			`end`
			`end`
			`end`
			`end`