REXML Trees

Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
2007-06-05 16:34:49 -05:00 · 2007-06-05 16:34:49 -05:00 · bd8ba1f4b1
commit bd8ba1f4b1
parent 4dd70af5ae
28 changed files with 1317 additions and 112 deletions
--- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@ -148,6 +148,26 @@ module HTML5lib
      input
  ]

+  # Boolean attribute names, keyed by element name; the :global entry is
+  # checked for every element. HTMLSerializer consults this table to decide
+  # whether an attribute may be written without a value when
+  # minimize_boolean_attributes is enabled.
+  BOOLEAN_ATTRIBUTES = {
+    :global => %w[irrelevant],
+    'style' => %w[scoped],
+    'img' => %w[ismap],
+    'audio' => %w[autoplay controls],
+    'video' => %w[autoplay controls],
+    'script' => %w[defer async],
+    'details' => %w[open],
+    'datagrid' => %w[multiple disabled],
+    'command' => %w[hidden disabled checked default],
+    'menu' => %w[autosubmit],
+    'fieldset' => %w[disabled readonly],
+    'option' => %w[disabled readonly selected],
+    'optgroup' => %w[disabled readonly],
+    'button' => %w[disabled autofocus],
+    'input' => %w[disabled readonly required autofocus checked ismap],
+    'select' => %w[disabled readonly autofocus multiple],
+    'output' => %w[disabled readonly]
+  }
+
  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  ENTITIES_WINDOWS1252 = [
      8364,  # 0x80  0x20AC  EURO SIGN
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
@ -37,13 +37,13 @@ module HTML5lib
    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
-    # html5lib.treebuilders.getTreeBuilder(treeType)
+    # HTML5lib::TreeBuilders[treeType]
    def initialize(options = {})
      @strict = false
      @errors = []
     
      @tokenizer =  HTMLTokenizer
-      @tree = TreeBuilders::REXMLTree::TreeBuilder
+      @tree = TreeBuilders::REXML::TreeBuilder
 
      options.each { |name, value| instance_variable_set("@#{name}", value) }

--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
@ -107,4 +107,4 @@ module HTML5lib
    end

  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
@ -153,4 +153,4 @@ module HTML5lib
    end

  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -58,7 +58,7 @@ module HTML5lib
      unless @char_encoding == 'utf-8'
        begin
          require 'iconv'
-          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
+          uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
        rescue
        end
      end
@ -95,11 +95,13 @@ module HTML5lib
      #First look for a BOM
      #This will also read past the BOM if present
      encoding = detect_bom
+
      #If there is no BOM need to look for meta elements with encoding 
      #information
      if encoding.nil? and @parse_meta
        encoding = detect_encoding_meta
      end
+
      #Guess with chardet, if available
      if encoding.nil? and @chardet
        begin
@ -111,13 +113,14 @@ module HTML5lib
        rescue LoadError
        end
      end
+
      # If all else fails use the default encoding
      if encoding.nil?
        encoding = @DEFAULT_ENCODING
      end
    
      #Substitute for equivalent encodings:
-      encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
+      encoding_sub = {'iso-8859-1' => 'windows-1252'}

      if encoding_sub.has_key?(encoding.downcase)
        encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
    def detect_bom
      bom_dict = {
        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf-16-le',
-        "\xfe\xff" => 'utf-16-be',
-        "\xff\xfe\x00\x00" => 'utf-32-le',
-        "\x00\x00\xfe\xff" => 'utf-32-be'
+        "\xff\xfe" => 'utf16le',
+        "\xfe\xff" => 'utf16be',
+        "\xff\xfe\x00\x00" => 'utf32le',
+        "\x00\x00\xfe\xff" => 'utf32be'
      }

      # Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
      else
        begin
          @tell += 1
-          return @data_stream[@tell - 1].chr
+          c = @data_stream[@tell - 1]
+          case c
+          when 0xC2 .. 0xDF
+            @tell += 1
+            c.chr + @data_stream[@tell-1].chr
+          when 0xE0 .. 0xF0
+            @tell += 2
+            c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
+          else
+            c.chr
+          end
        rescue
          return :EOF
        end
@ -227,8 +240,8 @@ module HTML5lib
          else
            # Then the rest
            begin
-              char_stack.push(@data_stream[@tell].chr)
              @tell += 1
+              char_stack.push(@data_stream[@tell-1].chr)
            rescue
              char_stack.push(:EOF)
              break
--- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
@ -1,4 +1,3 @@
-require 'html5lib/tokenizer'
 require 'cgi'

 module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
 # This module provides sanitization of XHTML+MathML+SVG
 # and of inline style attributes.

-  class HTMLSanitizer < HTMLTokenizer
+   module HTMLSanitizeModule

    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
      button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

-    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
-    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
-    # attributes are parsed, and a restricted set, # specified by
-    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
-    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
-    # in ALLOWED_PROTOCOLS are allowed.
-    #
-    #   sanitize_html('<script> do_nasty_stuff() </script>')
-    #  => &lt;script> do_nasty_stuff() &lt;/script>
-    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
-    #  => <a>Click here for $100</a>
-    def each
-      super do |token|
+    def process_token(token)
        case token[:type]
        when :StartTag, :EndTag, :EmptyTag
          if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
              end
              token[:data] = attrs.map {|k,v| [k,v]}
            end
-            yield token
+            return token
          else
            if token[:type] == :EndTag
              token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
            token[:data].insert(-2,'/') if token[:type] == :EmptyTag
            token[:type] = :Characters
            token.delete(:name)
-            yield token
+            return token
          end
        else
-          yield token
+          return token
        end
-      end
    end

    def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
      style = clean.join(' ')
    end
  end
+
+  class HTMLSanitizeFilter < Filter
+    include HTMLSanitizeModule
+    def each
+      @source.each do |token|
+        yield(process_token(token))
+      end
+    end
+  end
+
+  class HTMLSanitizer < HTMLTokenizer
+    include HTMLSanitizeModule
+    def each
+      super do |token|
+        yield(process_token(token))
+      end
+    end
+  end
+
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
@ -0,0 +1,418 @@
+require 'html5lib/constants'
+require 'jcode'
+
+module HTML5lib
+
+# Base class for token-stream filters: remembers the wrapped source and mixes
+# in Enumerable. The subclasses in this library provide #each by iterating
+# over @source.
+class Filter
+    include Enumerable
+    # source - the object whose tokens the filter will process (stored in
+    #          @source)
+    def initialize(source)
+        @source = source
+    end
+end
+
+class OptionalTagFilter < Filter
+    def slider
+        previous1 = previous2 = nil
+        @source.each do |token|
+            yield previous2, previous1, token if previous1 != nil
+            previous2 = previous1
+            previous1 = token
+        end
+        yield previous2, previous1, nil
+    end
+
+    def each
+        slider do |previous, token, nexttok|
+            type = token[:type]
+            if type == :StartTag
+                yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
+            elsif type == :EndTag
+                yield token unless is_optional_end(token[:name], nexttok)
+            else
+                yield token
+            end
+        end
+    end
+
+    # Decides whether the start tag of element +tagname+ may be left out.
+    #
+    # tagname  - name of the element whose start tag is being considered
+    # previous - the token preceding the start tag in the source stream
+    #            (nil at the beginning)
+    # nexttok  - the token following the start tag (nil at the end)
+    #
+    # Returns true when the tag may be omitted, false otherwise. Only html,
+    # head, body, colgroup and tbody can ever yield true.
+    def is_optional_start(tagname, previous, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if tagname == 'html'
+            # An html element's start tag may be omitted if the first thing
+            # inside the html element is not a space character or a comment.
+            return ![:Comment, :SpaceCharacters].include?(type)
+        elsif tagname == 'head'
+            # A head element's start tag may be omitted if the first thing
+            # inside the head element is an element.
+            return type == :StartTag
+        elsif tagname == 'body'
+            # A body element's start tag may be omitted if the first thing
+            # inside the body element is not a space character or a comment,
+            # except if the first thing inside the body element is a script
+            # or style element and the node immediately preceding the body
+            # element is a head element whose end tag has been omitted.
+            if [:Comment, :SpaceCharacters].include?(type)
+                return false
+            elsif type == :StartTag
+                # XXX: we do not look at the preceding event, so we never omit
+                # the body element's start tag if it's followed by a script or
+                # a style element.
+                return !%w[script style].include?(nexttok[:name])
+            else
+                return true
+            end
+        elsif tagname == 'colgroup'
+            # A colgroup element's start tag may be omitted if the first thing
+            # inside the colgroup element is a col element, and if the element
+            # is not immediately preceded by another colgroup element whose
+            # end tag has been omitted.
+            if type == :StartTag
+                # XXX: we do not look at the preceding event, so instead we never
+                # omit the colgroup element's end tag when it is immediately
+                # followed by another colgroup element. See is_optional_end.
+                return nexttok[:name] == "col"
+            else
+                return false
+            end
+        elsif tagname == 'tbody'
+            # A tbody element's start tag may be omitted if the first thing
+            # inside the tbody element is a tr element, and if the element is
+            # not immediately preceded by a tbody, thead, or tfoot element
+            # whose end tag has been omitted.
+            if type == :StartTag
+                # omit the thead and tfoot elements' end tag when they are
+                # immediately followed by a tbody element. See is_optional_end.
+                # Consequently the tbody start tag must be kept whenever the
+                # preceding token is one of those end tags.
+                if previous and previous[:type] == :EndTag and \
+                  %w(tbody thead tfoot).include?(previous[:name])
+                    return false
+                end
+
+                return nexttok[:name] == 'tr'
+            else
+                return false
+            end
+        end
+        return false
+    end
+
+    # Decides whether the end tag of element +tagname+ may be left out.
+    #
+    # tagname - name of the element whose end tag is being considered
+    # nexttok - the token following the end tag (nil when the stream ends,
+    #           which is treated as "no more content in the parent")
+    #
+    # Returns true when the tag may be omitted, false otherwise (including
+    # for every element name not listed below).
+    def is_optional_end(tagname, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if %w[html head body].include?(tagname)
+            # An html element's end tag may be omitted if the html element
+            # is not immediately followed by a space character or a comment.
+            # The same test is applied here to head and body.
+            return ![:Comment, :SpaceCharacters].include?(type)
+        elsif %w[li optgroup option tr].include?(tagname)
+            # A li element's end tag may be omitted if the li element is
+            # immediately followed by another li element or if there is
+            # no more content in the parent element.
+            # An optgroup element's end tag may be omitted if the optgroup
+            # element is immediately followed by another optgroup element,
+            # or if there is no more content in the parent element.
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if there is no more content in the parent element.
+            # A tr element's end tag may be omitted if the tr element is
+            # immediately followed by another tr element, or if there is
+            # no more content in the parent element.
+            if type == :StartTag
+                return nexttok[:name] == tagname
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif %w(dt dd).include?(tagname)
+            # A dt element's end tag may be omitted if the dt element is
+            # immediately followed by another dt element or a dd element.
+            # A dd element's end tag may be omitted if the dd element is
+            # immediately followed by another dd element or a dt element,
+            # or if there is no more content in the parent element.
+            if type == :StartTag
+                return %w(dt dd).include?(nexttok[:name])
+            elsif tagname == 'dd'
+                return type == :EndTag || type == nil
+            else
+                return false
+            end
+        elsif tagname == 'p'
+            # A p element's end tag may be omitted if the p element is
+            # immediately followed by an address, blockquote, dl, fieldset,
+            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+            # or ul  element, or if there is no more content in the parent
+            # element.
+            if type == :StartTag
+                return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
+                    h6 hr menu ol p pre table ul).include?(nexttok[:name])
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif tagname == 'colgroup'
+            # A colgroup element's end tag may be omitted if the colgroup
+            # element is not immediately followed by a space character or
+            # a comment.
+            if [:Comment, :SpaceCharacters].include?(type)
+                return false
+            elsif type == :StartTag
+                # XXX: we also look for an immediately following colgroup
+                # element. See is_optional_start.
+                return nexttok[:name] != 'colgroup'
+            else
+                return true
+            end
+        elsif %w(thead tbody).include? tagname
+            # A thead element's end tag may be omitted if the thead element
+            # is immediately followed by a tbody or tfoot element.
+            # A tbody element's end tag may be omitted if the tbody element
+            # is immediately followed by a tbody or tfoot element, or if
+            # there is no more content in the parent element.
+            # NOTE: the end tag is omitted even when a tbody follows;
+            # is_optional_start compensates by keeping the start tag of a
+            # tbody that comes right after a thead/tbody/tfoot end tag.
+            if type == :StartTag
+                return %w(tbody tfoot).include?(nexttok[:name])
+            elsif tagname == 'tbody'
+                return (type == :EndTag or type == nil)
+            else
+                return false
+            end
+        elsif tagname == 'tfoot'
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # NOTE: as for thead/tbody above, the following tbody keeps its
+            # start tag (see is_optional_start), so omitting this end tag is
+            # safe.
+            if type == :StartTag
+                return nexttok[:name] == 'tbody'
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif %w(td th).include? tagname
+            # A td element's end tag may be omitted if the td element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            # A th element's end tag may be omitted if the th element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            if type == :StartTag
+                return %w(td th).include?(nexttok[:name])
+            else
+                return type == :EndTag || type == nil
+            end
+        end
+        return false
+    end
+end
+
+class HTMLSerializer
+    CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
+
+    # Convenience entry point: builds a serializer configured with +options+
+    # and returns the result of serializing +stream+ with it.
+    def self.serialize(stream, options = {})
+        new(options).serialize(stream)
+    end
+
+    def initialize(options={})
+        @quote_attr_values = false
+        @quote_char = '"'
+        @use_best_quote_char = true
+        @minimize_boolean_attributes = true
+
+        @use_trailing_solidus = false
+        @space_before_trailing_solidus = true
+
+        @omit_optional_tags = true
+        @sanitize = false
+
+        @strip_whitespace = false
+
+        @inject_meta_charset = true
+
+        options.each do |name, value|
+            next unless %w(quote_attr_values quote_char use_best_quote_char
+              minimize_boolean_attributes use_trailing_solidus
+              space_before_trailing_solidus omit_optional_tags sanitize
+              strip_whitespace inject_meta_charset).include? name.to_s
+            @use_best_quote_char = false if name.to_s == 'quote_char'
+            instance_variable_set("@#{name}", value)
+        end
+
+        @errors = []
+    end
+
+    def serialize(treewalker, encoding=nil)
+        in_cdata = false
+        @errors = []
+        if encoding and @inject_meta_charset
+            treewalker = filter_inject_meta_charset(treewalker, encoding)
+        end
+        if @strip_whitespace
+            treewalker = filter_whitespace(treewalker)
+        end
+        if @sanitize
+            require 'html5lib/sanitizer'
+            treewalker = HTMLSanitizeFilter.new(treewalker)
+        end
+#        if @omit_optional_tags
+#            treewalker = OptionalTagFilter.new(treewalker)
+#        end
+
+        result = []
+        treewalker.each do |token|
+            type = token[:type]
+            if type == :Doctype
+                doctype = "<!DOCTYPE %s>" % token[:name]
+                if encoding
+                    result << doctype.encode(encoding)
+                else
+                    result << doctype
+                end
+
+            elsif [:Characters, :SpaceCharacters].include? type
+                if type == :SpaceCharacters or in_cdata
+                    if in_cdata and token[:data].find("</") >= 0
+                        serializeError(_("Unexpected </ in CDATA"))
+                    end
+                    if encoding
+                        result << token[:data].encode(encoding, errors || "strict")
+                    else
+                        result << token[:data]
+                    end
+                elsif encoding
+                    result << token[:data].replace("&", "&amp;") \
+                        .encode(encoding, unicode_encode_errors)
+                else
+                    result << token[:data] \
+                        .gsub("&", "&amp;") \
+                        .gsub("<", "&lt;")  \
+                        .gsub(">", "&gt;")
+                end
+
+            elsif [:StartTag, :EmptyTag].include? type
+                name = token[:name]
+                if CDATA_ELEMENTS.include?(name)
+                    in_cdata = true
+                elsif in_cdata
+                    serializeError(_("Unexpected child element of a CDATA element"))
+                end
+                attrs = token[:data].to_a
+                attrs.sort()
+                attributes = []
+                for k,v in attrs
+                    if encoding
+                        k = k.encode(encoding)
+                    end
+                    attributes << ' '
+
+                    attributes << k
+                    if not @minimize_boolean_attributes or \
+                      (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
+                      and !BOOLEAN_ATTRIBUTES[:global].include?(k))
+                        attributes << "="
+                        if @quote_attr_values or v.empty?
+                            quote_attr = true
+                        else
+                            quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? {|c| v.include?(c)}
+                        end
+                        v = v.gsub("&", "&amp;")
+                        if encoding
+                            v = v.encode(encoding, unicode_encode_errors)
+                        end
+                        if quote_attr
+                            quote_char = @quote_char
+                            if @use_best_quote_char
+                                if v.index("'") and !v.index('"')
+                                    quote_char = '"'
+                                elsif v.index('"') and !v.index("'")
+                                    quote_char = "'"
+                                end
+                            end
+                            if quote_char == "'"
+                                v = v.gsub("'", "&#39;")
+                            else
+                                v = v.gsub('"', "&quot;")
+                            end
+                            attributes << quote_char << v << quote_char
+                        else
+                            attributes << v
+                        end
+                    end
+                end
+                if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
+                    if @space_before_trailing_solidus
+                        attributes << " /"
+                    else
+                        attributes << "/"
+                    end
+                end
+                if encoding
+                    result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
+                else
+                    result << "<%s%s>" % [name, attributes.join('')]
+                end
+
+            elsif type == :EndTag
+                name = token[:name]
+                if CDATA_ELEMENTS.include?(name)
+                    in_cdata = false
+                elsif in_cdata
+                    serializeError(_("Unexpected child element of a CDATA element"))
+                end
+                end_tag = "</%s>" % name
+                if encoding
+                    end_tag = end_tag.encode(encoding)
+                end
+                result << end_tag
+
+            elsif type == :Comment
+                data = token[:data]
+                if data.index("--")
+                    serializeError(_("Comment contains --"))
+                end
+                comment = "<!--%s-->" % token[:data]
+                if encoding
+                    comment = comment.encode(encoding, unicode_encode_errors)
+                end
+                result << comment
+
+            else
+                serializeError(token[:data])
+            end
+        end
+        result.join('')
+    end
+
+    def render(treewalker, encoding=nil)
+        if encoding
+            return "".join(list(serialize(treewalker, encoding)))
+        else
+            return "".join(list(serialize(treewalker)))
+        end
+    end
+
+    def serializeError(data="XXX ERROR MESSAGE NEEDED")
+        # XXX The idea is to make data mandatory.
+        @errors.push(data)
+        if @strict
+            raise SerializeError
+        end
+    end
+
+    def filter_inject_meta_charset(treewalker, encoding)
+        done = false
+        for token in treewalker
+            if not done and token[:type] == :StartTag \
+              and token[:name].lower() == "head"
+                yield({:type => :EmptyTag, :name => "meta", \
+                    :data => {"charset" => encoding}})
+            end
+            yield token
+        end
+    end
+
+    # Placeholder for the whitespace-stripping filter. It is not implemented,
+    # so serialize raises NotImplementedError when strip_whitespace is on.
+    def filter_whitespace(treewalker)
+        raise NotImplementedError
+    end
+end
+
+# Error in serialized tree
+class SerializeError < Exception
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
@ -1,21 +1,24 @@
 module HTML5lib
  module TreeBuilders

-    def self.getTreeBuilder(name)
-      case name.to_s.downcase
+    class << self
+      def [](name)
+        case name.to_s.downcase
        when 'simpletree' then
          require 'html5lib/treebuilders/simpletree'
          SimpleTree::TreeBuilder
        when 'rexml' then
          require 'html5lib/treebuilders/rexml'
-          REXMLTree::TreeBuilder
+          REXML::TreeBuilder
        when 'hpricot' then
          require 'html5lib/treebuilders/hpricot'
          Hpricot::TreeBuilder
        else
          raise "Unknown TreeBuilder #{name}"
+        end
      end
-    end

+      alias :getTreeBuilder :[]
+    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
@ -144,7 +144,7 @@ module HTML5lib
          # code. It should still do the same though.

          # Step 1: stop the algorithm when there's nothing to do.
-          return unless @activeFormattingElements
+          return if @activeFormattingElements.empty?

          # Step 2 and step 3: we start with the last element. So i is -1.
          i = -1
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
@ -1,4 +1,5 @@
 require 'html5lib/treebuilders/base'
+require 'rubygems'
 require 'hpricot'
 require 'forwardable'

@ -26,12 +27,14 @@ module HTML5lib
            childNodes << node
            hpricot.children << node.hpricot
          end
+          node.hpricot.parent = hpricot
          node.parent = self
        end

        def removeChild(node)
           childNodes.delete(node)
           hpricot.children.delete_at(hpricot.children.index(node.hpricot))
+           node.hpricot.parent = nil
           node.parent = nil
        end

@ -48,6 +51,7 @@ module HTML5lib
          if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
            childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
          else
+            refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
            childNodes.insert(index, node)
          end
        end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
@ -4,7 +4,7 @@ require 'forwardable'

 module HTML5lib
  module TreeBuilders
-    module REXMLTree
+    module REXML

      class Node < Base::Node
        extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
            childNodes[index-1].rxobj.raw = true
          else
            childNodes.insert index, node
+            refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
          end
        end

@ -62,7 +63,7 @@ module HTML5lib

      class Element < Node
        def self.rxclass
-          REXML::Element
+          ::REXML::Element
        end

        def initialize name
@ -95,7 +96,7 @@ module HTML5lib

      class Document < Node
        def self.rxclass
-          REXML::Document
+          ::REXML::Document
        end

        def initialize
@ -120,7 +121,7 @@ module HTML5lib

      class DocumentType < Node
        def self.rxclass
-          REXML::DocType
+          ::REXML::DocType
        end

        def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
      class TextNode < Node
        def initialize data
          raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
-          @rxobj = REXML::Text.new(raw, true, nil, true)
+          @rxobj = ::REXML::Text.new(raw, true, nil, true)
        end

        def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib

      class CommentNode < Node
        def self.rxclass
-          REXML::Comment
+          ::REXML::Comment
        end

        def printTree indent=0
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
@ -0,0 +1,26 @@
+require 'html5lib/treewalkers/base'
+
+module HTML5lib
+  module TreeWalkers
+
+    class << self
+      def [](name)
+        case name.to_s.downcase
+        when 'simpletree' then
+          require 'html5lib/treewalkers/simpletree'
+          SimpleTree::TreeWalker
+        when 'rexml' then
+          require 'html5lib/treewalkers/rexml'
+          REXML::TreeWalker
+        when 'hpricot' then
+          require 'html5lib/treewalkers/hpricot'
+          Hpricot::TreeWalker
+        else
+          raise "Unknown TreeWalker #{name}"
+        end
+      end
+
+      alias :getTreeWalker :[]
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
@ -0,0 +1,156 @@
+require 'html5lib/constants'
+module HTML5lib
+module TreeWalkers
+
+module TokenConstructor
+    def error(msg)
+        return {:type => "SerializeError", :data => msg}
+    end
+
+    def normalizeAttrs(attrs)
+        attrs.to_a
+    end
+
+    def emptyTag(name, attrs, hasChildren=false)
+        error(_("Void element has children")) if hasChildren
+        return({:type => :EmptyTag, :name => name, \
+                :data => normalizeAttrs(attrs)})
+    end
+
+    def startTag(name, attrs)
+        return {:type => :StartTag, :name => name, \
+                 :data => normalizeAttrs(attrs)}
+    end
+
+    def endTag(name)
+        return {:type => :EndTag, :name => name, :data => []}
+    end
+
+    def text(data)
+        if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
+          yield({:type => :SpaceCharacters, :data => $1})
+          data = data[$1.length .. -1]
+          return if data.empty?
+        end
+
+        if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
+          yield({:type => :Characters, :data => data[0 ... -$1.length]})
+          yield({:type => :SpaceCharacters, :data => $1})
+        else
+          yield({:type => :Characters, :data => data})
+        end
+    end
+
+    def comment(data)
+        return {:type => :Comment, :data => data}
+    end
+
+    def doctype(name)
+        return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
+    end
+
+    def unknown(nodeType)
+        return error(_("Unknown node type: ") + nodeType.to_s)
+    end
+
+    def _(str)
+      str
+    end
+end
+
+class Base
+    include TokenConstructor
+
+    def initialize(tree)
+        @tree = tree
+    end
+
+    def each
+        raise NotImplementedError
+    end
+
+    alias walk each
+end
+
+# Tree walker that traverses @tree iteratively (no recursion) and yields the
+# corresponding tokens. Subclasses adapt it to a concrete tree library by
+# implementing node_details, first_child, next_sibling and parent.
+class NonRecursiveTreeWalker < TreeWalkers::Base
+    # Returns an Array describing +node+: its first element is the node kind
+    # (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT,
+    # nil to ignore the node, or anything else for an unknown node) followed
+    # by the kind-specific details consumed in #each.
+    def node_details(node)
+        raise NotImplementedError
+    end
+
+    # Returns the first child of +node+, or nil.
+    def first_child(node)
+        raise NotImplementedError
+    end
+
+    # Returns the sibling following +node+, or nil.
+    def next_sibling(node)
+        raise NotImplementedError
+    end
+
+    # Returns the parent of +node+.
+    def parent(node)
+        raise NotImplementedError
+    end
+
+    # Walks the tree rooted at @tree in document order, yielding one token
+    # per node on the way down and an end-tag token for every non-void
+    # element on the way back up.
+    def each
+        currentNode = @tree
+        while currentNode != nil
+            details = node_details(currentNode)
+            hasChildren = false
+
+            # details.shift removes the node kind, leaving only its details.
+            case details.shift
+            when :DOCTYPE
+                yield doctype(*details)
+
+            when :TEXT
+                text(*details) {|token| yield token}
+
+            when :ELEMENT
+                name, attributes, hasChildren = details
+                if VOID_ELEMENTS.include?(name)
+                    yield emptyTag(name, attributes.to_a, hasChildren)
+                    # Children of a void element are never visited.
+                    hasChildren = false
+                else
+                    yield startTag(name, attributes.to_a)
+                end
+
+            when :COMMENT
+                yield comment(details[0])
+
+            when :DOCUMENT, :DOCUMENT_FRAGMENT
+                hasChildren = true
+
+            when nil
+                # ignore (REXML::XMLDecl is an example)
+
+            else
+                yield unknown(details[0])
+            end
+
+            firstChild = hasChildren ? first_child(currentNode) : nil
+            if firstChild != nil
+                # Descend.
+                currentNode = firstChild
+            else
+                # No children to visit: close this node, then move to the
+                # next sibling, climbing (and closing ancestors) until one is
+                # found or the root has been closed.
+                while currentNode != nil
+                    details = node_details(currentNode)
+                    if details.shift == :ELEMENT
+                        name, attributes, hasChildren = details
+                        yield endTag(name) if !VOID_ELEMENTS.include?(name)
+                    end
+
+                    if @tree == currentNode
+                        # Back at the root: the walk is finished.
+                        currentNode = nil
+                    else
+                        nextSibling = next_sibling(currentNode)
+                        if nextSibling != nil
+                            currentNode = nextSibling
+                            break
+                        end
+
+                        currentNode = parent(currentNode)
+                    end
+                end
+            end
+        end
+    end
+end
+
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+require 'rexml/document'
+
+module HTML5lib
+  module TreeWalkers
+    module Hpricot
+      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
+
+        def node_details(node)
+          case node
+          when ::Hpricot::Elem
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value]},
+                !node.empty?]
+            end
+          when ::Hpricot::Text
+            [:TEXT, node.to_plain_text]
+          when ::Hpricot::Comment
+            [:COMMENT, node.content]
+          when ::Hpricot::Doc
+            [:DOCUMENT]
+          when ::Hpricot::DocType
+            [:DOCTYPE, node.target]
+          when ::Hpricot::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_node
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+require 'rexml/document'
+
+module HTML5lib
+  module TreeWalkers
+    module REXML
+      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
+
+        def node_details(node)
+          case node
+          when ::REXML::Document
+            [:DOCUMENT]
+          when ::REXML::Element
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value]},
+                node.has_elements? || node.has_text?]
+            end
+          when ::REXML::Text
+            [:TEXT, node.value]
+          when ::REXML::Comment
+            [:COMMENT, node.string]
+          when ::REXML::DocType
+            [:DOCTYPE, node.name]
+          when ::REXML::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_sibling
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+
+module HTML5lib
+  module TreeWalkers
+    module SimpleTree
+      class TreeWalker < HTML5lib::TreeWalkers::Base
+        include HTML5lib::TreeBuilders::SimpleTree
+
+        def walk(node)
+          case node
+          when Document, DocumentFragment
+            return
+
+          when DocumentType
+            yield doctype(node.name)
+
+          when TextNode
+            text(node.value) {|token| yield token}
+
+          when Element
+            if VOID_ELEMENTS.include?(node.name)
+              yield emptyTag(node.name, node.attributes, node.hasContent())
+            else
+              yield startTag(node.name, node.attributes)
+              for child in node.childNodes
+                walk(child) {|token| yield token}
+              end
+              yield endTag(node.name)
+            end
+
+          when CommentNode
+            yield comment(node.value)
+
+          else
+            puts '?'
+            yield unknown(node.class)
+          end
+        end
+
+        def each
+          for child in @tree.childNodes
+            walk(child) {|node| yield node}
+          end
+        end
+      end
+    end
+  end
+end