REXML Trees

Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
2007-06-05 16:34:49 -05:00 · 2007-06-05 16:34:49 -05:00 · bd8ba1f4b1
commit bd8ba1f4b1
parent 4dd70af5ae
28 changed files with 1317 additions and 112 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -53,9 +53,10 @@ module Engines
    def mask
      require_dependency 'maruku'
      require_dependency 'maruku/ext/math'
-      html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
-            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
-      sanitize_xhtml(html.to_ncr)
+#      html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+#            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
+      html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr)
    end
  end

--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -8,19 +8,36 @@ module Sanitize
 #
 #  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
 #  sanitize_html() is a case-insensitive sanitizer suitable for HTML
+#  sanitize_rexml() sanitizes a REXML tree, returning a string


-  require 'html5lib/sanitizer'
  require 'html5lib/html5parser'
  require 'html5lib/liberalxmlparser'
+
+  require 'html5lib/treewalkers'
+  require 'html5lib/serializer'
+  require 'string_utils'
+  require 'html5lib/sanitizer'
+
  include HTML5lib

  def sanitize_xhtml(html)
-    XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
+    XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s
  end

  def sanitize_html(html)
    HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
  end

+  def sanitize_rexml(tree)
+    tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
+    HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
+      :quote_attr_values => 'true',
+      :minimize_boolean_attributes => 'false',
+      :use_trailing_solidus => 'true',
+      :space_before_trailing_solidus => 'true',
+      :omit_optional_tags => 'false',
+      :inject_meta_charset => 'false',
+      :sanitize => 'true'})
+  end
 end
--- a/lib/string_utils.rb
+++ b/lib/string_utils.rb
@ -2155,3 +2155,20 @@ class String
    end

 end
+
+require 'rexml/element'
+module REXML
+  class Element
+    def to_ncr
+      XPath.each(self, '//*') { |el|
+        el.texts.each_index  {|i|
+          el.texts[i].value = el.texts[i].to_s.to_ncr
+        }
+        el.attributes.each { |name,val|
+          el.attributes[name] = val.to_ncr
+        }
+      }
+      return self
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@ -148,6 +148,26 @@ module HTML5lib
      input
  ]

+  BOOLEAN_ATTRIBUTES = {
+    :global => %w[irrelevant],
+    'style' => %w[scoped],
+    'img' => %w[ismap],
+    'audio' => %w[autoplay controls],
+    'video' => %w[autoplay controls],
+    'script' => %w[defer async],
+    'details' => %w[open],
+    'datagrid' => %w[multiple disabled],
+    'command' => %w[hidden disabled checked default],
+    'menu' => %w[autosubmit],
+    'fieldset' => %w[disabled readonly],
+    'option' => %w[disabled readonly selected],
+    'optgroup' => %w[disabled readonly],
+    'button' => %w[disabled autofocus],
+    'input' => %w[disabled readonly required autofocus checked ismap],
+    'select' => %w[disabled readonly autofocus multiple],
+    'output' => %w[disabled readonly]
+  }
+
  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  ENTITIES_WINDOWS1252 = [
      8364,  # 0x80  0x20AC  EURO SIGN
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
@ -37,13 +37,13 @@ module HTML5lib
    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
-    # html5lib.treebuilders.getTreeBuilder(treeType)
+    # HTML5lib::TreeBuilders[treeType]
    def initialize(options = {})
      @strict = false
      @errors = []
     
      @tokenizer =  HTMLTokenizer
-      @tree = TreeBuilders::REXMLTree::TreeBuilder
+      @tree = TreeBuilders::REXML::TreeBuilder
 
      options.each { |name, value| instance_variable_set("@#{name}", value) }

--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
@ -107,4 +107,4 @@ module HTML5lib
    end

  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
@ -153,4 +153,4 @@ module HTML5lib
    end

  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -58,7 +58,7 @@ module HTML5lib
      unless @char_encoding == 'utf-8'
        begin
          require 'iconv'
-          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
+          uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
        rescue
        end
      end
@ -95,11 +95,13 @@ module HTML5lib
      #First look for a BOM
      #This will also read past the BOM if present
      encoding = detect_bom
+
      #If there is no BOM need to look for meta elements with encoding 
      #information
      if encoding.nil? and @parse_meta
        encoding = detect_encoding_meta
      end
+
      #Guess with chardet, if avaliable
      if encoding.nil? and @chardet
        begin
@ -111,13 +113,14 @@ module HTML5lib
        rescue LoadError
        end
      end
+
      # If all else fails use the default encoding
      if encoding.nil?
        encoding = @DEFAULT_ENCODING
      end
    
      #Substitute for equivalent encodings:
-      encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
+      encoding_sub = {'iso-8859-1' => 'windows-1252'}

      if encoding_sub.has_key?(encoding.downcase)
        encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
    def detect_bom
      bom_dict = {
        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf-16-le',
-        "\xfe\xff" => 'utf-16-be',
-        "\xff\xfe\x00\x00" => 'utf-32-le',
-        "\x00\x00\xfe\xff" => 'utf-32-be'
+        "\xff\xfe" => 'utf16le',
+        "\xfe\xff" => 'utf16be',
+        "\xff\xfe\x00\x00" => 'utf32le',
+        "\x00\x00\xfe\xff" => 'utf32be'
      }

      # Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
      else
        begin
          @tell += 1
-          return @data_stream[@tell - 1].chr
+          c = @data_stream[@tell - 1]
+          case c
+          when 0xC2 .. 0xDF
+            @tell += 1
+            c.chr + @data_stream[@tell-1].chr
+          when 0xE0 .. 0xF0
+            @tell += 2
+            c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
+          else
+            c.chr
+          end
        rescue
          return :EOF
        end
@ -227,8 +240,8 @@ module HTML5lib
          else
            # Then the rest
            begin
-              char_stack.push(@data_stream[@tell].chr)
              @tell += 1
+              char_stack.push(@data_stream[@tell-1].chr)
            rescue
              char_stack.push(:EOF)
              break
--- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
@ -1,4 +1,3 @@
-require 'html5lib/tokenizer'
 require 'cgi'

 module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
 # This module provides sanitization of XHTML+MathML+SVG
 # and of inline style attributes.

-  class HTMLSanitizer < HTMLTokenizer
+   module HTMLSanitizeModule

    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
      button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

-    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
-    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
-    # attributes are parsed, and a restricted set, # specified by
-    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
-    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
-    # in ALLOWED_PROTOCOLS are allowed.
-    #
-    #   sanitize_html('<script> do_nasty_stuff() </script>')
-    #  => &lt;script> do_nasty_stuff() &lt;/script>
-    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
-    #  => <a>Click here for $100</a>
-    def each
-      super do |token|
+    def process_token(token)
        case token[:type]
        when :StartTag, :EndTag, :EmptyTag
          if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
              end
              token[:data] = attrs.map {|k,v| [k,v]}
            end
-            yield token
+            return token
          else
            if token[:type] == :EndTag
              token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
            token[:data].insert(-2,'/') if token[:type] == :EmptyTag
            token[:type] = :Characters
            token.delete(:name)
-            yield token
+            return token
          end
        else
-          yield token
+          return token
        end
-      end
    end

    def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
      style = clean.join(' ')
    end
  end
+
+  class HTMLSanitizeFilter < Filter
+    include HTMLSanitizeModule
+    def each
+      @source.each do |token|
+        yield(process_token(token))
+      end
+    end
+  end
+
+  class HTMLSanitizer < HTMLTokenizer
+    include HTMLSanitizeModule
+    def each
+      super do |token|
+        yield(process_token(token))
+      end
+    end
+  end
+
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
@ -0,0 +1,418 @@
+require 'html5lib/constants'
+require 'jcode'
+
+module HTML5lib
+
+class Filter
+    include Enumerable
+    def initialize(source)
+        @source = source
+    end
+end
+
+class OptionalTagFilter < Filter
+    def slider
+        previous1 = previous2 = nil
+        @source.each do |token|
+            yield previous2, previous1, token if previous1 != nil
+            previous2 = previous1
+            previous1 = token
+        end
+        yield previous2, previous1, nil
+    end
+
+    def each
+        slider do |previous, token, nexttok|
+            type = token[:type]
+            if type == :StartTag
+                yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
+            elsif type == :EndTag
+                yield token unless is_optional_end(token[:name], nexttok)
+            else
+                yield token
+            end
+        end
+    end
+
+    def is_optional_start(tagname, previous, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if tagname == 'html'
+            # An html element's start tag may be omitted if the first thing
+            # inside the html element is not a space character or a comment.
+            return ![:Comment, :SpaceCharacters].include?(type)
+        elsif tagname == 'head'
+            # A head element's start tag may be omitted if the first thing
+            # inside the head element is an element.
+            return type == :StartTag
+        elsif tagname == 'body'
+            # A body element's start tag may be omitted if the first thing
+            # inside the body element is not a space character or a comment,
+            # except if the first thing inside the body element is a script
+            # or style element and the node immediately preceding the body
+            # element is a head element whose end tag has been omitted.
+            if [:Comment, :SpaceCharacters].include?(type)
+                return false
+            elsif type == :StartTag
+                # XXX: we do not look at the preceding event, so we never omit
+                # the body element's start tag if it's followed by a script or
+                # a style element.
+                return !%w[script style].include?(nexttok[:name])
+            else
+                return true
+            end
+        elsif tagname == 'colgroup'
+            # A colgroup element's start tag may be omitted if the first thing
+            # inside the colgroup element is a col element, and if the element
+            # is not immediately preceded by another colgroup element whose
+            # end tag has been omitted.
+            if type == :StartTag
+                # XXX: we do not look at the preceding event, so instead we never
+                # omit the colgroup element's end tag when it is immediately
+                # followed by another colgroup element. See is_optional_end.
+                return nexttok[:name] == "col"
+            else
+                return false
+            end
+        elsif tagname == 'tbody'
+            # A tbody element's start tag may be omitted if the first thing
+            # inside the tbody element is a tr element, and if the element is
+            # not immediately preceded by a tbody, thead, or tfoot element
+            # whose end tag has been omitted.
+            if type == :StartTag
+                # Keep the start tag when the previous token is a tbody, thead or
+                # tfoot end tag, which may itself be omitted. See is_optional_end.
+                if previous and previous[:type] == :EndTag and \
+                  %w(tbody thead tfoot).include?(previous[:name])
+                    return false
+                end
+
+                return nexttok[:name] == 'tr'
+            else
+                return false
+            end
+        end
+        return false
+    end
+
+    def is_optional_end(tagname, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if %w[html head body].include?(tagname)
+            # An html element's end tag may be omitted if the html element
+            # is not immediately followed by a space character or a comment.
+            return ![:Comment, :SpaceCharacters].include?(type)
+        elsif %w[li optgroup option tr].include?(tagname)
+            # A li element's end tag may be omitted if the li element is
+            # immediately followed by another li element or if there is
+            # no more content in the parent element.
+            # An optgroup element's end tag may be omitted if the optgroup
+            # element is immediately followed by another optgroup element,
+            # or if there is no more content in the parent element.
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if there is no more content in the parent element.
+            # A tr element's end tag may be omitted if the tr element is
+            # immediately followed by another tr element, or if there is
+            # no more content in the parent element.
+            if type == :StartTag
+                return nexttok[:name] == tagname
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif %w(dt dd).include?(tagname)
+            # A dt element's end tag may be omitted if the dt element is
+            # immediately followed by another dt element or a dd element.
+            # A dd element's end tag may be omitted if the dd element is
+            # immediately followed by another dd element or a dt element,
+            # or if there is no more content in the parent element.
+            if type == :StartTag
+                return %w(dt dd).include?(nexttok[:name])
+            elsif tagname == 'dd'
+                return type == :EndTag || type == nil
+            else
+                return false
+            end
+        elsif tagname == 'p'
+            # A p element's end tag may be omitted if the p element is
+            # immediately followed by an address, blockquote, dl, fieldset,
+            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+            # or ul  element, or if there is no more content in the parent
+            # element.
+            if type == :StartTag
+                return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
+                    h6 hr menu ol p pre table ul).include?(nexttok[:name])
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif tagname == 'colgroup'
+            # A colgroup element's end tag may be omitted if the colgroup
+            # element is not immediately followed by a space character or
+            # a comment.
+            if [:Comment, :SpaceCharacters].include?(type)
+                return false
+            elsif type == :StartTag
+                # XXX: we also look for an immediately following colgroup
+                # element. See is_optional_start.
+                return nexttok[:name] != 'colgroup'
+            else
+                return true
+            end
+        elsif %w(thead tbody).include? tagname
+            # A thead element's end tag may be omitted if the thead element
+            # is immediately followed by a tbody or tfoot element.
+            # A tbody element's end tag may be omitted if the tbody element
+            # is immediately followed by a tbody or tfoot element, or if
+            # there is no more content in the parent element.
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: when the following element is a tbody, its start tag is
+            # kept instead of this end tag. See is_optional_start.
+            if type == :StartTag
+                return %w(tbody tfoot).include?(nexttok[:name])
+            elsif tagname == 'tbody'
+                return (type == :EndTag or type == nil)
+            else
+                return false
+            end
+        elsif tagname == 'tfoot'
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: when the following element is a tbody, its start tag is
+            # kept instead of this end tag. See is_optional_start.
+            if type == :StartTag
+                return nexttok[:name] == 'tbody'
+            else
+                return type == :EndTag || type == nil
+            end
+        elsif %w(td th).include? tagname
+            # A td element's end tag may be omitted if the td element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            # A th element's end tag may be omitted if the th element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            if type == :StartTag
+                return %w(td th).include?(nexttok[:name])
+            else
+                return type == :EndTag || type == nil
+            end
+        end
+        return false
+    end
+end
+
+class HTMLSerializer
+    CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
+
+    def self.serialize(stream, options = {})
+        new(options).serialize(stream)
+    end
+
+    def initialize(options={})
+        @quote_attr_values = false
+        @quote_char = '"'
+        @use_best_quote_char = true
+        @minimize_boolean_attributes = true
+
+        @use_trailing_solidus = false
+        @space_before_trailing_solidus = true
+
+        @omit_optional_tags = true
+        @sanitize = false
+
+        @strip_whitespace = false
+
+        @inject_meta_charset = true
+
+        options.each do |name, value|
+            next unless %w(quote_attr_values quote_char use_best_quote_char
+              minimize_boolean_attributes use_trailing_solidus
+              space_before_trailing_solidus omit_optional_tags sanitize
+              strip_whitespace inject_meta_charset).include? name.to_s
+            @use_best_quote_char = false if name.to_s == 'quote_char'
+            instance_variable_set("@#{name}", value)
+        end
+
+        @errors = []
+    end
+
+    def serialize(treewalker, encoding=nil)
+        in_cdata = false
+        @errors = []
+        if encoding and @inject_meta_charset
+            treewalker = filter_inject_meta_charset(treewalker, encoding)
+        end
+        if @strip_whitespace
+            treewalker = filter_whitespace(treewalker)
+        end
+        if @sanitize
+            require 'html5lib/sanitizer'
+            treewalker = HTMLSanitizeFilter.new(treewalker)
+        end
+#        if @omit_optional_tags
+#            treewalker = OptionalTagFilter.new(treewalker)
+#        end
+
+        result = []
+        treewalker.each do |token|
+            type = token[:type]
+            if type == :Doctype
+                doctype = "<!DOCTYPE %s>" % token[:name]
+                if encoding
+                    result << doctype.encode(encoding)
+                else
+                    result << doctype
+                end
+
+            elsif [:Characters, :SpaceCharacters].include? type
+                if type == :SpaceCharacters or in_cdata
+                    if in_cdata and token[:data].find("</") >= 0
+                        serializeError(_("Unexpected </ in CDATA"))
+                    end
+                    if encoding
+                        result << token[:data].encode(encoding, errors || "strict")
+                    else
+                        result << token[:data]
+                    end
+                elsif encoding
+                    result << token[:data].replace("&", "&amp;") \
+                        .encode(encoding, unicode_encode_errors)
+                else
+                    result << token[:data] \
+                        .gsub("&", "&amp;") \
+                        .gsub("<", "&lt;")  \
+                        .gsub(">", "&gt;")
+                end
+
+            elsif [:StartTag, :EmptyTag].include? type
+                name = token[:name]
+                if CDATA_ELEMENTS.include?(name)
+                    in_cdata = true
+                elsif in_cdata
+                    serializeError(_("Unexpected child element of a CDATA element"))
+                end
+                attrs = token[:data].to_a
+                attrs.sort()
+                attributes = []
+                for k,v in attrs
+                    if encoding
+                        k = k.encode(encoding)
+                    end
+                    attributes << ' '
+
+                    attributes << k
+                    if not @minimize_boolean_attributes or \
+                      (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
+                      and !BOOLEAN_ATTRIBUTES[:global].include?(k))
+                        attributes << "="
+                        if @quote_attr_values or v.empty?
+                            quote_attr = true
+                        else
+                            quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? {|c| v.include?(c)}
+                        end
+                        v = v.gsub("&", "&amp;")
+                        if encoding
+                            v = v.encode(encoding, unicode_encode_errors)
+                        end
+                        if quote_attr
+                            quote_char = @quote_char
+                            if @use_best_quote_char
+                                if v.index("'") and !v.index('"')
+                                    quote_char = '"'
+                                elsif v.index('"') and !v.index("'")
+                                    quote_char = "'"
+                                end
+                            end
+                            if quote_char == "'"
+                                v = v.gsub("'", "&#39;")
+                            else
+                                v = v.gsub('"', "&quot;")
+                            end
+                            attributes << quote_char << v << quote_char
+                        else
+                            attributes << v
+                        end
+                    end
+                end
+                if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
+                    if @space_before_trailing_solidus
+                        attributes << " /"
+                    else
+                        attributes << "/"
+                    end
+                end
+                if encoding
+                    result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
+                else
+                    result << "<%s%s>" % [name, attributes.join('')]
+                end
+
+            elsif type == :EndTag
+                name = token[:name]
+                if CDATA_ELEMENTS.include?(name)
+                    in_cdata = false
+                elsif in_cdata
+                    serializeError(_("Unexpected child element of a CDATA element"))
+                end
+                end_tag = "</%s>" % name
+                if encoding
+                    end_tag = end_tag.encode(encoding)
+                end
+                result << end_tag
+
+            elsif type == :Comment
+                data = token[:data]
+                if data.index("--")
+                    serializeError(_("Comment contains --"))
+                end
+                comment = "<!--%s-->" % token[:data]
+                if encoding
+                    comment = comment.encode(encoding, unicode_encode_errors)
+                end
+                result << comment
+
+            else
+                serializeError(token[:data])
+            end
+        end
+        result.join('')
+    end
+
+    def render(treewalker, encoding=nil)
+        if encoding
+            return "".join(list(serialize(treewalker, encoding)))
+        else
+            return "".join(list(serialize(treewalker)))
+        end
+    end
+
+    def serializeError(data="XXX ERROR MESSAGE NEEDED")
+        # XXX The idea is to make data mandatory.
+        @errors.push(data)
+        if @strict
+            raise SerializeError
+        end
+    end
+
+    def filter_inject_meta_charset(treewalker, encoding)
+        done = false
+        for token in treewalker
+            if not done and token[:type] == :StartTag \
+              and token[:name].lower() == "head"
+                yield({:type => :EmptyTag, :name => "meta", \
+                    :data => {"charset" => encoding}})
+            end
+            yield token
+        end
+    end
+
+    def filter_whitespace(treewalker)
+        raise NotImplementedError
+    end
+end
+
+# Error in serialized tree
+class SerializeError < Exception
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
@ -1,21 +1,24 @@
 module HTML5lib
  module TreeBuilders

-    def self.getTreeBuilder(name)
-      case name.to_s.downcase
+    class << self
+      def [](name)
+        case name.to_s.downcase
        when 'simpletree' then
          require 'html5lib/treebuilders/simpletree'
          SimpleTree::TreeBuilder
        when 'rexml' then
          require 'html5lib/treebuilders/rexml'
-          REXMLTree::TreeBuilder
+          REXML::TreeBuilder
        when 'hpricot' then
          require 'html5lib/treebuilders/hpricot'
          Hpricot::TreeBuilder
        else
          raise "Unknown TreeBuilder #{name}"
+        end
      end
-    end

+      alias :getTreeBuilder :[]
+    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
@ -144,7 +144,7 @@ module HTML5lib
          # code. It should still do the same though.

          # Step 1: stop the algorithm when there's nothing to do.
-          return unless @activeFormattingElements
+          return if @activeFormattingElements.empty?

          # Step 2 and step 3: we start with the last element. So i is -1.
          i = -1
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
@ -1,4 +1,5 @@
 require 'html5lib/treebuilders/base'
+require 'rubygems'
 require 'hpricot'
 require 'forwardable'

@ -26,12 +27,14 @@ module HTML5lib
            childNodes << node
            hpricot.children << node.hpricot
          end
+          node.hpricot.parent = hpricot
          node.parent = self
        end

        def removeChild(node)
           childNodes.delete(node)
           hpricot.children.delete_at(hpricot.children.index(node.hpricot))
+           node.hpricot.parent = nil
           node.parent = nil
        end

@ -48,6 +51,7 @@ module HTML5lib
          if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
            childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
          else
+            refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
            childNodes.insert(index, node)
          end
        end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
@ -4,7 +4,7 @@ require 'forwardable'

 module HTML5lib
  module TreeBuilders
-    module REXMLTree
+    module REXML

      class Node < Base::Node
        extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
            childNodes[index-1].rxobj.raw = true
          else
            childNodes.insert index, node
+            refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
          end
        end

@ -62,7 +63,7 @@ module HTML5lib

      class Element < Node
        def self.rxclass
-          REXML::Element
+          ::REXML::Element
        end

        def initialize name
@ -95,7 +96,7 @@ module HTML5lib

      class Document < Node
        def self.rxclass
-          REXML::Document
+          ::REXML::Document
        end

        def initialize
@ -120,7 +121,7 @@ module HTML5lib

      class DocumentType < Node
        def self.rxclass
-          REXML::DocType
+          ::REXML::DocType
        end

        def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
      class TextNode < Node
        def initialize data
          raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
-          @rxobj = REXML::Text.new(raw, true, nil, true)
+          @rxobj = ::REXML::Text.new(raw, true, nil, true)
        end

        def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib

      class CommentNode < Node
        def self.rxclass
-          REXML::Comment
+          ::REXML::Comment
        end

        def printTree indent=0
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
@ -0,0 +1,26 @@
+require 'html5lib/treewalkers/base'
+
+module HTML5lib
+  module TreeWalkers
+
+    class << self
+      def [](name)
+        case name.to_s.downcase
+        when 'simpletree' then
+          require 'html5lib/treewalkers/simpletree'
+          SimpleTree::TreeWalker
+        when 'rexml' then
+          require 'html5lib/treewalkers/rexml'
+          REXML::TreeWalker
+        when 'hpricot' then
+          require 'html5lib/treewalkers/hpricot'
+          Hpricot::TreeWalker
+        else
+          raise "Unknown TreeWalker #{name}"
+        end
+      end
+
+      alias :getTreeWalker :[]
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
@ -0,0 +1,156 @@
+require 'html5lib/constants'
+module HTML5lib
+module TreeWalkers
+
+module TokenConstructor
+    def error(msg)
+        return {:type => "SerializeError", :data => msg}
+    end
+
+    def normalizeAttrs(attrs)
+        attrs.to_a
+    end
+
+    def emptyTag(name, attrs, hasChildren=false)
+        error(_("Void element has children")) if hasChildren
+        return({:type => :EmptyTag, :name => name, \
+                :data => normalizeAttrs(attrs)})
+    end
+
+    def startTag(name, attrs)
+        return {:type => :StartTag, :name => name, \
+                 :data => normalizeAttrs(attrs)}
+    end
+
+    def endTag(name)
+        return {:type => :EndTag, :name => name, :data => []}
+    end
+
+    def text(data)
+        if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
+          yield({:type => :SpaceCharacters, :data => $1})
+          data = data[$1.length .. -1]
+          return if data.empty?
+        end
+
+        if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
+          yield({:type => :Characters, :data => data[0 ... -$1.length]})
+          yield({:type => :SpaceCharacters, :data => $1})
+        else
+          yield({:type => :Characters, :data => data})
+        end
+    end
+
+    def comment(data)
+        return {:type => :Comment, :data => data}
+    end
+
+    def doctype(name)
+        return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
+    end
+
+    def unknown(nodeType)
+        return error(_("Unknown node type: ") + nodeType.to_s)
+    end
+
+    def _(str)
+      str
+    end
+end
+
+class Base
+    include TokenConstructor
+
+    def initialize(tree)
+        @tree = tree
+    end
+
+    def each
+        raise NotImplementedError
+    end
+
+    alias walk each
+end
+
+class NonRecursiveTreeWalker < TreeWalkers::Base
+    def node_details(node)
+        raise NotImplementedError
+    end
+
+    def first_child(node)
+        raise NotImplementedError
+    end
+
+    def next_sibling(node)
+        raise NotImplementedError
+    end
+
+    def parent(node)
+        raise NotImplementedError
+    end
+
+    def each
+        currentNode = @tree
+        while currentNode != nil
+            details = node_details(currentNode)
+            hasChildren = false
+
+            case details.shift
+            when :DOCTYPE
+                yield doctype(*details)
+
+            when :TEXT
+                text(*details) {|token| yield token}
+
+            when :ELEMENT
+                name, attributes, hasChildren = details
+                if VOID_ELEMENTS.include?(name)
+                    yield emptyTag(name, attributes.to_a, hasChildren)
+                    hasChildren = false
+                else
+                    yield startTag(name, attributes.to_a)
+                end
+
+            when :COMMENT
+                yield comment(details[0])
+
+            when :DOCUMENT, :DOCUMENT_FRAGMENT
+                hasChildren = true
+
+            when nil
+                # ignore (REXML::XMLDecl is an example)
+
+            else
+                yield unknown(details[0])
+            end
+
+            firstChild = hasChildren ? first_child(currentNode) : nil
+            if firstChild != nil
+                currentNode = firstChild
+            else
+                while currentNode != nil
+                    details = node_details(currentNode)
+                    if details.shift == :ELEMENT
+                        name, attributes, hasChildren = details
+                        yield endTag(name) if !VOID_ELEMENTS.include?(name)
+                    end
+
+                    if @tree == currentNode
+                        currentNode = nil
+                    else
+                        nextSibling = next_sibling(currentNode)
+                        if nextSibling != nil
+                            currentNode = nextSibling
+                            break
+                        end
+
+                        currentNode = parent(currentNode)
+                    end
+                end
+            end
+        end
+    end
+end
+
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+require 'rexml/document'
+
+module HTML5lib
+  module TreeWalkers
+    module Hpricot
+      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
+
+        def node_details(node)
+          case node
+          when ::Hpricot::Elem
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value]},
+                !node.empty?]
+            end
+          when ::Hpricot::Text
+            [:TEXT, node.to_plain_text]
+          when ::Hpricot::Comment
+            [:COMMENT, node.content]
+          when ::Hpricot::Doc
+            [:DOCUMENT]
+          when ::Hpricot::DocType
+            [:DOCTYPE, node.target]
+          when ::Hpricot::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_node
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+require 'rexml/document'
+
+module HTML5lib
+  module TreeWalkers
+    module REXML
+      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
+
+        def node_details(node)
+          case node
+          when ::REXML::Document
+            [:DOCUMENT]
+          when ::REXML::Element
+            if !node.name
+              [:DOCUMENT_FRAGMENT]
+            else
+              [:ELEMENT, node.name,
+                node.attributes.map {|name,value| [name,value]},
+                node.has_elements? || node.has_text?]
+            end
+          when ::REXML::Text
+            [:TEXT, node.value]
+          when ::REXML::Comment
+            [:COMMENT, node.string]
+          when ::REXML::DocType
+            [:DOCTYPE, node.name]
+          when ::REXML::XMLDecl
+            [nil]
+          else
+            [:UNKNOWN, node.class.inspect]
+          end
+        end
+
+        def first_child(node)
+          node.children.first
+        end
+
+        def next_sibling(node)
+          node.next_sibling
+        end
+
+        def parent(node)
+          node.parent
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
@ -0,0 +1,48 @@
+require 'html5lib/treewalkers/base'
+
+module HTML5lib
+  module TreeWalkers
+    module SimpleTree
+      class TreeWalker < HTML5lib::TreeWalkers::Base
+        include HTML5lib::TreeBuilders::SimpleTree
+
+        def walk(node)
+          case node
+          when Document, DocumentFragment
+            return
+
+          when DocumentType
+            yield doctype(node.name)
+
+          when TextNode
+            text(node.value) {|token| yield token}
+
+          when Element
+            if VOID_ELEMENTS.include?(node.name)
+              yield emptyTag(node.name, node.attributes, node.hasContent())
+            else
+              yield startTag(node.name, node.attributes)
+              for child in node.childNodes
+                walk(child) {|token| yield token}
+              end
+              yield endTag(node.name)
+            end
+
+          when CommentNode
+            yield comment(node.value)
+
+          else
+            puts '?'
+            yield unknown(node.class)
+          end
+        end
+
+        def each
+          for child in @tree.childNodes
+            walk(child) {|node| yield node}
+          end
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/parse.rb
+++ b/vendor/plugins/HTML5lib/parse.rb
@ -0,0 +1,137 @@
+#!/usr/bin/env ruby
+# 
+# Parse a document to a simpletree tree, with optional profiling
+
+$:.unshift File.dirname(__FILE__),'lib'
+
+def parse(opts, args)
+
+  f = args[-1]
+  if f
+    begin
+      require 'open-uri' if f[0..6] == 'http://'
+      f = open(f)
+    rescue
+    end
+  else
+    $stderr.write("No filename provided. Use -h for help\n")
+    exit(1)
+  end
+
+  require 'html5lib/treebuilders'
+  treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
+
+  if opts.output == :xml
+    require 'html5lib/liberalxmlparser'
+    p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
+  else
+    require 'html5lib/html5parser'
+    p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
+  end
+
+  if opts.profile
+    require 'profiler'
+    Profiler__::start_profile
+    p.parse(f)
+    Profiler__::stop_profile
+    Profiler__::print_profile($stderr)
+  elsif opts.time
+    require 'time'
+    t0 = Time.new
+    document = p.parse(f)
+    t1 = Time.new
+    printOutput(p, document, opts)
+    t2 = Time.new
+    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
+  else
+    document = p.parse(f)
+    printOutput(p, document, opts)
+  end
+end
+
+def printOutput(parser, document, opts)
+  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding
+
+  case opts.output
+  when :xml
+    print document
+  when :html
+    require 'html5lib/treewalkers'
+    tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
+    require 'html5lib/serializer'
+    print HTML5lib::HTMLSerializer.serialize(tokens, :encoding=>'utf-8')
+  when :hilite
+    print document.hilite
+  when :tree
+    print parser.tree.testSerializer(document)
+  end
+
+  if opts.error
+    errList=[]
+    for pos, message in parser.errors
+        errList << ("Line %i Col %i"%pos + " " + message)
+    end
+    $stderr.write("\nParse errors:\n" + errList.join("\n")+"\n")
+  end
+end
+
+require 'ostruct'
+options = OpenStruct.new
+options.profile = false
+options.time = false
+options.output = :tree
+options.treebuilder = 'simpletree'
+options.error = false
+options.encoding = false
+
+require 'optparse'
+opts = OptionParser.new do |opts|
+  opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
+    options.profile = profile
+  end
+    
+  opts.on("-t", "--[no-]time", "Time the run") do |time|
+    options.time = time
+  end
+    
+  opts.on("--[no-]tree", "Do not print output tree") do |tree|
+    if tree
+      options.output = :tree
+    else
+      options.output = nil
+    end
+  end
+  
+  opts.on("-b", "--treebuilder NAME") do |treebuilder|
+    options.treebuilder = treebuilder
+  end
+
+  opts.on("-e", "--error", "Print a list of parse errors") do |error|
+    options.error = error
+  end
+
+  opts.on("-x", "--xml", "output as xml") do |xml|
+    options.output = :xml
+    options.treebuilder = "rexml"
+  end
+  
+  opts.on("--html", "Output as html") do |html|
+    options.output = :html
+  end
+  
+  opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
+    options.output = :hilite
+  end
+  
+  opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
+    options.encoding = encoding
+  end
+
+  opts.on_tail("-h", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+end
+
+opts.parse!(ARGV)
+parse options, ARGV
--- a/vendor/plugins/HTML5lib/tests/preamble.rb
+++ b/vendor/plugins/HTML5lib/tests/preamble.rb
@ -21,3 +21,53 @@ rescue LoadError
    end
  end
 end
+
+module HTML5lib
+  module TestSupport
+    def self.startswith?(a, b)
+      b[0... a.length] == a
+    end
+
+    def self.parseTestcase(data)
+      innerHTML = nil
+      input = []
+      output = []
+      errors = []
+      currentList = input
+      data.split(/\n/).each do |line|
+        if !line.empty? and !startswith?("#errors", line) and
+          !startswith?("#document", line) and
+          !startswith?("#data", line) and
+          !startswith?("#document-fragment", line)
+
+          if currentList == output and startswith?("|", line)
+            currentList.push(line[2..-1])
+          else
+            currentList.push(line)
+          end
+        elsif line == "#errors"
+          currentList = errors
+        elsif line == "#document" or startswith?("#document-fragment", line)
+          if startswith?("#document-fragment", line)
+            innerHTML = line[19..-1]
+            raise AssertionError unless innerHTML
+          end
+          currentList = output
+        end
+      end
+      return innerHTML, input.join("\n"), output.join("\n"), errors
+    end
+
+    # convert the output of str(document) to the format used in the testcases
+    def convertTreeDump(treedump)
+      treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
+    end
+
+    def sortattrs(output)
+      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
+         match.split("\n").sort.join("\n")
+      end
+    end
+
+  end
+end
--- a/vendor/plugins/HTML5lib/tests/test_encoding.rb
+++ b/vendor/plugins/HTML5lib/tests/test_encoding.rb
@ -4,33 +4,33 @@ require 'html5lib/inputstream'

 class Html5EncodingTestCase < Test::Unit::TestCase

-begin
+  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet
-        File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
-            stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
-            assert_equal 'big5', stream.char_encoding.downcase
-        end
+      File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
+        stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
+        assert_equal 'big5', stream.char_encoding.downcase
+      end
    end
-rescue LoadError
+  rescue LoadError
    puts "chardet not found, skipping chardet tests"
-end
+  end

-    html5lib_test_files('encoding').each do |test_file|        
-        test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
+  html5lib_test_files('encoding').each do |test_file|        
+    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')

-        File.read(test_file).split("#data\n").each_with_index do |data, index|
-            next if data.empty?
-            input, encoding = data.split(/\n#encoding\s+/, 2)
-            encoding = encoding.split[0]
+    File.read(test_file).split("#data\n").each_with_index do |data, index|
+      next if data.empty?
+      input, encoding = data.split(/\n#encoding\s+/, 2)
+      encoding = encoding.split[0]

-            define_method 'test_%s_%d' % [ test_name, index + 1 ] do
-                stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
-                assert_equal encoding.downcase, stream.char_encoding.downcase, input
-            end
-        end
+      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
+        stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
+        assert_equal encoding.downcase, stream.char_encoding.downcase, input
+      end
    end
+  end

 end
--- a/vendor/plugins/HTML5lib/tests/test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/test_parser.rb
@ -14,53 +14,12 @@ end

 $CHECK_PARSER_ERRORS = false

-puts 'Testing: ' + $tree_types_to_test * ', '
+puts 'Testing tree builders: ' + $tree_types_to_test * ', '


 class Html5ParserTestCase < Test::Unit::TestCase
-
-  def self.startswith?(a, b)
-    b[0... a.length] == a
-  end
-
-  def self.parseTestcase(data)
-    innerHTML = nil
-    input = []
-    output = []
-    errors = []
-    currentList = input
-    data.split(/\n/).each do |line|
-      if !line.empty? and !startswith?("#errors", line) and
-        !startswith?("#document", line) and
-        !startswith?("#data", line) and
-        !startswith?("#document-fragment", line)
-
-        if currentList == output and startswith?("|", line)
-          currentList.push(line[2..-1])
-        else
-          currentList.push(line)
-        end
-      elsif line == "#errors"
-        currentList = errors
-      elsif line == "#document" or startswith?("#document-fragment", line)
-        if startswith?("#document-fragment", line)
-          innerHTML = line[19..-1]
-          raise AssertionError unless innerHTML
-        end
-        currentList = output
-      end
-    end
-    return innerHTML, input.join("\n"), output.join("\n"), errors
-  end
-  
-  # convert the output of str(document) to the format used in the testcases
-  def convertTreeDump(treedump)
-    treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
-  end
-
-  def sortattrs(output)
-    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
-  end
+  include HTML5lib
+  include TestSupport

  html5lib_test_files('tree-construction').each do |test_file|

@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?
     
-      innerHTML, input, expected_output, expected_errors = parseTestcase(data)
+      innerHTML, input, expected_output, expected_errors =
+        TestSupport.parseTestcase(data)

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

-          parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
+          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
        
          if innerHTML
            parser.parseFragment(input, innerHTML)
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@ -2,9 +2,11 @@

 require File.join(File.dirname(__FILE__), 'preamble')

-require 'html5lib/sanitizer'
 require 'html5lib/html5parser'
 require 'html5lib/liberalxmlparser'
+require 'html5lib/treewalkers'
+require 'html5lib/serializer'
+require 'html5lib/sanitizer'

 class SanitizeTest < Test::Unit::TestCase
  include HTML5lib
--- a/vendor/plugins/HTML5lib/tests/test_serializer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_serializer.rb
@ -0,0 +1,52 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/html5parser'
+require 'html5lib/serializer'
+require 'html5lib/treewalkers'
+
+#Run the serialize error checks
+checkSerializeErrors = false
+
+class JsonWalker < HTML5lib::TreeWalkers::Base
+  def each
+    @tree.each do |token|
+      case token[0]
+      when 'StartTag'
+        yield startTag(token[1], token[2])
+      when 'EndTag'
+        yield endTag(token[1])
+      when 'EmptyTag'
+        yield emptyTag(token[1], token[2])
+      when 'Comment'
+        yield comment(token[1])
+      when 'Characters', 'SpaceCharacters'
+        text(token[1]) {|textToken| yield textToken}
+      when 'Doctype'
+        yield doctype(token[1])
+      else
+        raise ValueError("Unknown token type: " + type)
+      end
+    end
+  end
+end
+
+class Html5SerializeTestcase < Test::Unit::TestCase
+  html5lib_test_files('serializer').each do |filename|
+    test_name = File.basename(filename).sub('.test', '')
+    tests = JSON::parse(open(filename).read)
+    tests['tests'].each_with_index do |test, index|
+
+      define_method "test_#{test_name}_#{index+1}" do
+        result = HTML5lib::HTMLSerializer.
+          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+        expected = test["expected"]
+        if expected.length == 1
+          assert_equal(expected[0], result, test["description"])
+        elsif !expected.include?(result)
+          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+        end
+      end
+
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/tests/test_stream.rb
+++ b/vendor/plugins/HTML5lib/tests/test_stream.rb
@ -0,0 +1,54 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/inputstream'
+
+class HTMLInputStreamTest < Test::Unit::TestCase
+  include HTML5lib
+
+  def test_char_ascii
+    stream = HTMLInputStream.new("'")
+    assert_equal('ascii', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+
+  def test_char_null
+    stream = HTMLInputStream.new("\x00")
+    assert_equal("\xef\xbf\xbd", stream.char)
+  end
+
+  def test_char_utf8
+    stream = HTMLInputStream.new("\xe2\x80\x98")
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("\xe2\x80\x98", stream.char)
+  end
+
+  def test_char_win1252
+    stream = HTMLInputStream.new("\x91")
+    assert_equal('windows-1252', stream.char_encoding)
+    assert_equal("\xe2\x80\x98", stream.char)
+  end
+
+  def test_bom
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+
+  def test_utf_16
+    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
+    assert(stream.char_encoding, 'utf-16-le')
+    assert_equal(1025, stream.chars_until(' ',true).length)
+  end
+
+  def test_newlines
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
+    assert_equal(0, stream.instance_eval {@tell})
+    assert_equal("a\nbb\n", stream.chars_until('c'))
+    assert_equal(6, stream.instance_eval {@tell})
+    assert_equal([3,1], stream.position)
+    assert_equal("ccc\ndddd", stream.chars_until('x'))
+    assert_equal(14, stream.instance_eval {@tell})
+    assert_equal([4,5], stream.position)
+    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
+  end
+end
--- a/vendor/plugins/HTML5lib/tests/test_treewalkers.rb
+++ b/vendor/plugins/HTML5lib/tests/test_treewalkers.rb
@ -0,0 +1,110 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/html5parser'
+require 'html5lib/treewalkers'
+require 'html5lib/treebuilders'
+
+$tree_types_to_test = {
+  'simpletree' =>
+    {:builder => HTML5lib::TreeBuilders['simpletree'],
+     :walker  => HTML5lib::TreeWalkers['simpletree']},
+  'rexml' =>
+    {:builder => HTML5lib::TreeBuilders['rexml'],
+     :walker  => HTML5lib::TreeWalkers['rexml']},
+# 'hpricot' =>
+#   {:builder => HTML5lib::TreeBuilders['hpricot'],
+#    :walker  => HTML5lib::TreeWalkers['hpricot']},
+}
+
+puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
+
+class TestTreeWalkers < Test::Unit::TestCase
+  include HTML5lib::TestSupport
+
+  def concatenateCharacterTokens(tokens)
+    charactersToken = nil
+    for token in tokens
+        type = token[:type]
+        if [:Characters, :SpaceCharacters].include?(type)
+            if charactersToken == nil
+                charactersToken = {:type => :Characters, :data => token[:data]}
+            else
+                charactersToken[:data] += token[:data]
+            end
+        else
+            if charactersToken != nil
+                yield charactersToken
+                charactersToken = nil
+            end
+            yield token
+        end
+    end
+    yield charactersToken if charactersToken != nil
+  end
+
+  def convertTokens(tokens)
+    output = []
+    indent = 0
+    concatenateCharacterTokens(tokens) do |token|
+        case token[:type]
+        when :StartTag, :EmptyTag
+            output << "#{' '*indent}<#{token[:name]}>"
+            indent += 2
+            for name, value in token[:data].to_a.sort
+                next if name=='xmlns'
+                output << "#{' '*indent}#{name}=\"#{value}\""
+            end
+            indent -= 2 if token[:type] == :EmptyTag
+        when :EndTag
+            indent -= 2
+        when :Comment
+            output << "#{' '*indent}<!-- #{token[:data]} -->"
+        when :Doctype
+            output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
+        when :Characters, :SpaceCharacters
+            output << "#{' '*indent}\"#{token[:data]}\""
+        else
+            # TODO: what to do with errors?
+        end
+    end
+    return output.join("\n")
+  end
+
+  html5lib_test_files('tree-construction').each do |test_file|
+
+    test_name = File.basename(test_file).sub('.dat', '')
+
+    File.read(test_file).split("#data\n").each_with_index do |data, index|
+      next if data.empty?
+
+      innerHTML, input, expected_output, expected_errors =
+        HTML5lib::TestSupport::parseTestcase(data)
+
+      rexml = $tree_types_to_test['rexml']
+      $tree_types_to_test.each do |tree_name, treeClass|
+
+        define_method "test_#{test_name}_#{index}_#{tree_name}" do
+
+          parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])
+
+          if innerHTML
+            parser.parseFragment(input, innerHTML)
+          else
+            parser.parse(input)
+          end
+
+          document = parser.tree.getDocument
+
+          begin
+            output = sortattrs(convertTokens(treeClass[:walker].new(document)))
+            expected = sortattrs(expected_output)
+            errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n"
+            assert_equal(expected, output, errorMsg)
+          rescue NotImplementedError
+            # Amnesty for those that confess...
+          end
+        end
+      end
+   end
+  end
+end
--- a/vendor/plugins/maruku/lib/maruku/output/to_html.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/to_html.rb
@ -154,6 +154,21 @@ Example:
 	CSS: style.css math.css

 =end
+	# Render to an HTML fragment (returns a REXML document tree)
+	def to_html_tree
+		div = Element.new 'div'
+                        children_to_html.each do |e|
+                                div << e
+                        end
+
+                        # render footnotes
+                        if @doc.footnotes_order.size > 0
+                                div << render_footnotes
+                        end
+
+                doc = Document.new(nil,{:respect_whitespace =>:all})
+                doc << div
+	end

 	# Render to a complete HTML document (returns a REXML document tree)
 	def to_html_document_tree