REXML Trees

Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
2007-06-05 16:34:49 -05:00 · 2007-06-05 16:34:49 -05:00 · bd8ba1f4b1
commit bd8ba1f4b1
parent 4dd70af5ae
28 changed files with 1317 additions and 112 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -53,9 +53,10 @@ module Engines
    def mask
      require_dependency 'maruku'
      require_dependency 'maruku/ext/math'
-      html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+#      html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
-            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
+#            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
-      sanitize_xhtml(html.to_ncr)
+      html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr)
    end
  end
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -8,19 +8,36 @@ module Sanitize
 #
 #  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
 #  sanitize_html() is a case-insensitive sanitizer suitable for HTML
 #  sanitize_rexml() sanitizes a REXML tree, returning a string
  require 'html5lib/sanitizer'
  require 'html5lib/html5parser'
  require 'html5lib/liberalxmlparser'
  require 'html5lib/treewalkers'
  require 'html5lib/serializer'
  require 'string_utils'
  require 'html5lib/sanitizer'
  include HTML5lib
  def sanitize_xhtml(html)
-    XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
+    XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s
  end
  # Case-insensitive sanitizer for HTML: parses +html+ as a fragment with
  # HTMLSanitizer as the tokenizer and returns the result as a string.
  def sanitize_html(html)
    fragment = HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer)
    fragment.to_s
  end
  # Sanitizes a REXML tree and returns the serialized markup as a string.
  #
  # The tree is passed through +to_ncr+, walked with the 'rexml' tree walker
  # and serialized by HTMLSerializer with sanitization switched on.
  #
  # The serializer switches are real booleans. The strings 'true'/'false'
  # are both truthy in Ruby, so passing 'false' silently enabled the option
  # (e.g. boolean attributes were minimized although that was meant to be
  # turned off).
  def sanitize_rexml(tree)
    tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
    HTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
      :quote_attr_values => true,
      :minimize_boolean_attributes => false,
      :use_trailing_solidus => true,
      :space_before_trailing_solidus => true,
      :omit_optional_tags => false,
      :inject_meta_charset => false,
      :sanitize => true})
  end
 end
--- a/lib/string_utils.rb
+++ b/lib/string_utils.rb
@ -2155,3 +2155,20 @@ class String
    end
 end
 require 'rexml/element'
 module REXML
  class Element
    # Applies String#to_ncr, in place, to the text nodes and attribute
    # values of every element matched by the XPath '//*'.
    # Returns the receiver.
    def to_ncr
      XPath.each(self, '//*') do |element|
        element.texts.each do |text_node|
          text_node.value = text_node.to_s.to_ncr
        end
        element.attributes.each do |attr_name, attr_value|
          element.attributes[attr_name] = attr_value.to_ncr
        end
      end
      self
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@ -148,6 +148,26 @@ module HTML5lib
      input
  ]
  # Names of boolean attributes, keyed by element name. The :global entry
  # holds the names that are looked up regardless of the element.
  BOOLEAN_ATTRIBUTES = {
    :global => %w[irrelevant],
    'style' => %w[scoped],
    'img' => %w[ismap],
    'audio' => %w[autoplay controls],
    'video' => %w[autoplay controls],
    'script' => %w[defer async],
    'details' => %w[open],
    'datagrid' => %w[multiple disabled],
    'command' => %w[hidden disabled checked default],
    'menu' => %w[autosubmit],
    'fieldset' => %w[disabled readonly],
    'option' => %w[disabled readonly selected],
    'optgroup' => %w[disabled readonly],
    'button' => %w[disabled autofocus],
    'input' => %w[disabled readonly required autofocus checked ismap],
    'select' => %w[disabled readonly autofocus multiple],
    'output' => %w[disabled readonly]
  }
  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  ENTITIES_WINDOWS1252 = [
      8364,  # 0x80  0x20AC  EURO SIGN
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
@ -37,13 +37,13 @@ module HTML5lib
    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
-    # html5lib.treebuilders.getTreeBuilder(treeType)
+    # HTML5lib::TreeBuilders[treeType]
    def initialize(options = {})
      @strict = false
      @errors = []
      @tokenizer =  HTMLTokenizer
-      @tree = TreeBuilders::REXMLTree::TreeBuilder
+      @tree = TreeBuilders::REXML::TreeBuilder
      options.each { |name, value| instance_variable_set("@#{name}", value) }
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb
@ -107,4 +107,4 @@ module HTML5lib
    end
  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb
@ -153,4 +153,4 @@ module HTML5lib
    end
  end
-end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -58,7 +58,7 @@ module HTML5lib
      unless @char_encoding == 'utf-8'
        begin
          require 'iconv'
-          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
+          uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
        rescue
        end
      end
@ -95,11 +95,13 @@ module HTML5lib
      #First look for a BOM
      #This will also read past the BOM if present
      encoding = detect_bom
      #If there is no BOM need to look for meta elements with encoding 
      #information
      if encoding.nil? and @parse_meta
        encoding = detect_encoding_meta
      end
      #Guess with chardet, if available
      if encoding.nil? and @chardet
        begin
@ -111,13 +113,14 @@ module HTML5lib
        rescue LoadError
        end
      end
      # If all else fails use the default encoding
      if encoding.nil?
        encoding = @DEFAULT_ENCODING
      end
      #Substitute for equivalent encodings:
-      encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
+      encoding_sub = {'iso-8859-1' => 'windows-1252'}
      if encoding_sub.has_key?(encoding.downcase)
        encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
    def detect_bom
      bom_dict = {
        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf-16-le',
+        "\xff\xfe" => 'utf16le',
-        "\xfe\xff" => 'utf-16-be',
+        "\xfe\xff" => 'utf16be',
-        "\xff\xfe\x00\x00" => 'utf-32-le',
+        "\xff\xfe\x00\x00" => 'utf32le',
-        "\x00\x00\xfe\xff" => 'utf-32-be'
+        "\x00\x00\xfe\xff" => 'utf32be'
      }
      # Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
      else
        begin
          @tell += 1
-          return @data_stream[@tell - 1].chr
+          c = @data_stream[@tell - 1]
          case c
          when 0xC2 .. 0xDF
            @tell += 1
            c.chr + @data_stream[@tell-1].chr
          when 0xE0 .. 0xF0
            @tell += 2
            c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
          else
            c.chr
          end
        rescue
          return :EOF
        end
@ -227,8 +240,8 @@ module HTML5lib
          else
            # Then the rest
            begin
              char_stack.push(@data_stream[@tell].chr)
              @tell += 1
              char_stack.push(@data_stream[@tell-1].chr)
            rescue
              char_stack.push(:EOF)
              break
--- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
@ -1,4 +1,3 @@
 require 'html5lib/tokenizer'
 require 'cgi'
 module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
 # This module provides sanitization of XHTML+MathML+SVG
 # and of inline style attributes.
-  class HTMLSanitizer < HTMLTokenizer
+   module HTMLSanitizeModule
    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
      button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
-    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+    def process_token(token)
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #  => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #  => <a>Click here for $100</a>
    def each
      super do |token|
        case token[:type]
        when :StartTag, :EndTag, :EmptyTag
          if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
              end
              token[:data] = attrs.map {|k,v| [k,v]}
            end
-            yield token
+            return token
          else
            if token[:type] == :EndTag
              token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
            token[:data].insert(-2,'/') if token[:type] == :EmptyTag
            token[:type] = :Characters
            token.delete(:name)
-            yield token
+            return token
          end
        else
-          yield token
+          return token
        end
      end
    end
    def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
      style = clean.join(' ')
    end
  end
  # Filter form of the sanitizer: every token of the wrapped source is run
  # through process_token before being handed to the caller's block.
  class HTMLSanitizeFilter < Filter
    include HTMLSanitizeModule

    def each
      @source.each { |token| yield process_token(token) }
    end
  end
  # Tokenizer form of the sanitizer: every token produced by the parent
  # tokenizer is run through process_token before being yielded.
  class HTMLSanitizer < HTMLTokenizer
    include HTMLSanitizeModule

    def each
      super { |token| yield process_token(token) }
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
@ -0,0 +1,418 @@
 require 'html5lib/constants'
 require 'jcode'
 module HTML5lib
 # Base class for token-stream filters. It only stores the wrapped stream
 # in @source; Enumerable is mixed in, so a subclass that defines #each
 # gets the usual collection methods.
 class Filter
    include Enumerable

    # source:: the token stream this filter wraps
    def initialize(source)
        @source = source
    end
 end
 class OptionalTagFilter < Filter
    # Iterates over @source with a three-token sliding window, yielding
    # (previous, current, next) once per token. +previous+ is nil for the
    # first token and +next+ is nil for the last one.
    #
    # An empty source yields nothing; previously it yielded (nil, nil, nil),
    # handing callers a nil "current" token.
    def slider
        previous1 = previous2 = nil
        @source.each do |token|
            yield previous2, previous1, token unless previous1.nil?
            previous2 = previous1
            previous1 = token
        end
        # Flush the last token, which has no successor.
        yield previous2, previous1, nil unless previous1.nil?
    end
    # Yields the tokens of the wrapped stream, leaving out start tags that
    # carry no attributes and are reported optional by is_optional_start,
    # and end tags reported optional by is_optional_end. All other tokens
    # pass through untouched.
    def each
        slider do |previous, token, nexttok|
            case token[:type]
            when :StartTag
                omit = (token[:data].empty? and
                        is_optional_start(token[:name], previous, nexttok))
                yield token unless omit
            when :EndTag
                yield token unless is_optional_end(token[:name], nexttok)
            else
                yield token
            end
        end
    end
    # Decides whether the start tag of +tagname+ may be left out.
    #
    # tagname::  name of the element whose start tag is being considered
    # previous:: the token before the start tag, or nil
    # nexttok::  the token after the start tag, or nil
    #
    # Returns true or false.
    def is_optional_start(tagname, previous, nexttok)
        next_type = nil
        next_type = nexttok[:type] if nexttok
        followed_by_start = (next_type == :StartTag)
        followed_by_filler = [:Comment, :SpaceCharacters].include?(next_type)

        case tagname
        when 'html'
            # Omissible unless the first thing inside the html element is
            # a space character or a comment.
            !followed_by_filler
        when 'head'
            # Omissible if the first thing inside the head element is an
            # element.
            followed_by_start
        when 'body'
            # Omissible unless the first thing inside the body element is a
            # space character or a comment.
            return false if followed_by_filler
            return true unless followed_by_start
            # XXX: the preceding event is not inspected, so the start tag
            # is never omitted when a script or style element follows.
            !%w[script style].include?(nexttok[:name])
        when 'colgroup'
            # Omissible if the first thing inside is a col element.
            # XXX: the preceding event is not inspected here; instead the
            # colgroup end tag is never omitted when another colgroup
            # follows. See is_optional_end.
            followed_by_start && nexttok[:name] == "col"
        when 'tbody'
            # Omissible if the first thing inside is a tr element and the
            # element is not immediately preceded by a tbody, thead or
            # tfoot end tag.
            return false unless followed_by_start
            after_section_end = (previous and previous[:type] == :EndTag and
                                 %w(tbody thead tfoot).include?(previous[:name]))
            return false if after_section_end
            nexttok[:name] == 'tr'
        else
            false
        end
    end
    # Decides whether the end tag of +tagname+ may be left out.
    #
    # tagname:: name of the element whose end tag is being considered
    # nexttok:: the token after the end tag, or nil
    #
    # Returns true or false.
    def is_optional_end(tagname, nexttok)
        next_type = nil
        next_type = nexttok[:type] if nexttok
        followed_by_start = (next_type == :StartTag)
        followed_by_filler = [:Comment, :SpaceCharacters].include?(next_type)
        # "No more content in the parent element": an end tag follows, or
        # nothing follows at all.
        parent_finished = (next_type == :EndTag || next_type == nil)

        case tagname
        when 'html', 'head', 'body'
            # Omissible unless the element is immediately followed by a
            # space character or a comment.
            !followed_by_filler
        when 'li', 'optgroup', 'option', 'tr'
            # Omissible if immediately followed by another element of the
            # same name, or if there is no more content in the parent.
            if followed_by_start
                nexttok[:name] == tagname
            else
                parent_finished
            end
        when 'dt', 'dd'
            # dt: omissible if followed by another dt or a dd element.
            # dd: omissible if followed by another dd or a dt element, or
            # if there is no more content in the parent element.
            if followed_by_start
                %w(dt dd).include?(nexttok[:name])
            else
                tagname == 'dd' && parent_finished
            end
        when 'p'
            # Omissible if followed by an address, blockquote, dl, fieldset,
            # form, h1-h6, hr, menu, ol, p, pre, table or ul element, or if
            # there is no more content in the parent element.
            if followed_by_start
                %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
                    h6 hr menu ol p pre table ul).include?(nexttok[:name])
            else
                parent_finished
            end
        when 'colgroup'
            # Omissible unless immediately followed by a space character or
            # a comment.
            return false if followed_by_filler
            return true unless followed_by_start
            # XXX: an immediately following colgroup element also keeps the
            # end tag. See is_optional_start.
            nexttok[:name] != 'colgroup'
        when 'thead', 'tbody'
            # thead: omissible if followed by a tbody or tfoot element.
            # tbody: omissible if followed by a tbody or tfoot element, or
            # if there is no more content in the parent element.
            if followed_by_start
                %w(tbody tfoot).include?(nexttok[:name])
            else
                tagname == 'tbody' && parent_finished
            end
        when 'tfoot'
            # Omissible if followed by a tbody element, or if there is no
            # more content in the parent element.
            if followed_by_start
                nexttok[:name] == 'tbody'
            else
                parent_finished
            end
        when 'td', 'th'
            # Omissible if followed by a td or th element, or if there is
            # no more content in the parent element.
            if followed_by_start
                %w(td th).include?(nexttok[:name])
            else
                parent_finished
            end
        else
            false
        end
    end
 # Turns a stream of tokens (hashes with a :type and, depending on the type,
 # :name and :data entries) into an HTML string.
 #
 # Problems found while serializing are collected in @errors.
 class HTMLSerializer
    # Elements whose text content is written out without escaping.
    CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]

    # Option names honoured by #initialize; any other key is ignored.
    OPTION_NAMES = %w(quote_attr_values quote_char use_best_quote_char
      minimize_boolean_attributes use_trailing_solidus
      space_before_trailing_solidus omit_optional_tags sanitize
      strip_whitespace inject_meta_charset)

    # Convenience wrapper: builds a serializer from +options+ and serializes
    # +stream+ with it (no output encoding is passed on).
    def self.serialize(stream, options = {})
        new(options).serialize(stream)
    end

    # options:: hash of settings; keys may be strings or symbols and must be
    #           one of OPTION_NAMES. Values are used for their truthiness,
    #           so pass real booleans (the string 'false' is truthy).
    #           Supplying quote_char turns use_best_quote_char off.
    def initialize(options={})
        @quote_attr_values = false
        @quote_char = '"'
        @use_best_quote_char = true
        @minimize_boolean_attributes = true
        @use_trailing_solidus = false
        @space_before_trailing_solidus = true
        @omit_optional_tags = true
        @sanitize = false
        @strip_whitespace = false
        @inject_meta_charset = true
        # Read by serializeError; it was never initialised before.
        @strict = false

        options.each do |name, value|
            next unless OPTION_NAMES.include? name.to_s
            @use_best_quote_char = false if name.to_s == 'quote_char'
            instance_variable_set("@#{name}", value)
        end

        @errors = []
    end

    # Serializes the tokens produced by +treewalker+ (anything responding
    # to #each) and returns the resulting string.
    #
    # encoding:: optional name of the output encoding. When given, every
    #            piece of output is transcoded to it and, if
    #            inject_meta_charset is on, a meta charset element is
    #            added inside head.
    #
    # Raises NotImplementedError when strip_whitespace is enabled.
    def serialize(treewalker, encoding=nil)
        in_cdata = false
        @errors = []
        if encoding and @inject_meta_charset
            treewalker = filter_inject_meta_charset(treewalker, encoding)
        end
        if @strip_whitespace
            treewalker = filter_whitespace(treewalker)
        end
        if @sanitize
            # Loaded lazily: only needed when sanitizing.
            require 'html5lib/sanitizer'
            treewalker = HTMLSanitizeFilter.new(treewalker)
        end
 #        if @omit_optional_tags
 #            treewalker = OptionalTagFilter.new(treewalker)
 #        end

        result = []
        treewalker.each do |token|
            type = token[:type]
            if type == :Doctype
                result << transcode("<!DOCTYPE %s>" % token[:name], encoding)

            elsif [:Characters, :SpaceCharacters].include? type
                if type == :SpaceCharacters or in_cdata
                    if in_cdata and token[:data].include?("</")
                        serializeError(_("Unexpected </ in CDATA"))
                    end
                    # Whitespace and CDATA content go out verbatim.
                    result << transcode(token[:data], encoding)
                else
                    # The same escaping applies with and without an output
                    # encoding; the encoding branch used to escape only "&".
                    result << transcode_with_entities(escape_text(token[:data]), encoding)
                end

            elsif [:StartTag, :EmptyTag].include? type
                name = token[:name]
                if CDATA_ELEMENTS.include?(name)
                    in_cdata = true
                elsif in_cdata
                    serializeError(_("Unexpected child element of a CDATA element"))
                end
                attributes = serialize_attributes(name, token[:data], encoding)
                if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
                    attributes << (@space_before_trailing_solidus ? " /" : "/")
                end
                result << "<%s%s>" % [transcode(name, encoding), attributes]

            elsif type == :EndTag
                name = token[:name]
                if CDATA_ELEMENTS.include?(name)
                    in_cdata = false
                elsif in_cdata
                    serializeError(_("Unexpected child element of a CDATA element"))
                end
                result << transcode("</%s>" % name, encoding)

            elsif type == :Comment
                data = token[:data]
                serializeError(_("Comment contains --")) if data.index("--")
                result << transcode_with_entities("<!--%s-->" % data, encoding)

            else
                # Anything else (e.g. an error token) is recorded as an error.
                serializeError(token[:data])
            end
        end
        result.join('')
    end

    # Same as #serialize; kept as a separate entry point.
    def render(treewalker, encoding=nil)
        serialize(treewalker, encoding)
    end

    # Records +data+ in @errors and raises SerializeError when @strict is set.
    def serializeError(data="XXX ERROR MESSAGE NEEDED")
        # XXX The idea is to make data mandatory.
        @errors.push(data)
        if @strict
            raise SerializeError
        end
    end

    # Returns the tokens of +treewalker+ as an array, with a
    # <meta charset=ENCODING> empty tag inserted right after the first
    # head start tag.
    def filter_inject_meta_charset(treewalker, encoding)
        done = false
        tokens = []
        treewalker.each do |token|
            tokens << token
            if not done and token[:type] == :StartTag \
              and token[:name].downcase == "head"
                tokens << {:type => :EmptyTag, :name => "meta",
                    :data => {"charset" => encoding}}
                done = true
            end
        end
        tokens
    end

    # Whitespace stripping is not implemented.
    def filter_whitespace(treewalker)
        raise NotImplementedError
    end

    private

    # Translation hook for messages; currently the identity function.
    def _(str)
        str
    end

    # Escapes the characters that are significant in HTML text content.
    def escape_text(text)
        text.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
    end

    # True when +attribute+ is listed as boolean for +element+ or globally.
    def boolean_attribute?(element, attribute)
        (BOOLEAN_ATTRIBUTES[element] || []).include?(attribute) ||
          BOOLEAN_ATTRIBUTES[:global].include?(attribute)
    end

    # Pattern matching any character that forces an attribute value to be
    # quoted: a space character, <, >, " or '. Built as a character class so
    # it works whether SPACE_CHARACTERS holds the characters themselves or
    # their escape sequences (NOTE(review): confirm which form
    # html5lib/constants uses).
    def attr_quote_pattern
        @attr_quote_pattern ||= Regexp.new("[#{SPACE_CHARACTERS.join('')}<>\"']")
    end

    # Builds the attribute part of a start tag (leading space included) for
    # element +name+. +attrs+ is a hash or an array of [name, value] pairs;
    # attributes are written in name order.
    def serialize_attributes(name, attrs, encoding)
        pieces = attrs.to_a.sort_by { |pair| pair[0] }.map do |k, v|
            piece = ' ' + transcode(k, encoding)
            if not @minimize_boolean_attributes or not boolean_attribute?(name, k)
                quote_attr = (@quote_attr_values or v.empty? or v =~ attr_quote_pattern)
                v = transcode_with_entities(v.gsub("&", "&amp;"), encoding)
                piece << "="
                if quote_attr
                    quote_char = @quote_char
                    if @use_best_quote_char
                        # Prefer the quote character the value does not contain.
                        if v.index("'") and !v.index('"')
                            quote_char = '"'
                        elsif v.index('"') and !v.index("'")
                            quote_char = "'"
                        end
                    end
                    if quote_char == "'"
                        v = v.gsub("'", "&#39;")
                    else
                        v = v.gsub('"', "&quot;")
                    end
                    piece << quote_char << v << quote_char
                else
                    piece << v
                end
            end
            piece
        end
        pieces.join('')
    end

    # Transcodes +text+ to +encoding+. Returns +text+ unchanged when no
    # encoding was requested or when String#encode is unavailable.
    def transcode(text, encoding)
        return text unless encoding and text.respond_to?(:encode)
        text.encode(encoding)
    end

    # Like #transcode, but characters that do not exist in the target
    # encoding are written as hexadecimal character references.
    def transcode_with_entities(text, encoding)
        return text unless encoding and text.respond_to?(:encode)
        text.encode(encoding, :fallback => lambda { |char| "&#x%X;" % char.ord })
    end
 end
 # Error in serialized tree.
 # Derives from StandardError (not Exception) so that an ordinary `rescue`
 # clause is able to catch it.
 class SerializeError < StandardError
 end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
@ -1,21 +1,24 @@
 module HTML5lib
  module TreeBuilders
-    def self.getTreeBuilder(name)
+    class << self
-      case name.to_s.downcase
+      def [](name)
        case name.to_s.downcase
        when 'simpletree' then
          require 'html5lib/treebuilders/simpletree'
          SimpleTree::TreeBuilder
        when 'rexml' then
          require 'html5lib/treebuilders/rexml'
-          REXMLTree::TreeBuilder
+          REXML::TreeBuilder
        when 'hpricot' then
          require 'html5lib/treebuilders/hpricot'
          Hpricot::TreeBuilder
        else
          raise "Unknown TreeBuilder #{name}"
        end
      end
    end
      alias :getTreeBuilder :[]
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
@ -144,7 +144,7 @@ module HTML5lib
          # code. It should still do the same though.
          # Step 1: stop the algorithm when there's nothing to do.
-          return unless @activeFormattingElements
+          return if @activeFormattingElements.empty?
          # Step 2 and step 3: we start with the last element. So i is -1.
          i = -1
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
@ -1,4 +1,5 @@
 require 'html5lib/treebuilders/base'
 require 'rubygems'
 require 'hpricot'
 require 'forwardable'
@ -26,12 +27,14 @@ module HTML5lib
            childNodes << node
            hpricot.children << node.hpricot
          end
          node.hpricot.parent = hpricot
          node.parent = self
        end
        # Detaches +node+ from this node: drops it from childNodes, removes
        # its hpricot object from this node's hpricot children, and clears
        # the parent reference on both the wrapper and the hpricot object.
        def removeChild(node)
          childNodes.delete(node)
          position = hpricot.children.index(node.hpricot)
          hpricot.children.delete_at(position)
          node.hpricot.parent = nil
          node.parent = nil
        end
@ -48,6 +51,7 @@ module HTML5lib
          if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
            childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
          else
            refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
            childNodes.insert(index, node)
          end
        end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
@ -4,7 +4,7 @@ require 'forwardable'
 module HTML5lib
  module TreeBuilders
-    module REXMLTree
+    module REXML
      class Node < Base::Node
        extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
            childNodes[index-1].rxobj.raw = true
          else
            childNodes.insert index, node
            refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
          end
        end
@ -62,7 +63,7 @@ module HTML5lib
      class Element < Node
        def self.rxclass
-          REXML::Element
+          ::REXML::Element
        end
        def initialize name
@ -95,7 +96,7 @@ module HTML5lib
      class Document < Node
        def self.rxclass
-          REXML::Document
+          ::REXML::Document
        end
        def initialize
@ -120,7 +121,7 @@ module HTML5lib
      class DocumentType < Node
        def self.rxclass
-          REXML::DocType
+          ::REXML::DocType
        end
        def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
      class TextNode < Node
        def initialize data
          raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
-          @rxobj = REXML::Text.new(raw, true, nil, true)
+          @rxobj = ::REXML::Text.new(raw, true, nil, true)
        end
        def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib
      class CommentNode < Node
        def self.rxclass
-          REXML::Comment
+          ::REXML::Comment
        end
        def printTree indent=0
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
@ -0,0 +1,26 @@
 require 'html5lib/treewalkers/base'
 module HTML5lib
  module TreeWalkers

    class << self
      # Looks up a tree walker class by name (case-insensitive; strings and
      # symbols are accepted), requiring its file on demand.
      # Raises a RuntimeError for an unknown name.
      def [](name)
        wanted = name.to_s.downcase
        if wanted == 'simpletree'
          require 'html5lib/treewalkers/simpletree'
          SimpleTree::TreeWalker
        elsif wanted == 'rexml'
          require 'html5lib/treewalkers/rexml'
          REXML::TreeWalker
        elsif wanted == 'hpricot'
          require 'html5lib/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      # Older spelling of the same lookup.
      alias :getTreeWalker :[]
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
@ -0,0 +1,156 @@
 require 'html5lib/constants'
 module HTML5lib
 module TreeWalkers
 # Factory methods for the token hashes emitted by tree walkers.
 # Every token is a Hash with a :type entry plus :name and/or :data.
 module TokenConstructor
    # Builds an error token whose :data is the message +msg+.
    def error(msg)
        {:type => "SerializeError", :data => msg}
    end
    # Converts an attribute collection into an array via +to_a+.
    def normalizeAttrs(attrs)
        attrs.to_a
    end
    # Builds an :EmptyTag token for the element +name+.
    # NOTE: when +hasChildren+ is true an error token is constructed but its
    # value is discarded; only the :EmptyTag token is returned to the caller.
    def emptyTag(name, attrs, hasChildren=false)
        error(_("Void element has children")) if hasChildren
        {:type => :EmptyTag, :name => name, :data => normalizeAttrs(attrs)}
    end
    # Builds a :StartTag token for the element +name+ with attributes +attrs+.
    def startTag(name, attrs)
        {:type => :StartTag, :name => name, :data => normalizeAttrs(attrs)}
    end
    # Builds an :EndTag token for the element +name+ (its :data is always empty).
    def endTag(name)
        {:type => :EndTag, :name => name, :data => []}
    end
    # Splits +data+ into up to three tokens and yields them in order:
    # leading whitespace (:SpaceCharacters), the remaining text (:Characters)
    # and trailing whitespace (:SpaceCharacters). Text consisting only of
    # whitespace yields a single :SpaceCharacters token.
    #
    # The patterns are anchored with \A and \z so that only whitespace at the
    # very start / very end of the string is split off. The line anchors ^ and $
    # also match around embedded newlines, which cut multi-line text apart
    # in the middle and dropped characters.
    # SPACE_CHARACTERS is not defined in this file (presumably html5lib/constants).
    def text(data)
        space = SPACE_CHARACTERS.join('')
        if data =~ /\A([#{space}]+)/
          leading = $1
          yield({:type => :SpaceCharacters, :data => leading})
          data = data[leading.length .. -1]
          return if data.empty?
        end
        if data =~ /([#{space}]+)\z/
          trailing = $1
          yield({:type => :Characters, :data => data[0 ... -trailing.length]})
          yield({:type => :SpaceCharacters, :data => trailing})
        else
          yield({:type => :Characters, :data => data})
        end
    end
    # Builds a :Comment token carrying the comment text +data+.
    def comment(data)
        {:type => :Comment, :data => data}
    end
    # Builds a :Doctype token; :data is true when +name+ is "HTML" in any case.
    def doctype(name)
        {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
    end
    # Builds an error token reporting a node type the walker cannot handle.
    def unknown(nodeType)
        error(_("Unknown node type: ") + nodeType.to_s)
    end
    # Message hook; currently returns +str+ unchanged.
    def _(str)
      str
    end
 end
 # Common superclass for tree walkers: stores the tree and mixes in the
 # token factory methods. Subclasses supply the actual iteration.
 class Base
    include TokenConstructor
    # Keeps +tree+ in @tree for the subclass's iteration code.
    def initialize(tree)
        @tree = tree
    end
    # Placeholder iteration; always raises NotImplementedError here.
    def each
        raise NotImplementedError
    end
    # +walk+ is bound to the +each+ defined in this class.
    alias_method :walk, :each
 end
 # Walker that traverses a tree iteratively (no recursion) and yields tokens.
 # Subclasses provide four navigation hooks: node_details, first_child,
 # next_sibling and parent; each of them raises NotImplementedError here.
 class NonRecursiveTreeWalker < TreeWalkers::Base
    # Hook: must return an Array whose first element is the node type
    # (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT or nil)
    # followed by the details that +each+ reads for that type.
    def node_details(node)
        raise NotImplementedError
    end
    # Hook: must return the first child of +node+, or nil.
    def first_child(node)
        raise NotImplementedError
    end
    # Hook: must return the sibling following +node+, or nil.
    def next_sibling(node)
        raise NotImplementedError
    end
    # Hook: must return the parent of +node+.
    def parent(node)
        raise NotImplementedError
    end
    # Yields the token stream for @tree in document order: a token on the way
    # into each node, and an :EndTag for non-void elements on the way out.
    def each
        currentNode = @tree
        while currentNode != nil
            details = node_details(currentNode)
            hasChildren = false
            # shift removes the type tag, leaving only the per-type details
            case details.shift
            when :DOCTYPE
                yield doctype(*details)
            when :TEXT
                text(*details) {|token| yield token}
            when :ELEMENT
                name, attributes, hasChildren = details
                if VOID_ELEMENTS.include?(name)
                    yield emptyTag(name, attributes.to_a, hasChildren)
                    # never descend into a void element, whatever the tree says
                    hasChildren = false
                else
                    yield startTag(name, attributes.to_a)
                end
            when :COMMENT
                yield comment(details[0])
            when :DOCUMENT, :DOCUMENT_FRAGMENT
                # containers emit no token of their own; just descend
                hasChildren = true
            when nil
                # ignore (REXML::XMLDecl is an example)
            else
                yield unknown(details[0])
            end
            firstChild = hasChildren ? first_child(currentNode) : nil
            if firstChild != nil
                currentNode = firstChild
            else
                # No child to descend into: close this node, then move to its
                # next sibling, climbing towards the root until one is found.
                while currentNode != nil
                    details = node_details(currentNode)
                    if details.shift == :ELEMENT
                        name, attributes, hasChildren = details
                        yield endTag(name) if !VOID_ELEMENTS.include?(name)
                    end
                    if @tree == currentNode
                        # back at the starting node: the walk is finished
                        currentNode = nil
                    else
                        nextSibling = next_sibling(currentNode)
                        if nextSibling != nil
                            currentNode = nextSibling
                            break
                        end
                        # no sibling left; the parent gets closed on the next pass
                        currentNode = parent(currentNode)
                    end
                end
            end
        end
    end
 end
 end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
@ -0,0 +1,48 @@
 require 'html5lib/treewalkers/base'
 require 'rexml/document'
 module HTML5lib
  module TreeWalkers
    module Hpricot
      # Supplies the navigation hooks of NonRecursiveTreeWalker for Hpricot nodes.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
        # Classifies +node+ by its Hpricot class and returns [type, *details].
        # Classes not listed here are reported as :UNKNOWN with the class name.
        def node_details(node)
          case node
          when ::Hpricot::Elem
            tag = node.name
            # an element without a name is reported as a fragment root
            return [:DOCUMENT_FRAGMENT] unless tag
            pairs = node.attributes.map { |key, val| [key, val] }
            [:ELEMENT, tag, pairs, !node.empty?]
          when ::Hpricot::Text
            [:TEXT, node.to_plain_text]
          when ::Hpricot::Comment
            [:COMMENT, node.content]
          when ::Hpricot::Doc
            [:DOCUMENT]
          when ::Hpricot::DocType
            [:DOCTYPE, node.target]
          when ::Hpricot::XMLDecl
            # a nil type tells the generic walker to skip this node
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end
        # First entry of the node's children.
        def first_child(node)
          kids = node.children
          kids.first
        end
        # Node that follows +node+, as reported by Hpricot's next_node.
        def next_sibling(node)
          node.next_node
        end
        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
@ -0,0 +1,48 @@
 require 'html5lib/treewalkers/base'
 require 'rexml/document'
 module HTML5lib
  module TreeWalkers
    module REXML
      # Supplies the navigation hooks of NonRecursiveTreeWalker for REXML nodes.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
        # Classifies +node+ by its REXML class and returns [type, *details].
        # ::REXML::Document is tested before ::REXML::Element on purpose:
        # the order of the branches decides which one a node lands in.
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            tag = node.name
            # an element without a name is reported as a fragment root
            return [:DOCUMENT_FRAGMENT] unless tag
            pairs = node.attributes.map { |key, val| [key, val] }
            has_content = node.has_elements? || node.has_text?
            [:ELEMENT, tag, pairs, has_content]
          when ::REXML::Text
            [:TEXT, node.value]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name]
          when ::REXML::XMLDecl
            # a nil type tells the generic walker to skip this node
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end
        # First entry of the node's children.
        def first_child(node)
          kids = node.children
          kids.first
        end
        # Sibling that follows +node+.
        def next_sibling(node)
          node.next_sibling
        end
        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
@ -0,0 +1,48 @@
 require 'html5lib/treewalkers/base'
 module HTML5lib
  module TreeWalkers
    module SimpleTree
      # Recursive walker over trees built by the simpletree tree builder.
      class TreeWalker < HTML5lib::TreeWalkers::Base
        include HTML5lib::TreeBuilders::SimpleTree
        # Yields the tokens for +node+ and, for non-void elements, for all of
        # its descendants. Document and DocumentFragment nodes yield nothing.
        def walk(node)
          case node
          when Document, DocumentFragment
            nil
          when DocumentType
            yield doctype(node.name)
          when TextNode
            text(node.value) { |piece| yield piece }
          when Element
            tag = node.name
            if VOID_ELEMENTS.include?(tag)
              yield emptyTag(tag, node.attributes, node.hasContent())
            else
              yield startTag(tag, node.attributes)
              node.childNodes.each do |kid|
                walk(kid) { |piece| yield piece }
              end
              yield endTag(tag)
            end
          when CommentNode
            yield comment(node.value)
          else
            # prints a marker to stdout before reporting the unknown node class
            puts '?'
            yield unknown(node.class)
          end
        end
        # Walks every child of the stored tree in order, yielding each token.
        def each
          @tree.childNodes.each do |kid|
            walk(kid) { |piece| yield piece }
          end
        end
      end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/parse.rb
+++ b/vendor/plugins/HTML5lib/parse.rb
@ -0,0 +1,137 @@
 #!/usr/bin/env ruby
 # 
 # Parse a document to a simpletree tree, with optional profiling
 # Put this script's directory and its lib/ subdirectory on the load path.
 # The lib path is built from the script location so the html5lib requires
 # resolve no matter which directory the script is started from.
 $:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
 # Parses the document named by the last entry of +args+ and prints the result.
 #
 # opts:: option object read for treebuilder, output, profile and time.
 # args:: remaining command-line arguments; the last one is the input.
 #
 # Writes a message to stderr and exits with status 1 when no input is given.
 def parse(opts, args)
  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end
  begin
    require 'open-uri' if source[0, 7] == 'http://'
    source = open(source)
  rescue
    # best effort: if it cannot be opened, the argument itself is handed to the parser
  end
  require 'html5lib/treebuilders'
  builder = HTML5lib::TreeBuilders[opts.treebuilder]
  parser =
    if opts.output == :xml
      require 'html5lib/liberalxmlparser'
      HTML5lib::XHTMLParser.new(:tree=>builder)
    else
      require 'html5lib/html5parser'
      HTML5lib::HTMLParser.new(:tree=>builder)
    end
  if opts.profile
    # profiling run: parse only, the profile report goes to stderr
    require 'profiler'
    Profiler__::start_profile
    parser.parse(source)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
    return
  end
  unless opts.time
    return printOutput(parser, parser.parse(source), opts)
  end
  require 'time'
  started = Time.new
  document = parser.parse(source)
  parsed = Time.new
  printOutput(parser, document, opts)
  printed = Time.new
  puts "\n\nRun took: %fs (plus %fs to print the output)"%[parsed-started, printed-parsed]
 end
 # Prints +document+ in the format selected by opts.output (:xml, :html,
 # :hilite or :tree; anything else prints nothing), preceded by the detected
 # encoding when opts.encoding is set. When opts.error is set, the parser's
 # errors are written to stderr, one "Line L Col C message" entry per line.
 def printOutput(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end
  mode = opts.output
  if mode == :xml
    print document
  elsif mode == :html
    require 'html5lib/treewalkers'
    walker = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
    require 'html5lib/serializer'
    print HTML5lib::HTMLSerializer.serialize(walker, :encoding=>'utf-8')
  elsif mode == :hilite
    print document.hilite
  elsif mode == :tree
    print parser.tree.testSerializer(document)
  end
  return unless opts.error
  # each error is a [position, message] pair; position fills the two %i slots
  report = parser.errors.map do |pos, message|
    ("Line %i Col %i" % pos) + " " + message
  end
  $stderr.write("\nParse errors:\n" + report.join("\n")+"\n")
 end
 require 'ostruct'
 require 'optparse'
 # Defaults for every command-line switch; the handlers below overwrite them.
 options = OpenStruct.new(
  :profile => false,
  :time => false,
  :output => :tree,
  :treebuilder => 'simpletree',
  :error => false,
  :encoding => false
 )
 cli = OptionParser.new do |switches|
  switches.on("-p", "--[no-]profile", "Profile the run") { |flag| options.profile = flag }
  switches.on("-t", "--[no-]time", "Time the run") { |flag| options.time = flag }
  # --no-tree clears the output mode, so nothing is printed
  switches.on("--[no-]tree", "Do not print output tree") do |flag|
    options.output = flag ? :tree : nil
  end
  switches.on("-b", "--treebuilder NAME") { |builder_name| options.treebuilder = builder_name }
  switches.on("-e", "--error", "Print a list of parse errors") { |flag| options.error = flag }
  # XML output also switches the tree builder to rexml
  switches.on("-x", "--xml", "output as xml") do |_flag|
    options.output = :xml
    options.treebuilder = "rexml"
  end
  switches.on("--html", "Output as html") { |_flag| options.output = :html }
  switches.on("--hilite", "Output as formatted highlighted code.") { |_flag| options.output = :hilite }
  switches.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| options.encoding = flag }
  switches.on_tail("-h", "--help", "Show this message") do
    puts switches
    exit
  end
 end
 # parse! removes the recognised switches from ARGV before it is passed on
 cli.parse!(ARGV)
 parse(options, ARGV)
--- a/vendor/plugins/HTML5lib/tests/preamble.rb
+++ b/vendor/plugins/HTML5lib/tests/preamble.rb
@ -21,3 +21,53 @@ rescue LoadError
    end
  end
 end
 module HTML5lib
  # Helpers shared by the test cases that read the .dat test files.
  module TestSupport
    # True when string +b+ begins with the prefix +a+ (note the argument order).
    def self.startswith?(a, b)
      b[0... a.length] == a
    end
    # Splits one test case into its sections.
    #
    # Returns [innerHTML, input, output, errors]: innerHTML is the text after
    # "#document-fragment " (nil for a whole-document test), input and output
    # are newline-joined strings, errors is an Array of lines. Empty lines and
    # unrecognised "#errors…"/"#document…"/"#data…" lines are dropped.
    #
    # Raises ArgumentError when "#document-fragment" carries no context.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input
      data.split(/\n/).each do |line|
        # "#document" as a prefix also covers "#document-fragment"
        marker = startswith?("#errors", line) ||
                 startswith?("#document", line) ||
                 startswith?("#data", line)
        if !line.empty? && !marker
          # Identity check: input, output and errors all start out as equal
          # (empty) arrays, so == would also be true while filling the others.
          if currentList.equal?(output) && startswith?("|", line)
            # drop the leading "| " of a tree dump line
            currentList.push(line[2..-1])
          else
            currentList.push(line)
          end
        elsif line == "#errors"
          currentList = errors
        elsif line == "#document" || startswith?("#document-fragment", line)
          if startswith?("#document-fragment", line)
            innerHTML = line[19..-1]
            raise ArgumentError, "#document-fragment without a context element" unless innerHTML
          end
          currentList = output
        end
      end
      return innerHTML, input.join("\n"), output.join("\n"), errors
    end
    # Converts a tree dump to the format used in the test cases: the first
    # line is dropped and, on lines starting with "|", the first three
    # characters are removed.
    def convertTreeDump(treedump)
      treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end
    # Sorts each run of consecutive, equally indented name=value lines.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
         match.split("\n").sort.join("\n")
      end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_encoding.rb
+++ b/vendor/plugins/HTML5lib/tests/test_encoding.rb
@ -4,33 +4,33 @@ require 'html5lib/inputstream'
 class Html5EncodingTestCase < Test::Unit::TestCase
-begin
+  begin
    require 'rubygems'
    require 'UniversalDetector'
    def test_chardet
-        File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
+      File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
-            stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
+        stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
-            assert_equal 'big5', stream.char_encoding.downcase
+        assert_equal 'big5', stream.char_encoding.downcase
-        end
+      end
    end
-rescue LoadError
+  rescue LoadError
    puts "chardet not found, skipping chardet tests"
-end
+  end
-    html5lib_test_files('encoding').each do |test_file|        
+  html5lib_test_files('encoding').each do |test_file|        
-        test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
+    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
-        File.read(test_file).split("#data\n").each_with_index do |data, index|
+    File.read(test_file).split("#data\n").each_with_index do |data, index|
-            next if data.empty?
+      next if data.empty?
-            input, encoding = data.split(/\n#encoding\s+/, 2)
+      input, encoding = data.split(/\n#encoding\s+/, 2)
-            encoding = encoding.split[0]
+      encoding = encoding.split[0]
-            define_method 'test_%s_%d' % [ test_name, index + 1 ] do
+      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
-                stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
+        stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
-                assert_equal encoding.downcase, stream.char_encoding.downcase, input
+        assert_equal encoding.downcase, stream.char_encoding.downcase, input
-            end
+      end
        end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/test_parser.rb
@ -14,53 +14,12 @@ end
 $CHECK_PARSER_ERRORS = false
-puts 'Testing: ' + $tree_types_to_test * ', '
+puts 'Testing tree builders: ' + $tree_types_to_test * ', '
 class Html5ParserTestCase < Test::Unit::TestCase
-
+  include HTML5lib
-  def self.startswith?(a, b)
+  include TestSupport
    b[0... a.length] == a
  end
  def self.parseTestcase(data)
    innerHTML = nil
    input = []
    output = []
    errors = []
    currentList = input
    data.split(/\n/).each do |line|
      if !line.empty? and !startswith?("#errors", line) and
        !startswith?("#document", line) and
        !startswith?("#data", line) and
        !startswith?("#document-fragment", line)
        if currentList == output and startswith?("|", line)
          currentList.push(line[2..-1])
        else
          currentList.push(line)
        end
      elsif line == "#errors"
        currentList = errors
      elsif line == "#document" or startswith?("#document-fragment", line)
        if startswith?("#document-fragment", line)
          innerHTML = line[19..-1]
          raise AssertionError unless innerHTML
        end
        currentList = output
      end
    end
    return innerHTML, input.join("\n"), output.join("\n"), errors
  end
  # convert the output of str(document) to the format used in the testcases
  def convertTreeDump(treedump)
    treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
  end
  def sortattrs(output)
    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
  end
  html5lib_test_files('tree-construction').each do |test_file|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?
-      innerHTML, input, expected_output, expected_errors = parseTestcase(data)
+      innerHTML, input, expected_output, expected_errors =
        TestSupport.parseTestcase(data)
      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
-          parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
+          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
          if innerHTML
            parser.parseFragment(input, innerHTML)
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@ -2,9 +2,11 @@
 require File.join(File.dirname(__FILE__), 'preamble')
 require 'html5lib/sanitizer'
 require 'html5lib/html5parser'
 require 'html5lib/liberalxmlparser'
 require 'html5lib/treewalkers'
 require 'html5lib/serializer'
 require 'html5lib/sanitizer'
 class SanitizeTest < Test::Unit::TestCase
  include HTML5lib
--- a/vendor/plugins/HTML5lib/tests/test_serializer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_serializer.rb
@ -0,0 +1,52 @@
 require File.join(File.dirname(__FILE__), 'preamble')
 require 'html5lib/html5parser'
 require 'html5lib/serializer'
 require 'html5lib/treewalkers'
 #Run the serialize error checks
 checkSerializeErrors = false
 # Walker over the token arrays found in the serializer test files.
 # Each entry of the tree is an array: [type, value] or [type, name, attributes].
 class JsonWalker < HTML5lib::TreeWalkers::Base
  # Converts every test-file token into a walker token and yields it.
  # Raises ArgumentError for a token type that is not listed below.
  def each
    @tree.each do |token|
      kind = token[0]
      case kind
      when 'StartTag'
        yield startTag(token[1], token[2])
      when 'EndTag'
        yield endTag(token[1])
      when 'EmptyTag'
        yield emptyTag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        # text may split the string into separate space/character tokens
        text(token[1]) {|textToken| yield textToken}
      when 'Doctype'
        yield doctype(token[1])
      else
        raise ArgumentError, "Unknown token type: #{kind}"
      end
    end
  end
 end
 # Defines one test method per entry of every serializer test file.
 class Html5SerializeTestcase < Test::Unit::TestCase
  html5lib_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again; open(...).read left the handle open
    tests = JSON::parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
        # a single expectation must match exactly; otherwise any listed one is accepted
        if expected.length == 1
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end
      end
    end
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_stream.rb
+++ b/vendor/plugins/HTML5lib/tests/test_stream.rb
@ -0,0 +1,54 @@
 require File.join(File.dirname(__FILE__), 'preamble')
 require 'html5lib/inputstream'
 # Checks encoding detection, character reading and position tracking of HTMLInputStream.
 class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib
  def test_char_ascii
    stream = HTMLInputStream.new("'")
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end
  # a NUL byte is read back as the bytes EF BF BD
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end
  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end
  def test_char_win1252
    stream = HTMLInputStream.new("\x91")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end
  # the leading EF BB BF bytes are not returned as a character
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end
  def test_utf_16
    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
    # assert(value, message) passed for any non-nil encoding; compare instead
    assert_equal('utf-16-le', stream.char_encoding)
    assert_equal(1025, stream.chars_until(' ',true).length)
  end
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
    assert_equal([3,1], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
    assert_equal([4,5], stream.position)
    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_treewalkers.rb
+++ b/vendor/plugins/HTML5lib/tests/test_treewalkers.rb
@ -0,0 +1,110 @@
 require File.join(File.dirname(__FILE__), 'preamble')
 require 'html5lib/html5parser'
 require 'html5lib/treewalkers'
 require 'html5lib/treebuilders'
 # Tree flavours exercised by the walker tests: each name maps to the tree
 # builder that constructs the tree and the tree walker that reads it back.
 # The hpricot entry is commented out, so that flavour is not tested.
 $tree_types_to_test = {
  'simpletree' =>
    {:builder => HTML5lib::TreeBuilders['simpletree'],
     :walker  => HTML5lib::TreeWalkers['simpletree']},
  'rexml' =>
    {:builder => HTML5lib::TreeBuilders['rexml'],
     :walker  => HTML5lib::TreeWalkers['rexml']},
 # 'hpricot' =>
 #   {:builder => HTML5lib::TreeBuilders['hpricot'],
 #    :walker  => HTML5lib::TreeWalkers['hpricot']},
 }
 # Announce which flavours are about to be tested.
 puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
 # Parses every tree-construction test case with each configured builder,
 # walks the resulting tree and compares the rendered tokens with the
 # expected tree dump.
 class TestTreeWalkers < Test::Unit::TestCase
  include HTML5lib::TestSupport
  # Yields the tokens of +tokens+ with every run of adjacent :Characters /
  # :SpaceCharacters tokens merged into a single :Characters token.
  def concatenateCharacterTokens(tokens)
    pending = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if pending.nil?
          pending = {:type => :Characters, :data => token[:data]}
        else
          # += builds a new string, so the source token's data is left untouched
          pending[:data] += token[:data]
        end
      else
        unless pending.nil?
          yield pending
          pending = nil
        end
        yield token
      end
    end
    yield pending unless pending.nil?
  end
  # Renders a token stream as an indented tree dump (two spaces per level),
  # with attributes sorted and xmlns attributes left out.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # an empty tag has no end tag to undo the indent
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    return output.join("\n")
  end
  html5lib_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?
      innerHTML, input, expected_output, expected_errors =
        HTML5lib::TestSupport::parseTestcase(data)
      $tree_types_to_test.each do |tree_name, treeClass|
        define_method "test_#{test_name}_#{index}_#{tree_name}" do
          parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])
          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end
          document = parser.tree.getDocument
          begin
            output = sortattrs(convertTokens(treeClass[:walker].new(document)))
            expected = sortattrs(expected_output)
            errorMsg = "\n\nExpected:\n#{expected}\nReceived:\n#{output}\n"
            assert_equal(expected, output, errorMsg)
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
   end
  end
 end
--- a/vendor/plugins/maruku/lib/maruku/output/to_html.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/to_html.rb
@ -154,6 +154,21 @@ Example:
 	CSS: style.css math.css
 =end
 	# Render to an HTML fragment as a tree instead of a string.
 	#
 	# The rendered children, followed by the footnotes when the document has
 	# any, are collected in a <div> element. That element is added to a new
 	# Document created with :respect_whitespace => :all, and the value of
 	# that `<<` call is returned.
 	# NOTE(review): Element and Document are presumably the REXML classes - confirm.
 	def to_html_tree
 		wrapper = Element.new('div')
 		children_to_html.each { |child| wrapper << child }
 		# render footnotes
 		wrapper << render_footnotes if @doc.footnotes_order.size > 0
 		tree = Document.new(nil, {:respect_whitespace => :all})
 		tree << wrapper
 	end
 	# Render to a complete HTML document (returns a REXML document tree)
 	def to_html_document_tree