HTML5lib Sanitizer

Replaced native Sanitizer with HTML5lib version. Synced with latest Maruku.
2007-05-25 20:52:27 -05:00 · 2007-05-25 20:52:27 -05:00 · 6b21ac484f
commit 6b21ac484f
parent 457ec8627c
36 changed files with 6534 additions and 215 deletions
--- a/app/controllers/wiki_controller.rb
+++ b/app/controllers/wiki_controller.rb
@ -294,13 +294,13 @@ class WikiController < ApplicationController

  def s5
    if @web.markup == :markdownMML
-      @s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+      @s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
           {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
-            :author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
+            :author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
    elsif @web.markup == :markdown
-      @s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+      @s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
           {:math_enabled => false, :content_only => true,
-            :author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
+            :author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
    else
      @s5_content = "S5 not supported with this text filter"
    end
--- a/attic/lib/node.rb
+++ b/attic/lib/node.rb
--- a/attic/lib/sanitize.rb
+++ b/attic/lib/sanitize.rb
@ -0,0 +1,207 @@
+module Sanitize
+
+# This module provides sanitization of XHTML+MathML+SVG
+# and of inline style attributes.
+#
+# Based heavily on Sam Ruby's code in the Universal FeedParser.
+
+  require 'html/tokenizer'
+  require 'node'
+
+  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
+      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
+      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
+      'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+      'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
+      'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
+      'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
+      'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
+      'ul', 'var']
+      
+  mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
+      'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
+      'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
+      'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+      'munderover', 'none']
+      
+  svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
+      'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
+      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
+      'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
+      'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
+      
+  acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
+      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
+      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
+      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
+      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
+      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
+      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+      'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
+      'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
+
+
+  mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+      'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+      'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
+      'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
+      'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
+      'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
+      'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+      'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
+      'xlink:type', 'xmlns', 'xmlns:xlink']
+
+      
+  svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+       'arabic-form', 'ascent', 'attributeName', 'attributeType',
+       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
+       'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
+       'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 
+       'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
+       'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
+       'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
+       'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
+       'offset', 'opacity', 'orient', 'origin', 'overline-position',
+       'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
+       'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
+       'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
+       'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
+       'strikethrough-position', 'strikethrough-thickness', 'stroke',
+       'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
+       'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
+       'stroke-width', 'systemLanguage', 'target',
+       'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+       'underline-position', 'underline-thickness', 'unicode',
+       'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
+       'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
+       'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
+       'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
+       'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+
+  attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
+  
+  acceptable_css_properties = ['azimuth', 'background-color',
+      'border-bottom-color', 'border-collapse', 'border-color',
+      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+      'white-space', 'width']
+
+  acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+      'transparent', 'underline', 'white', 'yellow']
+
+  acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+      'stroke-opacity']
+
+  acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
+      'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
+      'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
+      'ssh', 'sftp', 'rtsp', 'afs' ]
+
+      ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements  unless defined?(ALLOWED_ELEMENTS)
+      ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
+      ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
+      ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
+      ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
+      ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
+      ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
+
+      # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
+      # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
+      # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
+      # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
+      # ALLOWED_PROTOCOLS are allowed.
+      # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. 
+      #
+      #   sanitize_html('<script> do_nasty_stuff() </script>')
+      #    => &lt;script> do_nasty_stuff() &lt;/script>
+      #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+      #    => <a>Click here for $100</a>
+      def sanitize_html(html)
+        if html.index("<")
+          tokenizer = HTML::Tokenizer.new(html)
+          new_text = ""
+
+          while token = tokenizer.next
+            node = XHTML::Node.parse(nil, 0, 0, token, false)
+            new_text << case node.tag?
+              when true
+                if ALLOWED_ELEMENTS.include?(node.name)
+                  if node.closing != :close
+                    node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
+                    ATTR_VAL_IS_URI.each do |attr|
+                      val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
+                      if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) 
+                        node.attributes.delete attr 
+                      end
+                    end
+                    if node.attributes['style']
+                      node.attributes['style'] = sanitize_css(node.attributes['style']) 
+                    end
+                  end
+                  node.to_s
+                else
+                  node.to_s.gsub(/</, "&lt;")
+                end
+              else
+                node.to_s.gsub(/</, "&lt;")
+            end
+          end
+
+          html = new_text
+        end
+        html
+      end
+      
+      def sanitize_css(style)
+          # disallow urls
+          style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+          # gauntlet
+          if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
+             style = ''
+             return style
+          end
+          if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
+             style = ''
+             return style
+          end
+
+          clean = []
+          style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
+            if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
+              clean <<  prop + ': ' + val + ';'
+            elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase) 
+              goodval = true
+              val.split().each do |keyword|
+                if !ALLOWED_CSS_KEYWORDS.include?(keyword) and 
+                   keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+                  goodval = false
+                end
+              end
+              if goodval 
+                clean <<  prop + ': ' + val + ';'
+              end
+            elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
+               clean <<  prop + ': ' + val + ';'
+            end
+          end
+
+          style = clean.join(' ')
+      end
+end      
--- a/attic/test/unit/sanitize_test.rb
+++ b/attic/test/unit/sanitize_test.rb
@ -0,0 +1,187 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
+require 'sanitize'
+
+class SanitizeTest < Test::Unit::TestCase
+  include Sanitize
+
+  def setup
+
+  end
+
+  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_allow_#{tag_name}_tag" do
+      assert_equal "<#{tag_name} title=\"1\">foo &lt;bad>bar&lt;/bad> baz</#{tag_name}>",
+        sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+    end
+  end
+
+  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+      assert_equal "&lt;#{tag_name.upcase} title=\"1\">foo &lt;bad>bar&lt;/bad> baz&lt;/#{tag_name.upcase}>",
+        sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
+    end
+  end
+
+  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    if attribute_name != 'style'
+      define_method "test_should_allow_#{attribute_name}_attribute" do
+        assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad>bar&lt;/bad> baz</p>",
+          sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
+      end
+    end
+  end
+
+  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+      assert_equal "<p>foo &lt;bad>bar&lt;/bad> baz</p>",
+        sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
+    end
+  end
+
+  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol}">foo</a>))
+    end
+  end
+
+  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
+    end
+  end
+
+  def test_should_allow_anchors
+    assert_equal "<a href=\"foo\">&lt;script>baz&lt;/script></a>",
+     sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
+  end
+
+  # RFC 3986, sec 4.2
+  def test_allow_colons_in_path_component
+    assert_equal "<a href=\"./this:that\">foo</a>",
+      sanitize_html("<a href=\"./this:that\">foo</a>")
+  end
+
+  %w(src width height alt).each do |img_attr|
+    define_method "test_should_allow_image_#{img_attr}_attribute" do
+      assert_equal "<img #{img_attr}=\"foo\" />",
+        sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
+    end
+  end
+
+  def test_should_handle_non_html
+    assert_equal 'abc',  sanitize_html("abc")
+  end
+
+  def test_should_handle_blank_text
+    assert_equal '', sanitize_html('')
+  end
+
+  [%w(img src), %w(a href)].each do |(tag, attr)|
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
+      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
+    end
+  end
+
+  [%w(img src), %w(a href)].each do |(tag, attr)|
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
+      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
+    end
+  end
+
+  [%(<img src="javascript:alert('XSS');" />), 
+   %(<img src=javascript:alert('XSS') />), 
+   %(<img src="JaVaScRiPt:alert('XSS')" />), 
+   %(<img src='javascript:alert(&quot;XSS&quot;)' />),
+   %(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
+   %(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
+   %(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
+   %(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
+   %(<img src="jav\tascript:alert('XSS');" />),
+   %(<img src="jav&#x09;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0A;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0D;ascript:alert('XSS');" />),
+   %(<img src=" &#14;  javascript:alert('XSS');" />),
+   %(<img src="&#x20;javascript:alert('XSS');" />),
+   %(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
+    define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
+      assert_equal "<img />", sanitize_html(img_hack)
+    end
+  end
+
+  def test_should_sanitize_tag_broken_up_by_null
+    assert_equal "&lt;scr>alert(\"XSS\")&lt;/scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
+  end
+  
+  def test_should_sanitize_invalid_script_tag
+    assert_equal "&lt;script />&lt;/script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
+  end
+  
+  def test_should_sanitize_script_tag_with_multiple_open_brackets
+    assert_equal "&lt;&lt;script>alert(\"XSS\");//&lt;&lt;/script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
+    assert_equal %(&lt;iframe src="http:" />&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
+  end
+  
+  def test_should_sanitize_unclosed_script
+    assert_equal "&lt;script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
+  end
+  
+  def test_should_sanitize_half_open_scripts
+    assert_equal  "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
+  end
+  
+  def test_should_not_fall_for_ridiculous_hack
+    img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
+    assert_equal "<img />", sanitize_html(img_hack)
+  end
+
+  def test_platypus
+    assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
+       sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
+  end
+
+  def test_xul
+    assert_equal %(<p style="">fubar</p>),
+     sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
+  end
+
+  def test_input_image
+    assert_equal %(<input type="image" />),
+      sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
+  end
+
+  def test_non_alpha_non_digit
+    assert_equal "&lt;script />&lt;/script>",
+      sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
+    assert_equal "<a>foo</a>",
+      sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
+    assert_equal "<img />",
+      sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
+  end
+
+  def test_img_dynsrc_lowsrc
+     assert_equal "<img />",
+       sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
+     assert_equal "<img />",
+       sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
+  end
+
+  def test_div_background_image_unicode_encoded
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
+  end
+
+  def test_div_expression
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
+  end
+
+  def test_img_vbscript
+     assert_equal '<img />',
+       sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
+  end
+
+end
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -32,7 +32,7 @@ module Engines
      redcloth.filter_html = false
      redcloth.no_span_caps = false  
      html = redcloth.to_html(:textile)
-      sanitize_html(html)
+      sanitize_xhtml(html)
    end
  end

@ -43,7 +43,7 @@ module Engines
      require_dependency 'maruku'
      require_dependency 'maruku/ext/math'
      html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
-      sanitize_html(html).to_ncr
+      sanitize_xhtml(html.to_ncr)
    end
  end

@ -55,7 +55,7 @@ module Engines
      require_dependency 'maruku/ext/math'
      html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
-      sanitize_html(html).to_ncr
+      sanitize_xhtml(html.to_ncr)
    end
  end

@ -68,7 +68,7 @@ module Engines
      redcloth.filter_html = false
      redcloth.no_span_caps = false
      html = redcloth.to_html
-      sanitize_html(html)
+      sanitize_xhtml(html)
    end
  end

@ -78,7 +78,7 @@ module Engines
    def mask
      require_dependency 'rdocsupport'
      html = RDocSupport::RDocFormatter.new(@content).to_html
-      sanitize_html(html)
+      sanitize_xhtml(html)
    end
  end

--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -1,207 +1,26 @@
 module Sanitize

-# This module provides sanitization of XHTML+MathML+SVG
+# This module provides sanitization of XHTML+MathML+SVG 
 # and of inline style attributes.
 #
-# Based heavily on Sam Ruby's code in the Universal FeedParser.
-
-  require 'html/tokenizer'
-  require 'node'
-
-  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
-      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
-      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
-      'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-      'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
-      'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
-      'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
-      'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
-      'ul', 'var']
-      
-  mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
-      'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
-      'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
-      'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
-      'munderover', 'none']
-      
-  svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
-      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
-      'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
-      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
-      'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
-      'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
-      
-  acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
-      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
-      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
-      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
-      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
-      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
-      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
-      'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
+# Uses the HTML5lib parser, so parsing behaviour should
+# resemble that of browsers.
+#
+#  sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
+#  sanitize_html() is a case-insensitive sanitizer suitable for HTML


-  mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
-      'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
-      'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
-      'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
-      'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
-      'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
-      'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
-      'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
-      'xlink:type', 'xmlns', 'xmlns:xlink']
+  require 'html5lib/sanitizer'
+  require 'html5lib/html5parser'
+  require 'html5lib/liberalxmlparser'
+  include HTML5lib

-      
-  svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
-       'arabic-form', 'ascent', 'attributeName', 'attributeType',
-       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
-       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
-       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
-       'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
-       'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 
-       'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
-       'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
-       'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
-       'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
-       'offset', 'opacity', 'orient', 'origin', 'overline-position',
-       'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
-       'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
-       'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
-       'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
-       'strikethrough-position', 'strikethrough-thickness', 'stroke',
-       'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
-       'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
-       'stroke-width', 'systemLanguage', 'target',
-       'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
-       'underline-position', 'underline-thickness', 'unicode',
-       'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
-       'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
-       'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
-       'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
-       'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+  def sanitize_xhtml(html)
+    XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
+  end

-  attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
-  
-  acceptable_css_properties = ['azimuth', 'background-color',
-      'border-bottom-color', 'border-collapse', 'border-color',
-      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
-      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
-      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
-      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
-      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
-      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
-      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
-      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
-      'white-space', 'width']
+  def sanitize_html(html)
+    HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
+  end

-  acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
-      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
-      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
-      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
-      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
-      'transparent', 'underline', 'white', 'yellow']
-
-  acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
-      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
-      'stroke-opacity']
-
-  acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
-      'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
-      'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-      'ssh', 'sftp', 'rtsp', 'afs' ]
-
-      ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements  unless defined?(ALLOWED_ELEMENTS)
-      ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
-      ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
-      ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
-      ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
-      ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
-      ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
-
-      # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
-      # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
-      # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
-      # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
-      # ALLOWED_PROTOCOLS are allowed.
-      # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. 
-      #
-      #   sanitize_html('<script> do_nasty_stuff() </script>')
-      #    => &lt;script> do_nasty_stuff() &lt;/script>
-      #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
-      #    => <a>Click here for $100</a>
-      def sanitize_html(html)
-        if html.index("<")
-          tokenizer = HTML::Tokenizer.new(html)
-          new_text = ""
-
-          while token = tokenizer.next
-            node = XHTML::Node.parse(nil, 0, 0, token, false)
-            new_text << case node.tag?
-              when true
-                if ALLOWED_ELEMENTS.include?(node.name)
-                  if node.closing != :close
-                    node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
-                    ATTR_VAL_IS_URI.each do |attr|
-                      val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
-                      if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) 
-                        node.attributes.delete attr 
-                      end
-                    end
-                    if node.attributes['style']
-                      node.attributes['style'] = sanitize_css(node.attributes['style']) 
-                    end
-                  end
-                  node.to_s
-                else
-                  node.to_s.gsub(/</, "&lt;")
-                end
-              else
-                node.to_s.gsub(/</, "&lt;")
-            end
-          end
-
-          html = new_text
-        end
-        html
-      end
-      
-      def sanitize_css(style)
-          # disallow urls
-          style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
-          # gauntlet
-          if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
-             style = ''
-             return style
-          end
-          if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
-             style = ''
-             return style
-          end
-
-          clean = []
-          style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
-            if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
-              clean <<  prop + ': ' + val + ';'
-            elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase) 
-              goodval = true
-              val.split().each do |keyword|
-                if !ALLOWED_CSS_KEYWORDS.include?(keyword) and 
-                   keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
-                  goodval = false
-                end
-              end
-              if goodval 
-                clean <<  prop + ': ' + val + ';'
-              end
-            elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
-               clean <<  prop + ': ' + val + ';'
-            end
-          end
-
-          style = clean.join(' ')
-      end
-end      
+end
--- a/vendor/plugins/HTML5lib/README
+++ b/vendor/plugins/HTML5lib/README
@ -0,0 +1,9 @@
+= HTML5lib
+
+== Basic Usage
+
+    require 'html5lib'
+
+    doc = HTML5lib.parse('<html>...</html>')
+
+    doc.class # REXML::Document
--- a/vendor/plugins/HTML5lib/Rakefile.rb
+++ b/vendor/plugins/HTML5lib/Rakefile.rb
@ -0,0 +1,7 @@
+require 'rake'
+require 'rake/testtask'
+
+# Register a Rake test task covering every tests/test_*.rb file, with
+# verbose output switched on.
+Rake::TestTask.new do |t|
+	t.pattern = 'tests/test_*.rb'
+	t.verbose = true
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib.rb
@ -0,0 +1,11 @@
+require 'html5lib/html5parser'
+
+# Top-level convenience entry points for the HTML5lib parser.
+module HTML5lib
+    # Parses stream by delegating to HTMLParser.parse, forwarding options
+    # unchanged.
+    def self.parse(stream, options={})
+        HTMLParser.parse(stream, options)
+    end
+
+    # Entry point named for fragment parsing.
+    # NOTE(review): this delegates to HTMLParser.parse, exactly as
+    # HTML5lib.parse does; it presumably should call a fragment-specific
+    # HTMLParser method instead -- confirm against html5parser.rb.
+    def self.parseFragment(stream, options={})
+        HTMLParser.parse(stream, options)
+    end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@ -0,0 +1,676 @@
+module HTML5lib
+
+# Raised to signal that a read was attempted at or beyond the end of the data.
+# NOTE(review): this derives from Exception rather than StandardError, so a
+# bare `rescue` will not catch it; handlers have to name EOF explicitly.
+class EOF < Exception; end
+
+CONTENT_MODEL_FLAGS = [
+    :PCDATA,
+    :RCDATA,
+    :CDATA,
+    :PLAINTEXT
+]
+
+SCOPING_ELEMENTS = %w[
+    button
+    caption
+    html
+    marquee
+    object
+    table
+    td
+    th
+]
+
+FORMATTING_ELEMENTS = %w[
+    a
+    b
+    big
+    em
+    font
+    i
+    nobr
+    s
+    small
+    strike
+    strong
+    tt
+    u
+]
+
+SPECIAL_ELEMENTS = %w[
+    address
+    area
+    base
+    basefont
+    bgsound
+    blockquote
+    body
+    br
+    center
+    col
+    colgroup
+    dd
+    dir
+    div
+    dl
+    dt
+    embed
+    fieldset
+    form
+    frame
+    frameset
+    h1
+    h2
+    h3
+    h4
+    h5
+    h6
+    head
+    hr
+    iframe
+    image
+    img
+    input
+    isindex
+    li
+    link
+    listing
+    menu
+    meta
+    noembed
+    noframes
+    noscript
+    ol
+    optgroup
+    option
+    p
+    param
+    plaintext
+    pre
+    script
+    select
+    spacer
+    style
+    tbody
+    textarea
+    tfoot
+    thead
+    title
+    tr
+    ul
+    wbr
+]
+
+# Whitespace characters. %W (capital W) is used so that the backslash
+# escapes below are interpreted, giving an Array of one-character strings.
+SPACE_CHARACTERS = %W[
+    \t
+    \n
+    \x0B
+    \x0C
+    \x20
+    \r
+]
+
+TABLE_INSERT_MODE_ELEMENTS = %w[
+    table
+    tbody
+    tfoot
+    thead
+    tr
+]
+
+# Character classes. Note the mixed container types: the ASCII_* constants
+# are Strings, DIGITS is a Range of one-character strings, and HEX_DIGITS is
+# an Array of one-character strings.
+ASCII_LOWERCASE = ('a'..'z').to_a.join('')
+ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
+ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
+DIGITS = '0'..'9'
+HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
+
+# Heading elements need to be ordered 
+HEADING_ELEMENTS = %w[
+    h1
+    h2
+    h3
+    h4
+    h5
+    h6
+]
+
+# XXX What about event-source and command?
+VOID_ELEMENTS = %w[
+    base
+    link
+    meta
+    hr
+    br
+    img
+    embed
+    param
+    area
+    col
+    input
+]
+
+# entitiesWindows1252 has to be _ordered_ and needs to have an index.
+ENTITIES_WINDOWS1252 = [
+    8364,  # 0x80  0x20AC  EURO SIGN
+    65533, # 0x81          UNDEFINED
+    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
+    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
+    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
+    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
+    8224,  # 0x86  0x2020  DAGGER
+    8225,  # 0x87  0x2021  DOUBLE DAGGER
+    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
+    8240,  # 0x89  0x2030  PER MILLE SIGN
+    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
+    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
+    65533, # 0x8D          UNDEFINED
+    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
+    65533, # 0x8F          UNDEFINED
+    65533, # 0x90          UNDEFINED
+    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
+    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
+    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
+    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
+    8226,  # 0x95  0x2022  BULLET
+    8211,  # 0x96  0x2013  EN DASH
+    8212,  # 0x97  0x2014  EM DASH
+    732,   # 0x98  0x02DC  SMALL TILDE
+    8482,  # 0x99  0x2122  TRADE MARK SIGN
+    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
+    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
+    65533, # 0x9D          UNDEFINED
+    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
+    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
+]
+
+private
+
+  # Encodes the integer code point n as a UTF-8 string (Array#pack 'U').
+  # NOTE(review): the surrounding private/public markers do not apply to a
+  # method defined with `def self.`, so U stays publicly callable.
+  def self.U n
+    [n].pack('U')
+  end
+
+public
+
+ENTITIES = {
+    "AElig" => U(0xC6),
+    "Aacute" => U(0xC1),
+    "Acirc" => U(0xC2),
+    "Agrave" => U(0xC0),
+    "Alpha" => U(0x0391),
+    "Aring" => U(0xC5),
+    "Atilde" => U(0xC3),
+    "Auml" => U(0xC4),
+    "Beta" => U(0x0392),
+    "Ccedil" => U(0xC7),
+    "Chi" => U(0x03A7),
+    "Dagger" => U(0x2021),
+    "Delta" => U(0x0394),
+    "ETH" => U(0xD0),
+    "Eacute" => U(0xC9),
+    "Ecirc" => U(0xCA),
+    "Egrave" => U(0xC8),
+    "Epsilon" => U(0x0395),
+    "Eta" => U(0x0397),
+    "Euml" => U(0xCB),
+    "Gamma" => U(0x0393),
+    "Iacute" => U(0xCD),
+    "Icirc" => U(0xCE),
+    "Igrave" => U(0xCC),
+    "Iota" => U(0x0399),
+    "Iuml" => U(0xCF),
+    "Kappa" => U(0x039A),
+    "Lambda" => U(0x039B),
+    "Mu" => U(0x039C),
+    "Ntilde" => U(0xD1),
+    "Nu" => U(0x039D),
+    "OElig" => U(0x0152),
+    "Oacute" => U(0xD3),
+    "Ocirc" => U(0xD4),
+    "Ograve" => U(0xD2),
+    "Omega" => U(0x03A9),
+    "Omicron" => U(0x039F),
+    "Oslash" => U(0xD8),
+    "Otilde" => U(0xD5),
+    "Ouml" => U(0xD6),
+    "Phi" => U(0x03A6),
+    "Pi" => U(0x03A0),
+    "Prime" => U(0x2033),
+    "Psi" => U(0x03A8),
+    "Rho" => U(0x03A1),
+    "Scaron" => U(0x0160),
+    "Sigma" => U(0x03A3),
+    "THORN" => U(0xDE),
+    "Tau" => U(0x03A4),
+    "Theta" => U(0x0398),
+    "Uacute" => U(0xDA),
+    "Ucirc" => U(0xDB),
+    "Ugrave" => U(0xD9),
+    "Upsilon" => U(0x03A5),
+    "Uuml" => U(0xDC),
+    "Xi" => U(0x039E),
+    "Yacute" => U(0xDD),
+    "Yuml" => U(0x0178),
+    "Zeta" => U(0x0396),
+    "aacute" => U(0xE1),
+    "acirc" => U(0xE2),
+    "acute" => U(0xB4),
+    "aelig" => U(0xE6),
+    "agrave" => U(0xE0),
+    "alefsym" => U(0x2135),
+    "alpha" => U(0x03B1),
+    "amp" => U(0x26),
+    "AMP" => U(0x26),
+    "and" => U(0x2227),
+    "ang" => U(0x2220),
+    "apos" => U(0x27),
+    "aring" => U(0xE5),
+    "asymp" => U(0x2248),
+    "atilde" => U(0xE3),
+    "auml" => U(0xE4),
+    "bdquo" => U(0x201E),
+    "beta" => U(0x03B2),
+    "brvbar" => U(0xA6),
+    "bull" => U(0x2022),
+    "cap" => U(0x2229),
+    "ccedil" => U(0xE7),
+    "cedil" => U(0xB8),
+    "cent" => U(0xA2),
+    "chi" => U(0x03C7),
+    "circ" => U(0x02C6),
+    "clubs" => U(0x2663),
+    "cong" => U(0x2245),
+    "copy" => U(0xA9),
+    "COPY" => U(0xA9),
+    "crarr" => U(0x21B5),
+    "cup" => U(0x222A),
+    "curren" => U(0xA4),
+    "dArr" => U(0x21D3),
+    "dagger" => U(0x2020),
+    "darr" => U(0x2193),
+    "deg" => U(0xB0),
+    "delta" => U(0x03B4),
+    "diams" => U(0x2666),
+    "divide" => U(0xF7),
+    "eacute" => U(0xE9),
+    "ecirc" => U(0xEA),
+    "egrave" => U(0xE8),
+    "empty" => U(0x2205),
+    "emsp" => U(0x2003),
+    "ensp" => U(0x2002),
+    "epsilon" => U(0x03B5),
+    "equiv" => U(0x2261),
+    "eta" => U(0x03B7),
+    "eth" => U(0xF0),
+    "euml" => U(0xEB),
+    "euro" => U(0x20AC),
+    "exist" => U(0x2203),
+    "fnof" => U(0x0192),
+    "forall" => U(0x2200),
+    "frac12" => U(0xBD),
+    "frac14" => U(0xBC),
+    "frac34" => U(0xBE),
+    "frasl" => U(0x2044),
+    "gamma" => U(0x03B3),
+    "ge" => U(0x2265),
+    "gt" => U(0x3E),
+    "GT" => U(0x3E),
+    "hArr" => U(0x21D4),
+    "harr" => U(0x2194),
+    "hearts" => U(0x2665),
+    "hellip" => U(0x2026),
+    "iacute" => U(0xED),
+    "icirc" => U(0xEE),
+    "iexcl" => U(0xA1),
+    "igrave" => U(0xEC),
+    "image" => U(0x2111),
+    "infin" => U(0x221E),
+    "int" => U(0x222B),
+    "iota" => U(0x03B9),
+    "iquest" => U(0xBF),
+    "isin" => U(0x2208),
+    "iuml" => U(0xEF),
+    "kappa" => U(0x03BA),
+    "lArr" => U(0x21D0),
+    "lambda" => U(0x03BB),
+    "lang" => U(0x2329),
+    "laquo" => U(0xAB),
+    "larr" => U(0x2190),
+    "lceil" => U(0x2308),
+    "ldquo" => U(0x201C),
+    "le" => U(0x2264),
+    "lfloor" => U(0x230A),
+    "lowast" => U(0x2217),
+    "loz" => U(0x25CA),
+    "lrm" => U(0x200E),
+    "lsaquo" => U(0x2039),
+    "lsquo" => U(0x2018),
+    "lt" => U(0x3C),
+    "LT" => U(0x3C),
+    "macr" => U(0xAF),
+    "mdash" => U(0x2014),
+    "micro" => U(0xB5),
+    "middot" => U(0xB7),
+    "minus" => U(0x2212),
+    "mu" => U(0x03BC),
+    "nabla" => U(0x2207),
+    "nbsp" => U(0xA0),
+    "ndash" => U(0x2013),
+    "ne" => U(0x2260),
+    "ni" => U(0x220B),
+    "not" => U(0xAC),
+    "notin" => U(0x2209),
+    "nsub" => U(0x2284),
+    "ntilde" => U(0xF1),
+    "nu" => U(0x03BD),
+    "oacute" => U(0xF3),
+    "ocirc" => U(0xF4),
+    "oelig" => U(0x0153),
+    "ograve" => U(0xF2),
+    "oline" => U(0x203E),
+    "omega" => U(0x03C9),
+    "omicron" => U(0x03BF),
+    "oplus" => U(0x2295),
+    "or" => U(0x2228),
+    "ordf" => U(0xAA),
+    "ordm" => U(0xBA),
+    "oslash" => U(0xF8),
+    "otilde" => U(0xF5),
+    "otimes" => U(0x2297),
+    "ouml" => U(0xF6),
+    "para" => U(0xB6),
+    "part" => U(0x2202),
+    "permil" => U(0x2030),
+    "perp" => U(0x22A5),
+    "phi" => U(0x03C6),
+    "pi" => U(0x03C0),
+    "piv" => U(0x03D6),
+    "plusmn" => U(0xB1),
+    "pound" => U(0xA3),
+    "prime" => U(0x2032),
+    "prod" => U(0x220F),
+    "prop" => U(0x221D),
+    "psi" => U(0x03C8),
+    "quot" => U(0x22),
+    "QUOT" => U(0x22),
+    "rArr" => U(0x21D2),
+    "radic" => U(0x221A),
+    "rang" => U(0x232A),
+    "raquo" => U(0xBB),
+    "rarr" => U(0x2192),
+    "rceil" => U(0x2309),
+    "rdquo" => U(0x201D),
+    "real" => U(0x211C),
+    "reg" => U(0xAE),
+    "REG" => U(0xAE),
+    "rfloor" => U(0x230B),
+    "rho" => U(0x03C1),
+    "rlm" => U(0x200F),
+    "rsaquo" => U(0x203A),
+    "rsquo" => U(0x2019),
+    "sbquo" => U(0x201A),
+    "scaron" => U(0x0161),
+    "sdot" => U(0x22C5),
+    "sect" => U(0xA7),
+    "shy" => U(0xAD),
+    "sigma" => U(0x03C3),
+    "sigmaf" => U(0x03C2),
+    "sim" => U(0x223C),
+    "spades" => U(0x2660),
+    "sub" => U(0x2282),
+    "sube" => U(0x2286),
+    "sum" => U(0x2211),
+    "sup" => U(0x2283),
+    "sup1" => U(0xB9),
+    "sup2" => U(0xB2),
+    "sup3" => U(0xB3),
+    "supe" => U(0x2287),
+    "szlig" => U(0xDF),
+    "tau" => U(0x03C4),
+    "there4" => U(0x2234),
+    "theta" => U(0x03B8),
+    "thetasym" => U(0x03D1),
+    "thinsp" => U(0x2009),
+    "thorn" => U(0xFE),
+    "tilde" => U(0x02DC),
+    "times" => U(0xD7),
+    "trade" => U(0x2122),
+    "uArr" => U(0x21D1),
+    "uacute" => U(0xFA),
+    "uarr" => U(0x2191),
+    "ucirc" => U(0xFB),
+    "ugrave" => U(0xF9),
+    "uml" => U(0xA8),
+    "upsih" => U(0x03D2),
+    "upsilon" => U(0x03C5),
+    "uuml" => U(0xFC),
+    "weierp" => U(0x2118),
+    "xi" => U(0x03BE),
+    "yacute" => U(0xFD),
+    "yen" => U(0xA5),
+    "yuml" => U(0xFF),
+    "zeta" => U(0x03B6),
+    "zwj" => U(0x200D),
+    "zwnj" => U(0x200C)
+}
+
+ENCODINGS = %w[
+    ansi_x3.4-1968
+    iso-ir-6
+    ansi_x3.4-1986
+    iso_646.irv:1991
+    ascii
+    iso646-us
+    us-ascii
+    us
+    ibm367
+    cp367
+    csascii
+    ks_c_5601-1987
+    korean
+    iso-2022-kr
+    csiso2022kr
+    euc-kr
+    iso-2022-jp
+    csiso2022jp
+    iso-2022-jp-2
+    iso-ir-58
+    chinese
+    csiso58gb231280
+    iso_8859-1:1987
+    iso-ir-100
+    iso_8859-1
+    iso-8859-1
+    latin1
+    l1
+    ibm819
+    cp819
+    csisolatin1
+    iso_8859-2:1987
+    iso-ir-101
+    iso_8859-2
+    iso-8859-2
+    latin2
+    l2
+    csisolatin2
+    iso_8859-3:1988
+    iso-ir-109
+    iso_8859-3
+    iso-8859-3
+    latin3
+    l3
+    csisolatin3
+    iso_8859-4:1988
+    iso-ir-110
+    iso_8859-4
+    iso-8859-4
+    latin4
+    l4
+    csisolatin4
+    iso_8859-6:1987
+    iso-ir-127
+    iso_8859-6
+    iso-8859-6
+    ecma-114
+    asmo-708
+    arabic
+    csisolatinarabic
+    iso_8859-7:1987
+    iso-ir-126
+    iso_8859-7
+    iso-8859-7
+    elot_928
+    ecma-118
+    greek
+    greek8
+    csisolatingreek
+    iso_8859-8:1988
+    iso-ir-138
+    iso_8859-8
+    iso-8859-8
+    hebrew
+    csisolatinhebrew
+    iso_8859-5:1988
+    iso-ir-144
+    iso_8859-5
+    iso-8859-5
+    cyrillic
+    csisolatincyrillic
+    iso_8859-9:1989
+    iso-ir-148
+    iso_8859-9
+    iso-8859-9
+    latin5
+    l5
+    csisolatin5
+    iso-8859-10
+    iso-ir-157
+    l6
+    iso_8859-10:1992
+    csisolatin6
+    latin6
+    hp-roman8
+    roman8
+    r8
+    ibm037
+    cp037
+    csibm037
+    ibm424
+    cp424
+    csibm424
+    ibm437
+    cp437
+    437
+    cspc8codepage437
+    ibm500
+    cp500
+    csibm500
+    ibm775
+    cp775
+    cspc775baltic
+    ibm850
+    cp850
+    850
+    cspc850multilingual
+    ibm852
+    cp852
+    852
+    cspcp852
+    ibm855
+    cp855
+    855
+    csibm855
+    ibm857
+    cp857
+    857
+    csibm857
+    ibm860
+    cp860
+    860
+    csibm860
+    ibm861
+    cp861
+    861
+    cp-is
+    csibm861
+    ibm862
+    cp862
+    862
+    cspc862latinhebrew
+    ibm863
+    cp863
+    863
+    csibm863
+    ibm864
+    cp864
+    csibm864
+    ibm865
+    cp865
+    865
+    csibm865
+    ibm866
+    cp866
+    866
+    csibm866
+    ibm869
+    cp869
+    869
+    cp-gr
+    csibm869
+    ibm1026
+    cp1026
+    csibm1026
+    koi8-r
+    cskoi8r
+    koi8-u
+    big5-hkscs
+    ptcp154
+    csptcp154
+    pt154
+    cp154
+    utf-7
+    utf-16be
+    utf-16le
+    utf-16
+    utf-8
+    iso-8859-13
+    iso-8859-14
+    iso-ir-199
+    iso_8859-14:1998
+    iso_8859-14
+    latin8
+    iso-celtic
+    l8
+    iso-8859-15
+    iso_8859-15
+    iso-8859-16
+    iso-ir-226
+    iso_8859-16:2001
+    iso_8859-16
+    latin10
+    l10
+    gbk
+    cp936
+    ms936
+    gb18030
+    shift_jis
+    ms_kanji
+    csshiftjis
+    euc-jp
+    gb2312
+    big5
+    csbig5
+    windows-1250
+    windows-1251
+    windows-1252
+    windows-1253
+    windows-1254
+    windows-1255
+    windows-1256
+    windows-1257
+    windows-1258
+    tis-620
+    hz-gb-2312
+]
+
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -0,0 +1,549 @@
+require 'stringio'
+require 'html5lib/constants'
+
+module HTML5lib
+
+# Provides a unicode stream of characters to the HTMLTokenizer.
+
+# This class takes care of character encoding and removing or replacing
+# incorrect byte-sequences and also provides column and line tracking.
+
+class HTMLInputStream
+
+    attr_accessor :queue, :charEncoding
+
+    # Initialises the HTMLInputStream.
+    # 
+    # HTMLInputStream(source, [encoding]) -> Normalized stream from source
+    # for use by the HTML5Lib.
+    # 
+    # source can be either a file-object, local filename or a string.
+    # 
+    # The optional encoding parameter must be a string that indicates
+    # the encoding.  If specified, that encoding will be used,
+    # regardless of any BOM or later declaration (such as in a meta
+    # element)
+    #  
+    # parseMeta - Look for a <meta> element containing encoding information
+
+    def initialize(source, options = {})
+        @encoding = nil
+        @parseMeta = true
+        @chardet = true
+
+        # Every option overrides the instance variable of the same name
+        # (for example :encoding, :parseMeta or :chardet).
+        options.each { |name, value| instance_variable_set("@#{name}", value) }
+
+        # List of where new lines occur
+        @newLines = []
+
+        # Raw Stream
+        @rawStream = openStream(source)
+
+        # Encoding Information
+        #Number of bytes to use when looking for a meta element with
+        #encoding information
+        @NUM_BYTES_META = 512
+        #Encoding to use if no other information can be found
+        @DEFAULT_ENCODING = 'windows-1252'
+
+        #Detect encoding iff no explicit "transport level" encoding is supplied
+        if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
+            @charEncoding = detectEncoding
+        else
+            @charEncoding = @encoding
+        end
+
+        # Read bytes from stream decoding them into Unicode
+        uString = @rawStream.read
+        unless @charEncoding == 'utf-8'
+            begin
+                require 'iconv'
+                # Convert from the encoding that was actually selected above.
+                # @encoding is nil (or invalid) whenever the encoding had to be
+                # detected, so it must not be used as the source encoding here.
+                uString = Iconv.iconv('utf-8', @charEncoding, uString)[0]
+            rescue LoadError, StandardError
+                # Best effort: when iconv is unavailable or the conversion
+                # fails, carry on with the bytes as they were read.
+            end
+        end
+
+        # Normalize newlines and null characters
+        uString.gsub!(/\r\n?/, "\n")
+        uString.gsub!("\x00", [0xFFFD].pack('U'))
+
+        # The normalized string is the data stream the tokenizer reads from
+        @dataStream = uString
+
+        @queue = []
+
+        # Reset position in the list to read from
+        reset
+    end
+
+    # Wraps source so that it can be read like a file.
+    #
+    # Anything that already responds to #read is used as it is; any other
+    # value is treated as a string and wrapped in a StringIO. The chosen
+    # stream is remembered in @stream and returned.
+    def openStream(source)
+        @stream = source.respond_to?(:read) ? source : StringIO.new(source)
+    end
+
+    # Works out the character encoding of the raw stream.
+    #
+    # Sources are tried in order: a byte order mark, then (when @parseMeta is
+    # set) a <meta> declaration, then (when @chardet is set and the gem can be
+    # loaded) the chardet guesser, and finally @DEFAULT_ENCODING. The names
+    # 'ascii' and 'iso-8859-1' are reported as 'windows-1252'.
+    def detectEncoding
+        # detectBOM also reads past the BOM when one is present
+        encoding = detectBOM
+        encoding = detectEncodingMeta if encoding.nil? and @parseMeta
+
+        if encoding.nil? and @chardet
+            begin
+                require 'rubygems'
+                require 'UniversalDetector' # gem install chardet
+                buffer = @rawStream.read
+                encoding = UniversalDetector::chardet(buffer)['encoding']
+                @rawStream = openStream(buffer)
+            rescue LoadError
+                # chardet is optional; carry on without a guess
+            end
+        end
+
+        encoding = @DEFAULT_ENCODING if encoding.nil?
+
+        # Substitute for equivalent encodings
+        equivalents = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
+        equivalents.fetch(encoding.downcase, encoding)
+    end
+
+    # Attempts to detect a BOM at the start of the stream. If an encoding can
+    # be determined from the BOM, returns the name of that encoding and leaves
+    # the stream positioned just past the BOM; otherwise returns nil and
+    # leaves the stream at its start.
+    #
+    # BOMs are compared as byte values rather than as Strings, so the result
+    # does not depend on how the literals or the stream data are encoded.
+    # The UTF-16 names use the spelling listed in ENCODINGS ('utf-16le',
+    # 'utf-16be'); the UTF-32 names follow the same pattern.
+    def detectBOM
+        # Ordered: UTF-32 has to be tried before UTF-16 because the UTF-32LE
+        # BOM begins with the UTF-16LE BOM.
+        boms = [
+            [[0xef, 0xbb, 0xbf], 'utf-8'],
+            [[0xff, 0xfe, 0x00, 0x00], 'utf-32le'],
+            [[0x00, 0x00, 0xfe, 0xff], 'utf-32be'],
+            [[0xff, 0xfe], 'utf-16le'],
+            [[0xfe, 0xff], 'utf-16be']
+        ]
+
+        # Go to beginning of file and read in 4 bytes
+        @rawStream.seek(0)
+        string = @rawStream.read(4)
+        return nil unless string
+        bytes = string.unpack('C*')
+
+        signature, encoding = boms.find { |bom, _name| bytes[0, bom.length] == bom }
+
+        #AT - move this to the caller?
+        # Set the read position past the BOM if one was found, otherwise
+        # set it to the start of the stream
+        @rawStream.seek(signature ? signature.length : 0)
+
+        return encoding
+    end
+
+    # Hands the first @NUM_BYTES_META bytes of the raw stream to an
+    # EncodingParser and returns the encoding it reports. The stream is
+    # rewound to its start afterwards.
+    def detectEncodingMeta
+        head = @rawStream.read(@NUM_BYTES_META)
+        @rawStream.seek(0)
+        EncodingParser.new(head).getEncoding
+    end
+
+    # Appends 0, followed by the index of every newline in @dataStream, to
+    # @newLines so that #position can tell where it is.
+    def determineNewLines
+        @newLines << 0
+        (0...@dataStream.length).each do |index|
+            @newLines << index if @dataStream[index] == ?\n
+        end
+    end
+
+    # Returns [line, col] for the current read offset (@tell).
+    def position
+        # The newline index is built the first time it is needed
+        determineNewLines if @newLines.empty?
+        offset = @tell
+        line = 0
+        @newLines.each do |newlineAt|
+            break if newlineAt >= offset
+            line += 1
+        end
+        [line, offset - @newLines[line - 1] - 1]
+    end
+
+    # Resets the position in the stream back to the start.
+    # Only the read offset (@tell) is rewound; @queue is left untouched.
+    def reset
+        @tell = 0
+    end
+
+    # Returns the next character: the head of @queue when the queue is not
+    # empty, otherwise the next character of the data stream. Returns :EOF
+    # once the data stream is exhausted.
+    def char
+        return @queue.shift unless @queue.empty?
+
+        begin
+            @tell += 1
+            @dataStream[@tell - 1].chr
+        rescue
+            # indexing past the end yields nil, which has no #chr
+            :EOF
+        end
+    end
+
+    # Returns a string of characters from the stream up to but not
+    # including any character in characters or EOF. characters can be
+    # any container that responds to include?. When opposite is true the
+    # test is inverted: characters are consumed for as long as they ARE
+    # members of characters.
+    def charsUntil(characters, opposite = false)
+        charStack = [char]
+
+        unless charStack[0] == :EOF
+            while (characters.include? charStack[-1]) == opposite
+                unless @queue.empty?
+                    # First from the queue
+                    charStack.push(@queue.shift)
+                    break if charStack[-1] == :EOF
+                else
+                    # Then the rest
+                    begin
+                        charStack.push(@dataStream[@tell].chr)
+                        @tell += 1
+                    rescue
+                        # Reading past the end of the data: record EOF and stop
+                        charStack.push(:EOF)
+                        break
+                    end
+                end
+            end
+        end
+
+        # Put the character stopped on back to the front of the queue
+        # from where it came.
+        @queue.insert(0, charStack.pop)
+        return charStack.join('')
+    end
+end
+
+# String-like object with an associated position and various extra methods.
+# Reading the byte at a position at or beyond the end of the string raises EOF.
+class EncodingBytes < String
+
+    attr_accessor :position
+
+    # The cursor starts one step before the first byte, so that the first
+    # advance made by #each lands on index 0.
+    def initialize(value)
+        super(value)
+        @position = -1
+    end
+
+    # Advances the cursor one byte at a time, yielding the byte under it,
+    # until the last byte has been visited. The block may move the cursor
+    # itself; an EOF raised while iterating ends the iteration quietly.
+    def each
+        # Stop at the last valid index: advancing from there would yield nil
+        while @position < length - 1
+            @position += 1
+            yield self[@position]
+        end
+    rescue EOF
+    end
+
+    # Returns the byte under the cursor as a one-character string.
+    # Raises EOF when the cursor is at or beyond the end of the data.
+    def currentByte
+        raise EOF if @position >= length
+        return self[@position].chr
+    end
+
+    # Skip past a list of characters
+    def skip(chars = SPACE_CHARACTERS)
+        while chars.include?(currentByte)
+            @position += 1
+        end
+    end
+
+    # Look for a sequence of bytes at the start of a string. If the bytes 
+    # are found return true and advance the position to the byte after the 
+    # match. Otherwise return false and leave the position alone.
+    # When lower is true the data is lowercased before it is compared.
+    def matchBytes(bytes, lower = false)
+        data = self[position ... position+bytes.length]
+        # Slicing from beyond the end gives nil: nothing there to match
+        return false if data.nil?
+        data.downcase! if lower
+        rv = (data == bytes)
+        @position += bytes.length if rv
+        return rv
+    end
+
+    # Look for the next sequence of bytes matching a given sequence. If
+    # a match is found advance the position to the last byte of the match
+    # and return true; otherwise raise EOF.
+    def jumpTo(bytes)
+        remainder = self[position .. -1]
+        # A cursor beyond the end leaves nothing to search
+        newPosition = remainder && remainder.index(bytes)
+        if newPosition
+            @position += (newPosition + bytes.length-1)
+            return true
+        else
+            raise EOF
+        end
+    end
+
+    # Move the pointer so it points to the next byte in a set of possible
+    # bytes. Raises EOF (via currentByte) when no such byte remains.
+    def findNext(byteList)
+        until byteList.include?(currentByte)
+            @position += 1
+        end
+    end
+end
+
+# Mini parser for detecting character encoding from meta elements
+class EncodingParser
+
+    # data - the data to work on for encoding detection. It is converted
+    # with to_s before being wrapped, so nil is treated as empty input.
+    def initialize(data)
+        @data = EncodingBytes.new(data.to_s)
+        @encoding = nil
+    end
+
+    # Prefix/handler pairs consulted by getEncoding, which stops at the first
+    # entry that matches. Longer prefixes are therefore listed before the
+    # shorter prefixes they begin with ('<!--' before '<!', plain '<' last).
+    @@method_dispatch = [
+        ['<!--', :handleComment],
+        ['<meta', :handleMeta],
+        ['</', :handlePossibleEndTag],
+        ['<!', :handleOther],
+        ['<?', :handleOther],
+        ['<', :handlePossibleStartTag]
+    ]
+
+    # Walks the data looking for an encoding declaration. At each position
+    # the first @@method_dispatch prefix that matches (after lowercasing the
+    # data) has its handler invoked; a handler returning a false value stops
+    # the scan. Returns @encoding with surrounding whitespace stripped, or
+    # nil when none was set.
+    def getEncoding
+        @data.each do |_byte|
+            matched = @@method_dispatch.detect { |key, _handler| @data.matchBytes(key, true) }
+            keepParsing = matched ? send(matched[1]) : true
+            break unless keepParsing
+        end
+        @encoding = @encoding.strip unless @encoding.nil?
+        @encoding
+    end
+
+    # Moves to the end of the comment by jumping to its closing "-->";
+    # returns whatever the jump returns.
+    def handleComment
+        @data.jumpTo('-->')
+    end
+
+    # Examines the attributes of a <meta element for an encoding, taken
+    # either from a charset attribute or from the charset part of a content
+    # attribute. Stores the first valid encoding in @encoding and returns
+    # false (stop scanning); returns true (keep scanning) otherwise.
+    def handleMeta
+        # "<meta" not followed by a space is not a meta element: keep going
+        return true unless SPACE_CHARACTERS.include?(@data.currentByte)
+
+        # Look at each attribute after the current position in turn
+        while true
+            attr = getAttribute
+            return true if attr.nil?
+
+            name, value = attr[0], attr[1]
+            if name == 'charset'
+                tentativeEncoding = value
+            elsif name == 'content'
+                tentativeEncoding = ContentAttrParser.new(EncodingBytes.new(value)).parse
+            else
+                # no other attribute can carry an encoding
+                next
+            end
+
+            if HTML5lib.isValidEncoding(tentativeEncoding)
+                @encoding = tentativeEncoding
+                return false
+            end
+        end
+    end
+
+    # Handles "<" as the possible start of a start tag.
+    def handlePossibleStartTag
+        handlePossibleTag(false)
+    end
+
+    # Handles "</" as the possible start of an end tag: advances the cursor
+    # by one and then treats the rest like any other tag.
+    def handlePossibleEndTag
+        @data.position += 1
+        handlePossibleTag(true)
+    end
+
+    # Skips over something that may be a tag. Always returns true.
+    #
+    # When the current byte is an ASCII letter the tag name is skipped and
+    # then either the cursor is stepped back (the name ran into a "<", which
+    # must be reprocessed) or all attributes are read and discarded. When it
+    # is not a letter, a possible start tag is ignored and a possible end tag
+    # is handed to handleOther after stepping the cursor back by one.
+    def handlePossibleTag(endTag)
+        if ASCII_LETTERS.include?(@data.currentByte)
+            @data.findNext(SPACE_CHARACTERS + ['<', '>'])
+
+            if @data.currentByte == '<'
+                #return to the first step in the overall "two step" algorithm
+                #reprocessing the < byte
+                @data.position -= 1
+            else
+                #Read all attributes
+                nil until getAttribute.nil?
+            end
+        elsif endTag
+            @data.position -= 1
+            handleOther
+        end
+        true
+    end
+
+    # Skips the rest of the current construct by jumping to the next ">";
+    # returns whatever the jump returns.
+    def handleOther
+        @data.jumpTo('>')
+    end
+
+    # Return a name,value pair for the next attribute in the stream, 
+    # if one is found, or nil
+    #
+    # Names and values are returned with ASCII upper-case letters lowercased.
+    # nil is returned when a "<" or ">" is met before any attribute; in the
+    # "<" case the cursor is stepped back by one. Running off the end of the
+    # data raises EOF (via currentByte).
+    def getAttribute
+        @data.skip(SPACE_CHARACTERS + ['/'])
+
+        if @data.currentByte == '<'
+            @data.position -= 1
+            return nil
+        elsif @data.currentByte == '>'
+            return nil
+        end
+
+        attrName = []
+        attrValue = []
+        spaceFound = false
+        #Step 5 attribute name
+        while true
+            # An "=" only ends the name once at least one character has been
+            # collected; a leading "=" is part of the name. An Array is always
+            # truthy in Ruby, so emptiness has to be tested explicitly.
+            if @data.currentByte == '=' and not attrName.empty?
+                break
+            elsif SPACE_CHARACTERS.include?(@data.currentByte)
+                spaceFound = true
+                break
+            elsif ['/', '<', '>'].include?(@data.currentByte)
+                return [attrName.join(''), '']
+            elsif ASCII_UPPERCASE.include?(@data.currentByte)
+                attrName.push(@data.currentByte.downcase)
+            else
+                attrName.push(@data.currentByte)
+            end
+            #Step 6
+            @data.position += 1
+        end
+        #Step 7
+        if spaceFound
+            @data.skip
+            #Step 8
+            unless @data.currentByte == '='
+                # No value follows: step back so the byte is looked at again
+                @data.position -= 1
+                return [attrName.join(''), '']
+            end
+        end
+        #XXX need to advance position in both spaces and value case
+        #Step 9
+        @data.position += 1
+        #Step 10
+        @data.skip
+        #Step 11
+        if ["'", '"'].include?(@data.currentByte)
+            #11.1
+            quoteChar = @data.currentByte
+            while true
+                @data.position+=1
+                #11.3
+                if @data.currentByte == quoteChar
+                    @data.position += 1
+                    return [attrName.join(''), attrValue.join('')]
+                #11.4
+                elsif ASCII_UPPERCASE.include?(@data.currentByte)
+                    attrValue.push(@data.currentByte.downcase)
+                #11.5
+                else
+                    attrValue.push(@data.currentByte)
+                end
+            end
+        elsif ['>', '<'].include?(@data.currentByte)
+            return [attrName.join(''), '']
+        elsif ASCII_UPPERCASE.include?(@data.currentByte)
+            attrValue.push(@data.currentByte.downcase)
+        else
+            attrValue.push(@data.currentByte)
+        end
+        # Unquoted value: collect up to the next space, ">" or "<"
+        while true
+            @data.position +=1
+            if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
+                return [attrName.join(''), attrValue.join('')]
+            elsif ASCII_UPPERCASE.include?(@data.currentByte)
+                attrValue.push(@data.currentByte.downcase)
+            else
+                attrValue.push(@data.currentByte)
+            end
+        end
+    end
+end
+
+# Pulls the charset value out of the text of a content attribute,
+# e.g. "text/html; charset=utf-8".
+class ContentAttrParser
+    # data must offer the cursor interface used below (position, jumpTo,
+    # skip, findNext, currentByte and slicing), as EncodingBytes does.
+    def initialize(data)
+        @data = data
+    end
+
+    # Returns the text that follows "charset=" after the first ";", taken
+    # either between matching quote marks or up to the next space character
+    # (or the end of the data). Returns nil when that shape is not found.
+    def parse
+        #Skip to the first ";"
+        @data.position = 0
+        @data.jumpTo(';')
+        @data.position += 1
+        @data.skip
+        #Find the charset parameter name
+        @data.jumpTo('charset')
+        @data.position += 1
+        @data.skip
+        #Without an = sign there is no value to report
+        return nil unless @data.currentByte == '='
+        @data.position += 1
+        @data.skip
+
+        if ['"', "'"].include?(@data.currentByte)
+            #Look for an encoding between matching quote marks
+            quoteMark = @data.currentByte
+            @data.position += 1
+            valueStart = @data.position
+            @data.jumpTo(quoteMark)
+            @data[valueStart ... @data.position]
+        else
+            #Unquoted value
+            valueStart = @data.position
+            begin
+                @data.findNext(SPACE_CHARACTERS)
+                @data[valueStart ... @data.position]
+            rescue EOF
+                #Return the whole remaining value
+                @data[valueStart .. -1]
+            end
+        end
+    rescue EOF
+        nil
+    end
+end
+
+# True when encoding is a String that names an entry of ENCODINGS, ignoring
+# case and surrounding whitespace; false for anything else, including nil.
+def self.isValidEncoding(encoding)
+    encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip)
+end
+
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
@ -0,0 +1,141 @@
+# Warning: this module is experimental and subject to change and even removal
+# at any time. 
+# 
+# For background/rationale, see:
+#  * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
+#  * http://tinyurl.com/ylfj8k (and follow-ups)
+# 
+# References:
+#  * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
+#  * http://wiki.whatwg.org/wiki/HtmlVsXhtml
+# 
+# @@TODO:
+# * Selectively lowercase only XHTML, but not foreign markup
+require 'html5lib/html5parser'
+require 'html5lib/constants'
+
+module HTML5lib
+
+# liberal XML parser
+class XMLParser < HTMLParser
+
+    def initialize(options={})
+        super options
+        @phases[:initial] = XmlRootPhase.new(self, @tree)
+    end
+
+    def normalizeToken(token)
+        if token[:type] == :StartTag or token[:type] == :EmptyTag
+            # We need to remove the duplicate attributes and convert attributes
+            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+
+            token[:data] = Hash[*token[:data].reverse.flatten]
+
+            # For EmptyTags, process both a Start and an End tag
+            if token[:type] == :EmptyTag
+                @phase.processStartTag(token[:name], token[:data])
+                token[:data] = {}
+                token[:type] = :EndTag
+            end
+
+        elsif token[:type] == :EndTag
+            if token[:data]
+               parseError(_("End tag contains unexpected attributes."))
+            end
+
+        elsif token[:type] == :Comment
+            # Rescue CDATA from the comments
+            if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
+                token[:type] = :Characters
+                token[:data] = token[:data][7 ... -2]
+            end
+        end
+
+        return token
+    end
+end
+
+# liberal XHTML parser
+class XHTMLParser < XMLParser
+
+    def initialize(options={})
+        super options
+        @phases[:initial] = InitialPhase.new(self, @tree)
+        @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
+    end
+
+    def normalizeToken(token)
+        super(token)
+
+        # ensure that non-void XHTML elements have content so that separate
+        # open and close tags are emitted
+        if token[:type]  == :EndTag and \
+            not VOID_ELEMENTS.include? token[:name] and \
+            token[:name] == @tree.openElements[-1].name and \
+            not @tree.openElements[-1].hasContent
+            @tree.insertText('') unless
+                @tree.openElements.any? {|e|
+                    e.attributes.keys.include? 'xmlns' and
+                    e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
+                }
+        end
+
+        return token
+    end
+end
+
+class XhmlRootPhase < RootElementPhase
+    def insertHtmlElement
+        element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
+        @tree.openElements.push(element)
+        @tree.document.appendChild(element)
+        @parser.phase = @parser.phases[:beforeHead]
+    end
+end
+
+class XmlRootPhase < Phase
+    # Prime the Xml parser
+    @start_tag_handlers = Hash.new(:startTagOther)
+    @end_tag_handlers = Hash.new(:endTagOther)
+    def startTagOther(name, attributes)
+        @tree.openElements.push(@tree.document)
+        element = @tree.createElement(name, attributes)
+        @tree.openElements[-1].appendChild(element)
+        @tree.openElements.push(element)
+        @parser.phase = XmlElementPhase.new(@parser,@tree)
+    end
+    def endTagOther(name)
+        super
+        @tree.openElements.pop
+    end
+end
+
+class XmlElementPhase < Phase
+    # Generic handling for all XML elements
+
+    @start_tag_handlers = Hash.new(:startTagOther)
+    @end_tag_handlers = Hash.new(:endTagOther)
+
+    def startTagOther(name, attributes)
+        element = @tree.createElement(name, attributes)
+        @tree.openElements[-1].appendChild(element)
+        @tree.openElements.push(element)
+    end
+
+    def endTagOther(name)
+        for node in @tree.openElements.reverse
+            if node.name == name
+                {} while @tree.openElements.pop != node
+                break
+            else
+                @parser.parseError
+            end
+        end
+    end
+
+    def processCharacters(data)
+        @tree.insertText(data)
+    end
+end
+
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
@ -0,0 +1,178 @@
+require 'html5lib/tokenizer'
+require 'cgi'
+
+module HTML5lib
+
+# This module provides sanitization of XHTML+MathML+SVG
+# and of inline style attributes.
+
+class HTMLSanitizer < HTMLTokenizer
+
+    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
+        button caption center cite code col colgroup dd del dfn dir div dl dt
+        em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
+        legend li map menu ol optgroup option p pre q s samp select small span
+        strike strong sub sup table tbody td textarea tfoot th thead tr tt u
+        ul var]
+
+    MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
+        mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
+        msubsup msup mtable mtd mtext mtr munder munderover none]
+
+    SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
+        circle defs desc ellipse font-face font-face-name font-face-src g
+        glyph hkern image linearGradient line marker metadata missing-glyph
+        mpath path polygon polyline radialGradient rect set stop svg switch
+        text title tspan use]
+
+    ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
+        align alt axis border cellpadding cellspacing char charoff charset
+        checked cite class clear cols colspan color compact coords datetime
+        dir disabled enctype for frame headers height href hreflang hspace id
+        ismap label lang longdesc maxlength media method multiple name nohref
+        noshade nowrap prompt readonly rel rev rows rowspan rules scope
+        selected shape size span src start style summary tabindex target title
+        type usemap valign value vspace width xml:lang]
+
+    MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
+        columnalign columnlines columnspacing columnspan depth display
+        displaystyle equalcolumns equalrows fence fontstyle fontweight frame
+        height linethickness lspace mathbackground mathcolor mathvariant
+        mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
+        rowspacing rowspan rspace scriptlevel selection separator stretchy
+        width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
+
+    SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
+         arabic-form ascent attributeName attributeType baseProfile bbox begin
+         by calcMode cap-height class color color-rendering content cx cy d dx
+         dy descent display dur end fill fill-rule font-family font-size
+         font-stretch font-style font-variant font-weight from fx fy g1 g2
+         glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
+         ideographic k keyPoints keySplines keyTimes lang marker-end
+         marker-mid marker-start markerHeight markerUnits markerWidth
+         mathematical max min name offset opacity orient origin
+         overline-position overline-thickness panose-1 path pathLength points
+         preserveAspectRatio r refX refY repeatCount repeatDur
+         requiredExtensions requiredFeatures restart rotate rx ry slope stemh
+         stemv stop-color stop-opacity strikethrough-position
+         strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
+         stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
+         stroke-width systemLanguage target text-anchor to transform type u1
+         u2 underline-position underline-thickness unicode unicode-range
+         units-per-em values version viewBox visibility width widths x
+         x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
+         xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
+         xmlns:xlink y y1 y2 zoomAndPan]
+
+    ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
+
+    ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
+        border-bottom-color border-collapse border-color border-left-color
+        border-right-color border-top-color clear color cursor direction
+        display elevation float font font-family font-size font-style
+        font-variant font-weight height letter-spacing line-height overflow
+        pause pause-after pause-before pitch pitch-range richness speak
+        speak-header speak-numeral speak-punctuation speech-rate stress
+        text-align text-decoration text-indent unicode-bidi vertical-align
+        voice-family volume white-space width]
+
+    ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
+        brown center collapse dashed dotted fuchsia gray green !important
+        italic left lime maroon medium none navy normal nowrap olive pointer
+        purple red right solid silver teal top transparent underline white
+        yellow]
+
+    ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
+        stroke-width stroke-linecap stroke-linejoin stroke-opacity]
+
+    ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
+        telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
+
+    # subclasses may define their own versions of these constants
+    ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
+    ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
+    ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
+    ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
+    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
+    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
+
+    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
+    # attributes are parsed, and a restricted set, # specified by
+    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
+    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
+    # in ALLOWED_PROTOCOLS are allowed.
+    #
+    #   sanitize_html('<script> do_nasty_stuff() </script>')
+    #    => &lt;script> do_nasty_stuff() &lt;/script>
+    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+    #    => <a>Click here for $100</a>
+    def each
+        super do |token|
+            case token[:type]
+            when :StartTag, :EndTag, :EmptyTag
+                if ALLOWED_ELEMENTS.include?(token[:name])
+                    if token.has_key? :data
+                        attrs = Hash[*token[:data].flatten]
+                        attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
+                        ATTR_VAL_IS_URI.each do |attr|
+                            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
+                            if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
+                                attrs.delete attr
+                            end
+                        end
+                        if attrs['style']
+                            attrs['style'] = sanitize_css(attrs['style'])
+                        end
+                        token[:data] = attrs.map {|k,v| [k,v]}
+                    end
+                    yield token
+                else
+                    if token[:type] == :EndTag
+                        token[:data] = "</#{token[:name]}>"
+                    elsif token[:data]
+                        attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
+                        token[:data] = "<#{token[:name]}#{attrs}>"
+                    else
+                        token[:data] = "<#{token[:name]}>"
+                    end
+                    token[:data].insert(-2,'/') if token[:type] == :EmptyTag
+                    token[:type] = :Characters
+                    token.delete(:name)
+                    yield token
+                end
+            else
+                yield token
+            end
+
+          end
+      end
+
+      def sanitize_css(style)
+          # disallow urls
+          style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+          # gauntlet
+          return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
+          return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
+
+          clean = []
+          style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
+              next if val.empty?
+              prop.downcase!
+              if ALLOWED_CSS_PROPERTIES.include?(prop)
+                  clean << "#{prop}: #{val};"
+              elsif %w[background border margin padding].include?(prop.split('-')[0])
+                  clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
+                      !ALLOWED_CSS_KEYWORDS.include?(keyword) and
+                      keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+                  end
+              elsif ALLOWED_SVG_PROPERTIES.include?(prop)
+                  clean << "#{prop}: #{val};"
+              end
+          end
+
+          style = clean.join(' ')
+      end
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
@ -0,0 +1,854 @@
+require 'html5lib/constants'
+require 'html5lib/inputstream'
+
+module HTML5lib
+
+# This class takes care of tokenizing HTML.
+#
+# * @currentToken
+#   Holds the token that is currently being processed.
+#
+# * @state
+#   Holds a reference to the method to be invoked... XXX
+#
+# * @states
+#   Holds a mapping between states and methods that implement the state.
+#
+# * @stream
+#   Points to HTMLInputStream object.
+
+class HTMLTokenizer
+    attr_accessor :contentModelFlag, :currentToken
+    attr_reader :stream
+
+    # XXX need to fix documentation
+
+    def initialize(stream, options={})
+        @stream = HTMLInputStream.new(stream, options)
+
+        @states = {
+            :data => :dataState,
+            :entityData => :entityDataState,
+            :tagOpen => :tagOpenState,
+            :closeTagOpen => :closeTagOpenState,
+            :tagName => :tagNameState,
+            :beforeAttributeName => :beforeAttributeNameState,
+            :attributeName => :attributeNameState,
+            :afterAttributeName => :afterAttributeNameState,
+            :beforeAttributeValue => :beforeAttributeValueState,
+            :attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
+            :attributeValueSingleQuoted => :attributeValueSingleQuotedState,
+            :attributeValueUnQuoted => :attributeValueUnQuotedState,
+            :bogusComment => :bogusCommentState,
+            :markupDeclarationOpen => :markupDeclarationOpenState,
+            :comment => :commentState,
+            :commentDash => :commentDashState,
+            :commentEnd => :commentEndState,
+            :doctype => :doctypeState,
+            :beforeDoctypeName => :beforeDoctypeNameState,
+            :doctypeName => :doctypeNameState,
+            :afterDoctypeName => :afterDoctypeNameState,
+            :bogusDoctype => :bogusDoctypeState
+        }
+
+        # Setup the initial tokenizer state
+        @contentModelFlag = :PCDATA
+        @state = @states[:data]
+
+        # The current token being created
+        @currentToken = nil
+
+        # Tokens to be processed.
+        @tokenQueue = []
+    end
+
+    # This is where the magic happens.
+    #
+    # We do our usual processing through the states and when we have a token
+    # to return we yield the token which pauses processing until the next token
+    # is requested.
+    def each
+        @stream.reset
+        @tokenQueue = []
+        # Start processing. When EOF is reached @state will return false
+        # instead of true and the loop will terminate.
+        while send @state
+            while not @tokenQueue.empty?
+                yield @tokenQueue.shift
+            end
+        end
+    end
+
+    # Below are the various helper functions used by the tokenizer states.
+    
+    # If the next character is a '>', convert the currentToken into
+    # an EmptyTag
+
+    def processSolidusInTag
+
+        # We need to consume another character to make sure it's a ">"
+        data = @stream.char
+
+        if @currentToken[:type] == :StartTag and data == ">"
+            @currentToken[:type] = :EmptyTag
+        else
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Solidus (/) incorrectly placed in tag.")})
+        end
+
+        # The character we just consumed need to be put back on the stack so it
+        # doesn't get lost...
+        @stream.queue.push(data)
+    end
+
+    # This function returns either U+FFFD or the character based on the
+    # decimal or hexadecimal representation. It also discards ";" if present.
+    # If ";" is not present, a :ParseError token is pushed onto @tokenQueue.
+
+    def consumeNumberEntity(isHex)
+
+        # XXX More need to be done here. For instance, #13 should prolly be
+        # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
+        # such. Thoughts on this appreciated.
+        allowed = DIGITS
+        radix = 10
+        if isHex
+            allowed = HEX_DIGITS
+            radix = 16
+        end
+
+        char = [0xFFFD].pack('U')
+        charStack = []
+
+        # Consume all the characters that are in range while making sure we
+        # don't hit an EOF.
+        c = @stream.char
+        while allowed.include?(c) and c != :EOF
+            charStack.push(c)
+            c = @stream.char
+        end
+
+        # Convert the set of characters consumed to an int.
+        charAsInt = charStack.join('').to_i(radix)
+
+        # If the integer is between 127 and 160 (so 128 and bigger and 159 and
+        # smaller) we need to do the "windows trick".
+        if (127...160).include? charAsInt
+            #XXX - removed parse error from windows 1252 entity for now
+            #we may want to reenable this later
+            #@tokenQueue.push({:type => :ParseError, :data =>
+            #  _("Entity used with illegal number (windows-1252 reference).")})
+
+            charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
+        end
+
+        # 0 is not a good number.
+        if charAsInt == 0
+            charAsInt = 65533
+        end
+
+        if charAsInt <= 0x10FFF
+            char = [charAsInt].pack('U')
+        else
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Numeric entity couldn't be converted to character.")})
+        end
+
+        # Discard the ; if present. Otherwise, put it back on the queue and
+        # invoke parseError on parser.
+        if c != ";"
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Numeric entity didn't end with ';'.")})
+            @stream.queue.push(c)
+        end
+
+        return char
+    end
+
+    def consumeEntity
+        # Tries to read an entity reference from @stream (the leading "&" has
+        # already been consumed by the caller). Returns the replacement text,
+        # or nil when no entity was recognized; in the nil case the characters
+        # read (up to, but not including, :EOF) are put back on @stream.queue
+        # and a :ParseError token is queued.
+        char = nil
+        charStack = [@stream.char]
+        if charStack[0] == "#"
+            # We might have a number entity here.
+            # Read two more characters: enough to tell "#x<hex>" from "#<digit>"
+            charStack += [@stream.char, @stream.char]
+            if charStack.include? :EOF
+                # If we reach the end of the file put everything up to :EOF
+                # back in the queue
+                charStack = charStack[0...charStack.index(:EOF)]
+                @stream.queue+= charStack
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Numeric entity expected. Got end of file instead.")})
+            else
+                if charStack[1].downcase == "x" \
+                  and HEX_DIGITS.include? charStack[2]
+                    # Hexadecimal entity detected.
+                    # Only the first hex digit goes back; "#" and "x" are dropped
+                    @stream.queue.push(charStack[2])
+                    char = consumeNumberEntity(true)
+                elsif DIGITS.include? charStack[1]
+                    # Decimal entity detected.
+                    # Everything after the "#" goes back for consumeNumberEntity
+                    @stream.queue += charStack[1..-1]
+                    char = consumeNumberEntity(false)
+                else
+                    # No number entity detected.
+                    @stream.queue += charStack
+                    @tokenQueue.push({:type => :ParseError, :data =>
+                      _("Numeric entity expected but none found.")})
+                end
+            end
+        # Break out if we reach the end of the file
+        elsif charStack[0] == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Entity expected. Got end of file instead.")})
+        else
+            # At this point in the process might have named entity. Entities
+            # are stored in the global variable "entities".
+            #
+            # Consume characters and compare to these to a substring of the
+            # entity names in the list until the substring no longer matches.
+            filteredEntityList = ENTITIES.keys
+            filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
+            entityName = nil
+
+            # Each round reads one more character while the text read so far
+            # is still a prefix of some entity name; entityName remembers the
+            # longest complete name seen.
+            while charStack[-1] != :EOF
+                name = charStack.join('')
+                if filteredEntityList.any? {|e| e[0...name.length] == name}
+                    filteredEntityList.reject! {|e| e[0...name.length] != name}
+                    charStack.push(@stream.char)
+                else
+                    break
+                end
+
+                if ENTITIES.include? name
+                    entityName = name
+                end
+            end
+
+            if entityName != nil
+                char = ENTITIES[entityName]
+
+                # Check whether or not the last character returned can be
+                # discarded or needs to be put back.
+                # NOTE(review): only the LAST character read is tested for ";".
+                # If more characters were read after the end of entityName
+                # (a longer name was being tried) and the last one is ";",
+                # the characters in between look like they are dropped rather
+                # than put back - confirm against the contents of ENTITIES.
+                if not charStack[-1] == ";"
+                    @tokenQueue.push({:type => :ParseError, :data =>
+                      _("Named entity didn't end with ';'.")})
+                    @stream.queue += charStack[entityName.length..-1]
+                end
+            else
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Named entity expected. Got none.")})
+                @stream.queue += charStack
+            end
+        end
+        return char
+    end
+
+    # This method replaces the need for "entityInAttributeValueState".
+    def processEntityInAttribute
+        entity = consumeEntity
+        if entity
+            @currentToken[:data][-1][1] += entity
+        else
+            @currentToken[:data][-1][1] += "&"
+        end
+    end
+
+    # This method is a generic handler for emitting the tags. It also sets
+    # the state to "data" because that's what's needed after a token has been
+    # emitted.
+    def emitCurrentToken
+        # Add token to the queue to be yielded
+        @tokenQueue.push(@currentToken)
+        @state = @states[:data]
+    end
+
+
+    # Below are the various tokenizer states worked out.
+
+    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
+    # documents to figure out what the order of the various if and elsif
+    # statements should be.
+
+    def dataState
+        data = @stream.char
+        if data == "&" and (@contentModelFlag == :PCDATA or
+            @contentModelFlag == :RCDATA)
+            @state = @states[:entityData]
+        elsif data == "<" and @contentModelFlag != :PLAINTEXT
+            @state = @states[:tagOpen]
+        elsif data == :EOF
+            # Tokenization ends.
+            return false
+        elsif SPACE_CHARACTERS.include? data
+            # Directly after emitting a token you switch back to the "data
+            # state". At that point SPACE_CHARACTERS are important so they are
+            # emitted separately.
+            # XXX need to check if we don't need a special "spaces" flag on
+            # characters.
+            @tokenQueue.push({:type => :SpaceCharacters, :data =>
+              data + @stream.charsUntil(SPACE_CHARACTERS, true)})
+        else
+            @tokenQueue.push({:type => :Characters, :data => 
+              data + @stream.charsUntil(["&", "<"])})
+        end
+        return true
+    end
+
+    def entityDataState
+        entity = consumeEntity
+        if entity
+            @tokenQueue.push({:type => :Characters, :data => entity})
+        else
+            @tokenQueue.push({:type => :Characters, :data => "&"})
+        end
+        @state = @states[:data]
+        return true
+    end
+
+    def tagOpenState
+        data = @stream.char
+        if @contentModelFlag == :PCDATA
+            if data == "!"
+                @state = @states[:markupDeclarationOpen]
+            elsif data == "/"
+                @state = @states[:closeTagOpen]
+            elsif data != :EOF and ASCII_LETTERS.include? data
+                @currentToken =\
+                  {:type => :StartTag, :name => data, :data => []}
+                @state = @states[:tagName]
+            elsif data == ">"
+                # XXX In theory it could be something besides a tag name. But
+                # do we really care?
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected tag name. Got '>' instead.")})
+                @tokenQueue.push({:type => :Characters, :data => "<>"})
+                @state = @states[:data]
+            elsif data == "?"
+                # XXX In theory it could be something besides a tag name. But
+                # do we really care?
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected tag name. Got '?' instead (HTML doesn't " +
+                  "support processing instructions).")})
+                @stream.queue.push(data)
+                @state = @states[:bogusComment]
+            else
+                # XXX
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected tag name. Got something else instead")})
+                @tokenQueue.push({:type => :Characters, :data => "<"})
+                @stream.queue.push(data)
+                @state = @states[:data]
+            end
+        else
+            # We know the content model flag is set to either RCDATA or CDATA
+            # now because this state can never be entered with the PLAINTEXT
+            # flag.
+            if data == "/"
+                @state = @states[:closeTagOpen]
+            else
+                @tokenQueue.push({:type => :Characters, :data => "<"})
+                @stream.queue.insert(0, data)
+                @state = @states[:data]
+            end
+        end
+        return true
+    end
+
+    def closeTagOpenState
+        # Entered after "</" has been consumed. In RCDATA/CDATA content the
+        # end tag is only honored when it matches the name of @currentToken;
+        # in PCDATA content any letter starts an end tag token. Always
+        # returns true.
+        if (@contentModelFlag == :RCDATA or @contentModelFlag == :CDATA)
+            if @currentToken
+                charStack = []
+
+                # So far we know that "</" has been consumed. We now need to know
+                # whether the next few characters match the name of last emitted
+                # start tag which also happens to be the currentToken. We also need
+                # to have the character directly after the characters that could
+                # match the start tag name.
+                (@currentToken[:name].length + 1).times do
+                    charStack.push(@stream.char)
+                    # Make sure we don't get hit by :EOF
+                    break if charStack[-1] == :EOF
+                end
+
+                # Since this is just for checking. We put the characters back on
+                # the stack.
+                @stream.queue += charStack
+            end
+
+            # The look-ahead matches when all but its last character equal the
+            # tag name (case-insensitively) and the last character is one that
+            # can legitimately follow a tag name.
+            if @currentToken and
+              @currentToken[:name].downcase == 
+                charStack[0...-1].join('').downcase and
+              (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? charStack[-1]
+                # Because the characters are correct we can safely switch to
+                # PCDATA mode now. This also means we don't have to do it when
+                # emitting the end tag token.
+                @contentModelFlag = :PCDATA
+            else
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected closing tag after seeing '</'. None found.")})
+                @tokenQueue.push({:type => :Characters, :data => "</"})
+                @state = @states[:data]
+
+                # Need to return here since we don't want the rest of the
+                # method to be walked through.
+                return true
+            end
+        end
+
+        # Reached either directly in PCDATA content or after the switch to
+        # PCDATA above.
+        if @contentModelFlag == :PCDATA
+            data = @stream.char
+            if data == :EOF
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected closing tag. Unexpected end of file.")})
+                @tokenQueue.push({:type => :Characters, :data => "</"})
+                @state = @states[:data]
+            elsif ASCII_LETTERS.include? data
+                # First letter of the name: start a new end tag token
+                @currentToken =\
+                  {:type => :EndTag, :name => data, :data => []}
+                @state = @states[:tagName]
+            elsif data == ">"
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
+                @state = @states[:data]
+            else
+                # XXX data can be _'_...
+                # Anything else is handed to the bogus comment state
+                @tokenQueue.push({:type => :ParseError, :data =>
+                  _("Expected closing tag. Unexpected character '" + data + "' found.")})
+                @stream.queue.push(data)
+                @state = @states[:bogusComment]
+            end
+        end
+        return true
+    end
+
+    def tagNameState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @state = @states[:beforeAttributeName]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in the tag name.")})
+            emitCurrentToken
+        elsif ASCII_LETTERS.include? data
+            @currentToken[:name] += data +\
+              @stream.charsUntil(ASCII_LETTERS, true)
+        elsif data == ">"
+            emitCurrentToken
+        elsif data == "<"
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character when getting the tag name.")})
+            emitCurrentToken
+        elsif data == "/"
+            processSolidusInTag
+            @state = @states[:beforeAttributeName]
+        else
+            @currentToken[:name] += data
+        end
+        return true
+    end
+
+    def beforeAttributeNameState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @stream.charsUntil(SPACE_CHARACTERS, true)
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file. Expected attribute name instead.")})
+            emitCurrentToken
+        elsif ASCII_LETTERS.include? data
+            @currentToken[:data].push([data, ""])
+            @state = @states[:attributeName]
+        elsif data == ">"
+            emitCurrentToken
+        elsif data == "/"
+            processSolidusInTag
+        elsif data == "<"
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character. Expected attribute name instead.")})
+            emitCurrentToken
+        else
+            @currentToken[:data].push([data, ""])
+            @state = @states[:attributeName]
+        end
+        return true
+    end
+
+    # Attribute name state of the tokenizer.
+    #
+    # Reads one character from @stream and either extends the name of the
+    # last [name, value] pair in @currentToken[:data] or leaves the state.
+    # When the state is left, a ParseError is queued for every earlier
+    # attribute whose name equals the one just completed.
+    # Always returns true.
+    def attributeNameState
+        data = @stream.char
+        # Assume we are done with the name; branches that stay in this state
+        # (or that have already emitted the token) reset the flag to false.
+        leavingThisState = true
+        if data == "="
+            @state = @states[:beforeAttributeValue]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in attribute name.")})
+            emitCurrentToken
+            leavingThisState = false
+        elsif ASCII_LETTERS.include? data
+            @currentToken[:data][-1][0] += data +\
+              @stream.charsUntil(ASCII_LETTERS, true)
+            leavingThisState = false
+        elsif data == ">"
+            # XXX If we emit here the attributes are converted to a dict
+            # without being checked and when the code below runs we error
+            # because data is a dict not a list
+            # (So nothing happens here; the token is emitted after the
+            # duplicate check at the bottom of this method.)
+        elsif SPACE_CHARACTERS.include? data
+            @state = @states[:afterAttributeName]
+        elsif data == "/"
+            processSolidusInTag
+            @state = @states[:beforeAttributeName]
+        elsif data == "<"
+            # Give the "<" back to the stream before reporting the error.
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character in attribute name.")})
+            emitCurrentToken
+            leavingThisState = false
+        else
+            @currentToken[:data][-1][0] += data
+            leavingThisState = false
+        end
+
+        if leavingThisState
+            # Attributes are not dropped at this stage. That happens when the
+            # start tag token is emitted so values can still be safely appended
+            # to attributes, but we do want to report the parse error in time.
+            # Compare the just-finished name (last pair) with every earlier one.
+            @currentToken[:data][0...-1].each {|name,value|
+                if @currentToken[:data][-1][0] == name
+                    @tokenQueue.push({:type => :ParseError, :data =>
+                      _("Dropped duplicate attribute on tag.")})
+                end
+            }
+            # XXX Fix for above XXX
+            # Deferred emit for the ">" branch above.
+            if data == ">"
+                emitCurrentToken
+            end
+        end
+        return true
+    end
+
+    def afterAttributeNameState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @stream.charsUntil(SPACE_CHARACTERS, true)
+        elsif data == "="
+            @state = @states[:beforeAttributeValue]
+        elsif data == ">"
+            emitCurrentToken
+        elsif ASCII_LETTERS.include? data
+            @currentToken[:data].push([data, ""])
+            @state = @states[:attributeName]
+        elsif data == "/"
+            processSolidusInTag
+            @state = @states[:beforeAttributeName]
+        elsif data == "<"
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character. Expected = or end of tag.")})
+            emitCurrentToken
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file. Expected = or end of tag.")})
+            emitCurrentToken
+        else
+            @currentToken[:data].push([data, ""])
+            @state = @states[:attributeName]
+        end
+        return true
+    end
+
+    def beforeAttributeValueState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @stream.charsUntil(SPACE_CHARACTERS, true)
+        elsif data == "\""
+            @state = @states[:attributeValueDoubleQuoted]
+        elsif data == "&"
+            @state = @states[:attributeValueUnQuoted]
+            @stream.queue.push(data);
+        elsif data == "'"
+            @state = @states[:attributeValueSingleQuoted]
+        elsif data == ">"
+            emitCurrentToken
+        elsif data == "<"
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character. Expected attribute value.")})
+            emitCurrentToken
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file. Expected attribute value.")})
+            emitCurrentToken
+        else
+            @currentToken[:data][-1][1] += data
+            @state = @states[:attributeValueUnQuoted]
+        end
+        return true
+    end
+
+    def attributeValueDoubleQuotedState
+        data = @stream.char
+        if data == "\""
+            @state = @states[:beforeAttributeName]
+        elsif data == "&"
+            processEntityInAttribute
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in attribute value (\").")})
+            emitCurrentToken
+        else
+            @currentToken[:data][-1][1] += data +\
+              @stream.charsUntil(["\"", "&"])
+        end
+        return true
+    end
+
+    def attributeValueSingleQuotedState
+        data = @stream.char
+        if data == "'"
+            @state = @states[:beforeAttributeName]
+        elsif data == "&"
+            processEntityInAttribute
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in attribute value (').")})
+            emitCurrentToken
+        else
+            @currentToken[:data][-1][1] += data +\
+              @stream.charsUntil(["'", "&"])
+        end
+        return true
+    end
+
+    def attributeValueUnQuotedState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @state = @states[:beforeAttributeName]
+        elsif data == "&"
+            processEntityInAttribute
+        elsif data == ">"
+            emitCurrentToken
+        elsif data == "<"
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected < character in attribute value.")})
+            emitCurrentToken
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in attribute value.")})
+            emitCurrentToken
+        else
+            @currentToken[:data][-1][1] += data + 
+              @stream.charsUntil(["&", ">","<"] + SPACE_CHARACTERS)
+        end
+        return true
+    end
+
+    def bogusCommentState
+        # Make a new comment token and give it as value all the characters
+        # until the first > or :EOF (charsUntil checks for :EOF automatically)
+        # and emit it.
+        @tokenQueue.push(
+          {:type => :Comment, :data => @stream.charsUntil((">"))})
+
+        # Eat the character directly after the bogus comment which is either a
+        # ">" or an :EOF.
+        @stream.char
+        @state = @states[:data]
+        return true
+    end
+
+    # Markup declaration open state of the tokenizer (entered after "<!").
+    #
+    # Reads two characters; "--" starts a Comment token. Otherwise five more
+    # characters are read and, if the seven together spell DOCTYPE (in any
+    # case) a Doctype token is started. Failing both, a ParseError is queued,
+    # all consumed characters are appended to @stream.queue and the
+    # tokenizer moves to :bogusComment. Always returns true.
+    def markupDeclarationOpenState
+        consumed = Array.new(2) { @stream.char }
+        if consumed == ["-", "-"]
+            @currentToken = {:type => :Comment, :data => ""}
+            @state = @states[:comment]
+            return true
+        end
+
+        5.times { consumed.push(@stream.char) }
+        # :EOF is a Symbol, so it has to be ruled out before joining.
+        isDoctype = !consumed.include?(:EOF) &&
+                    consumed.join("").upcase == "DOCTYPE"
+        if isDoctype
+            @currentToken = {:type => :Doctype, :name => "", :data => true}
+            @state = @states[:doctype]
+        else
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Expected '--' or 'DOCTYPE'. Not found.")})
+            @stream.queue += consumed
+            @state = @states[:bogusComment]
+        end
+        true
+    end
+
+    def commentState
+        data = @stream.char
+        if data == "-"
+            @state = @states[:commentDash]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in comment.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            @currentToken[:data] += data + @stream.charsUntil("-")
+        end
+        return true
+    end
+
+    def commentDashState
+        data = @stream.char
+        if data == "-"
+            @state = @states[:commentEnd]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in comment (-)")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            @currentToken[:data] += "-" + data +\
+              @stream.charsUntil("-")
+            # Consume the next character which is either a "-" or an :EOF as
+            # well so if there's a "-" directly after the "-" we go nicely to
+            # the "comment end state" without emitting a ParseError there.
+            @stream.char
+        end
+        return true
+    end
+
+    # Comment end state of the tokenizer ("--" seen inside a comment).
+    #
+    # Reads one character from @stream: ">" queues the comment token and
+    # returns to :data; "-" queues a ParseError and appends the dash; :EOF
+    # queues a ParseError plus the comment token and returns to :data; any
+    # other character queues a ParseError, appends "--" and the character to
+    # the comment and goes back to :comment. Always returns true.
+    def commentEndState
+        char = @stream.char
+        case
+        when char == ">"
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        when char == "-"
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected '-' after '--' found in comment.")})
+            @currentToken[:data] += char
+        when char == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in comment (--).")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            # XXX
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected character in comment found.")})
+            @currentToken[:data] += "--" + char
+            @state = @states[:comment]
+        end
+        true
+    end
+
+    def doctypeState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+            @state = @states[:beforeDoctypeName]
+        else
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("No space after literal string 'DOCTYPE'.")})
+            @stream.queue.push(data)
+            @state = @states[:beforeDoctypeName]
+        end
+        return true
+    end
+
+    def beforeDoctypeNameState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+        elsif ASCII_LOWERCASE.include? data
+            @currentToken[:name] = data.upcase
+            @state = @states[:doctypeName]
+        elsif data == ">"
+            # Character needs to be consumed per the specification so don't
+            # invoke emitCurrentTokenWithParseError with :data as argument.
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected > character. Expected DOCTYPE name.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file. Expected DOCTYPE name.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            @currentToken[:name] = data
+            @state = @states[:doctypeName]
+        end
+        return true
+    end
+
+    def doctypeNameState
+        data = @stream.char
+        needsDoctypeCheck = false
+        if SPACE_CHARACTERS.include? data
+            @state = @states[:afterDoctypeName]
+            needsDoctypeCheck = true
+        elsif data == ">"
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        elsif data == :EOF
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in DOCTYPE name.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            # We can't just uppercase everything that arrives here. For
+            # instance, non-ASCII characters.
+            if ASCII_LOWERCASE.include? data
+                data = data.upcase
+            end
+            @currentToken[:name] += data
+            needsDoctypeCheck = true
+        end
+
+        # After some iterations through this state it should eventually say
+        # "HTML". Otherwise there's an error.
+        if needsDoctypeCheck and @currentToken[:name] == "HTML"
+            @currentToken[:data] = false
+        end
+        return true
+    end
+
+    def afterDoctypeNameState
+        data = @stream.char
+        if SPACE_CHARACTERS.include? data
+        elsif data == ">"
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        elsif data == :EOF
+            @currentToken[:data] = true
+            # XXX EMIT
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in DOCTYPE.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        else
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Expected space or '>'. Got '" + data + "'")})
+            @currentToken[:data] = true
+            @state = @states[:bogusDoctype]
+        end
+        return true
+    end
+
+    def bogusDoctypeState
+        data = @stream.char
+        if data == ">"
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        elsif data == :EOF
+            # XXX EMIT
+            @stream.queue.push(data)
+            @tokenQueue.push({:type => :ParseError, :data =>
+              _("Unexpected end of file in bogus doctype.")})
+            @tokenQueue.push(@currentToken)
+            @state = @states[:data]
+        end
+        return true
+    end
+
+    def _(string); string; end
+end
+
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
@ -0,0 +1,21 @@
+module HTML5lib
+module TreeBuilders
+
+  def self.getTreeBuilder(name)
+    case name.to_s.downcase
+        when 'simpletree' then
+            require 'html5lib/treebuilders/simpletree'
+            SimpleTree::TreeBuilder
+        when 'rexml' then
+            require 'html5lib/treebuilders/rexml'
+            REXMLTree::TreeBuilder
+        when 'hpricot' then
+            require 'html5lib/treebuilders/hpricot'
+            Hpricot::TreeBuilder
+        else
+            raise "Unknown TreeBuilder #{name}"
+    end
+  end
+
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
@ -0,0 +1,330 @@
+require 'html5lib/constants'
+
+#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
+
+module HTML5lib
+
+# The scope markers are inserted when entering buttons, object elements,
+# marquees, table cells, and table captions, and are used to prevent formatting
+# from "leaking" into tables, buttons, object elements, and marquees.
+Marker = nil
+
+module TreeBuilders
+module Base
+
+class Node
+    # The parent of the current node (or nil for the document node)
+    attr_accessor :parent
+
+    # a list of child nodes of the current node. This must 
+    # include all elements but not necessarily other node types
+    attr_accessor :childNodes
+
+    # A list of miscellaneous flags that can be set on the node
+    attr_accessor :_flags
+
+    def initialize(name)
+        @parent = nil
+        @childNodes = []
+        @_flags = []
+    end
+
+    # Insert node as a child of the current node
+    def appendChild(node)
+        raise NotImplementedError
+    end
+
+    # Insert data as text in the current node, positioned before the 
+    # start of node insertBefore or to the end of the node's text.
+    def insertText(data, insertBefore = nil)
+        raise NotImplementedError
+    end
+
+    # Insert node as a child of the current node, before refNode in the 
+    # list of child nodes. Raises ValueError if refNode is not a child of 
+    # the current node
+    def insertBefore(node, refNode)
+        raise NotImplementedError
+    end
+
+    # Remove node from the children of the current node
+    def removeChild(node)
+        raise NotImplementedError
+    end
+
+    # Move all the children of the current node to newParent. 
+    # This is needed so that trees that don't store text as nodes move the 
+    # text in the correct way
+    def reparentChildren(newParent)
+        #XXX - should this method be made more general?
+        @childNodes.each { |child| newParent.appendChild(child) }
+        @childNodes = []
+    end
+
+    # Return a shallow copy of the current node i.e. a node with the same
+    # name and attributes but with no parent or child nodes
+    def cloneNode
+        raise NotImplementedError
+    end
+
+    # Return true if the node has children or text, false otherwise
+    def hasContent
+        raise NotImplementedError
+    end
+end
+
+# Base treebuilder implementation
+class TreeBuilder
+
+    attr_accessor :openElements
+
+    attr_accessor :activeFormattingElements
+
+    attr_accessor :document
+
+    attr_accessor :headPointer
+
+    attr_accessor :formPointer
+
+    # Class to use for document root
+    documentClass = nil
+
+    # Class to use for HTML elements
+    elementClass = nil
+
+    # Class to use for comments
+    commentClass = nil
+
+    # Class to use for doctypes
+    doctypeClass = nil
+    
+    # Fragment class
+    fragmentClass = nil
+
+    def initialize
+        reset
+    end
+    
+    def reset
+        @openElements = []
+        @activeFormattingElements = []
+
+        #XXX - rename these to headElement, formElement
+        @headPointer = nil
+        @formPointer = nil
+
+        self.insertFromTable = false
+
+        @document = @documentClass.new
+    end
+
+    def elementInScope(target, tableVariant = false)
+        # Exit early when possible.
+        return true if @openElements[-1].name == target
+
+        # AT How about while true and simply set node to [-1] and set it to
+        # [-2] at the end...
+        @openElements.reverse.each do |element|
+            if element.name == target
+                return true
+            elsif element.name == 'table'
+                return false
+            elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
+                return false
+            elsif element.name == 'html'
+                return false
+            end
+        end
+        assert false # We should never reach this point
+    end
+
+    def reconstructActiveFormattingElements
+        # Within this algorithm the order of steps described in the
+        # specification is not quite the same as the order of steps in the
+        # code. It should still do the same though.
+
+        # Step 1: stop the algorithm when there's nothing to do.
+        return unless @activeFormattingElements
+
+        # Step 2 and step 3: we start with the last element. So i is -1.
+        i = -1
+        entry = @activeFormattingElements[i]
+        return if entry == Marker or @openElements.include?(entry)
+
+        # Step 6
+        until entry == Marker or @openElements.include?(entry)
+            # Step 5: let entry be one earlier in the list.
+            i -= 1
+            begin
+                entry = @activeFormattingElements[i]
+            rescue
+                # Step 4: at this point we need to jump to step 8. By not doing
+                # i += 1 which is also done in step 7 we achieve that.
+                break
+            end
+        end
+        while true
+            # Step 7
+            i += 1
+
+            # Step 8
+            clone = @activeFormattingElements[i].cloneNode
+
+            # Step 9
+            element = insertElement(clone.name, clone.attributes)
+
+            # Step 10
+            @activeFormattingElements[i] = element
+
+            # Step 11
+            break if element == @activeFormattingElements[-1]
+        end
+    end
+
+    def clearActiveFormattingElements
+        {} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
+    end
+
+    # Check if an element exists between the end of the active
+    # formatting elements and the last marker. If it does, return it, else
+    # return false
+    def elementInActiveFormattingElements(name)
+        @activeFormattingElements.reverse.each do |element|
+            # Check for Marker first because if it's a Marker it doesn't have a
+            # name attribute.
+            break if element == Marker
+            return element if element.name == name
+        end
+        return false
+    end
+
+    def insertDoctype(name)
+        @document.appendChild(@doctypeClass.new(name))
+    end
+
+    def insertComment(data, parent = nil)
+        parent = @openElements[-1] if parent.nil?
+        parent.appendChild(@commentClass.new(data))
+    end
+                           
+    # Create an element but don't insert it anywhere
+    def createElement(name, attributes)
+        element = @elementClass.new(name)
+        element.attributes = attributes
+        return element
+    end
+
+    # Switch the function used to insert an element from the
+    # normal one to the misnested table one and back again
+    def insertFromTable=(value)
+        @insertFromTable = value
+        @insertElement = value ? :insertElementTable : :insertElementNormal
+    end
+
+    def insertElement(name, attributes)
+        send(@insertElement, name, attributes)
+    end
+
+    def insertElementNormal(name, attributes)
+        element = @elementClass.new(name)
+        element.attributes = attributes
+        @openElements[-1].appendChild(element)
+        @openElements.push(element)
+        return element
+    end
+
+    # Create an element and insert it into the tree
+    def insertElementTable(name, attributes)
+        element = @elementClass.new(name)
+        element.attributes = attributes
+        if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
+            #We should be in the InTable mode. This means we want to do
+            #special magic element rearranging
+            parent, insertBefore = getTableMisnestedNodePosition
+            if insertBefore.nil?
+                parent.appendChild(element)
+            else
+                parent.insertBefore(element, insertBefore)
+            end
+            @openElements.push(element)
+        else
+            return insertElementNormal(name, attributes)
+        end
+        return element
+    end
+
+    def insertText(data, parent = nil)
+        parent = @openElements[-1] if parent.nil?
+
+        if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
+            parent.insertText(data)
+        else
+            #We should be in the InTable mode. This means we want to do
+            #special magic element rearranging
+            parent, insertBefore = getTableMisnestedNodePosition
+            parent.insertText(data, insertBefore)
+        end
+    end
+            
+    # Get the foster parent element, and sibling to insert before
+    # (or nil) when inserting a misnested table node
+    def getTableMisnestedNodePosition
+        #The foster parent element is the one which comes before the most
+        #recently opened table element
+        #XXX - this is really inelegant
+        lastTable = nil
+        fosterParent = nil
+        insertBefore = nil
+        @openElements.reverse.each do |element|
+            if element.name == "table"
+                lastTable = element
+                break
+            end
+        end
+        if lastTable
+            #XXX - we should really check that this parent is actually a
+            #node here
+            if lastTable.parent
+                fosterParent = lastTable.parent
+                insertBefore = lastTable
+            else
+                fosterParent = @openElements[@openElements.index(lastTable) - 1]
+            end
+        else
+            fosterParent = @openElements[0]
+        end
+        return fosterParent, insertBefore
+    end
+
+    def generateImpliedEndTags(exclude = nil)
+        name = @openElements[-1].name
+
+        if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
+            @openElements.pop
+            # XXX This is not entirely what the specification says. We should
+            # investigate it more closely.
+            generateImpliedEndTags(exclude)
+        end
+    end
+
+    def getDocument
+        @document
+    end
+    
+    def getFragment
+        #assert @innerHTML
+        fragment = @fragmentClass.new
+        @openElements[0].reparentChildren(fragment)
+        return fragment
+    end
+
+    # Serialize the subtree of node in the format required by unit tests
+    # node - the node from which to start serializing
+    def testSerializer(node)
+        raise NotImplementedError
+    end
+
+end
+end
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
@ -0,0 +1,211 @@
+require 'html5lib/treebuilders/base'
+require 'hpricot'
+require 'forwardable'
+
+module HTML5lib
+module TreeBuilders
+module Hpricot
+
+class Node < Base::Node
+
+    extend Forwardable
+
+    def_delegators :@hpricot, :name
+
+    attr_accessor :hpricot
+
+    def initialize(name)
+        super(name)
+        @hpricot = self.class.hpricot_class.new name
+    end
+
+    def appendChild(node)
+        if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
+            childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
+        else
+            childNodes << node
+            hpricot.children << node.hpricot
+        end
+        node.parent = self
+    end
+
+    def removeChild(node)
+       childNodes.delete(node)
+       hpricot.children.delete_at(hpricot.children.index(node.hpricot))
+       node.parent = nil
+    end
+
+    def insertText(data, before = nil)
+        if before
+            insertBefore(TextNode.new(data), before)
+        else
+            appendChild(TextNode.new(data))
+        end
+    end
+
+    def insertBefore(node, refNode)
+        index = childNodes.index(refNode)
+        if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
+            childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
+        else
+            childNodes.insert(index, node)
+        end
+    end
+
+    def hasContent
+        childNodes.any?
+    end
+end
+
+class Element < Node
+    def self.hpricot_class
+        ::Hpricot::Elem
+    end
+
+    def initialize(name)
+        super(name)
+
+        @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
+    end
+
+    def name
+        @hpricot.stag.name
+    end
+
+    def cloneNode
+        attributes.inject(self.class.new(name)) do |node, (name, value)|
+            node.hpricot[name] = value
+            node
+        end
+    end
+
+    # A call to Hpricot::Elem#raw_attributes is built dynamically,
+    # so alterations to the returned value (a hash) will be lost.
+    #
+    # AttributeProxy works around this by forwarding :[]= calls
+    # to the raw_attributes accessor on the element start tag.
+    #
+    class AttributeProxy
+        def initialize(hpricot)
+            @hpricot = hpricot
+        end
+        def []=(k, v)
+            @hpricot.stag.send(stag_attributes_method)[k] = v
+        end
+        def stag_attributes_method
+            # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
+            @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
+        end
+        def method_missing(*a, &b)
+            @hpricot.attributes.send(*a, &b)
+        end
+    end
+
+    def attributes
+        AttributeProxy.new(@hpricot)
+    end
+
+    def attributes=(attrs)
+        attrs.each { |name, value| @hpricot[name] = value }
+    end
+
+    def printTree(indent = 0)
+        tree = "\n|#{' ' * indent}<#{name}>"
+        indent += 2
+        attributes.each do |name, value|
+            next if name == 'xmlns'
+            tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
+        end
+        childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
+    end
+end
+
+class Document < Node
+    def self.hpricot_class
+        ::Hpricot::Doc
+    end
+
+    def initialize
+        super(nil)
+    end
+
+    def printTree(indent = 0)
+        childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
+    end
+end
+
+# Doctype node wrapping an ::Hpricot::DocType.
+class DocumentType < Node
+    def self.hpricot_class
+        ::Hpricot::DocType
+    end
+
+    # Runs the inherited initializer for its bookkeeping and then builds the
+    # wrapped DocType explicitly with three arguments.
+    def initialize(name)
+        begin
+            super(name)
+        # NOTE(review): presumably raised because the inherited initializer
+        # calls hpricot_class.new with one argument while DocType.new wants
+        # three (see the call below) -- confirm against the Hpricot version
+        # in use.
+        rescue ArgumentError # needs 3...
+        end
+
+        @hpricot = ::Hpricot::DocType.new(name, nil, nil)
+    end
+
+    # Render the doctype as a single "|"-prefixed line.
+    def printTree(indent = 0)
+        "\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
+    end
+end
+
+class DocumentFragment < Element
+    def initialize
+        super('')
+    end
+
+    def printTree(indent = 0)
+        childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
+    end
+end
+
+class TextNode < Node
+    def initialize(data)
+        @hpricot = ::Hpricot::Text.new(data)
+    end
+
+    def printTree(indent = 0)
+        "\n|#{' ' * indent}\"#{hpricot.content}\""
+    end
+end
+
+class CommentNode < Node
+    def self.hpricot_class
+        ::Hpricot::Comment
+    end
+
+    def printTree(indent = 0)
+        "\n|#{' ' * indent}<!-- #{hpricot.content} -->"
+    end
+end
+
+class TreeBuilder < Base::TreeBuilder
+    def initialize
+        @documentClass = Document
+        @doctypeClass = DocumentType
+        @elementClass = Element
+        @commentClass = CommentNode
+        @fragmentClass = DocumentFragment
+    end
+
+    def testSerializer(node)
+        node.printTree
+    end
+
+    def getDocument
+        @document.hpricot
+    end
+
+    def getFragment
+        @document = super
+        return @document.hpricot.children
+    end
+end
+
+end
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
@ -0,0 +1,191 @@
+require 'html5lib/treebuilders/base'
+require 'rexml/document'
+require 'forwardable'
+
+module HTML5lib
+module TreeBuilders
+module REXMLTree
+
+# Base class for the REXML-backed tree nodes.  Each instance keeps the
+# builder-side child list (childNodes) and a wrapped REXML object (rxobj);
+# name and attributes are delegated to that object.  Structural changes are
+# applied to both so the two trees stay in step.
+class Node < Base::Node
+    extend Forwardable
+    def_delegators :@rxobj, :name, :attributes
+    attr_accessor :rxobj
+
+    # Runs the inherited initializer, then builds the wrapped REXML object from
+    # the class returned by the subclass's rxclass.
+    def initialize name
+        super name
+        @rxobj = self.class.rxclass.new name
+    end
+
+    # Appends node as the last child.  When node and the current last child are
+    # both TextNodes, node's text is merged into the existing child instead of
+    # being stored as a second text node.
+    def appendChild node
+        if node.kind_of? TextNode and
+          childNodes.length>0 and childNodes[-1].kind_of? TextNode
+            childNodes[-1].rxobj.value =
+              childNodes[-1].rxobj.to_s + node.rxobj.to_s
+            childNodes[-1].rxobj.raw = true
+        else
+            childNodes.push node
+            rxobj.add node.rxobj
+        end
+        node.parent = self
+    end
+
+    # Detaches node from this parent, in both childNodes and the REXML tree.
+    def removeChild node
+       childNodes.delete node
+       rxobj.delete node.rxobj
+       node.parent = nil
+    end
+
+    # Inserts data as text: before the child `before` when given, otherwise at
+    # the end.
+    def insertText data, before=nil
+        if before
+            insertBefore TextNode.new(data), before
+        else
+            appendChild TextNode.new(data)
+        end
+    end
+
+    # Inserts node immediately before the existing child refNode.  A TextNode
+    # that would land directly after another TextNode is merged into it.
+    def insertBefore node, refNode
+        index = childNodes.index(refNode)
+        if node.kind_of? TextNode and index>0 and
+          childNodes[index-1].kind_of? TextNode
+            childNodes[index-1].rxobj.value =
+              childNodes[index-1].rxobj.to_s + node.rxobj.to_s
+            childNodes[index-1].rxobj.raw = true
+        else
+            childNodes.insert index, node
+            # Mirror the insertion in the REXML tree; without this the node
+            # exists only in childNodes and is missing from getDocument output.
+            rxobj.insert_before refNode.rxobj, node.rxobj
+        end
+        # Same bookkeeping as appendChild, so removeChild via parent works.
+        node.parent = self
+    end
+
+    # True when this node has at least one child.
+    def hasContent
+        return (childNodes.length > 0)
+    end
+end
+
+# REXML-backed element node.
+class Element < Node
+    # REXML class wrapped by this node type.
+    def self.rxclass
+        REXML::Element
+    end
+
+    def initialize name
+        super name
+    end
+
+    # Returns a new node of the same class and name carrying a copy of this
+    # element's attributes; children are not copied.
+    def cloneNode
+        copy = self.class.new name
+        attributes.each do |attr_name, attr_value|
+            copy.attributes[attr_name] = attr_value
+        end
+        copy
+    end
+
+    # Copies every name/value pair of value onto the wrapped REXML element.
+    def attributes= value
+        value.each do |attr_name, attr_value|
+            rxobj.attributes[attr_name] = attr_value
+        end
+    end
+
+    # Returns the dump of this element: the tag line, one line per attribute
+    # (the 'xmlns' attribute is omitted), then every child, all rendered two
+    # columns deeper than the tag.
+    def printTree indent=0
+        dump = "\n|#{' ' * indent}<#{name}>"
+        inner = indent + 2
+        attributes.each do |attr_name, attr_value|
+            next if attr_name == 'xmlns'
+            dump += "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
+        end
+        childNodes.each do |child|
+            dump += child.printTree(inner)
+        end
+        dump
+    end
+end
+
+# REXML-backed root node of a parsed document.
+class Document < Node
+    # REXML class wrapped by this node type.
+    def self.rxclass
+        REXML::Document
+    end
+
+    def initialize
+        super nil
+    end
+
+    # Appends node; an Element named 'html' first receives the XHTML namespace
+    # on its wrapped REXML object.
+    def appendChild node
+       is_html_root = (node.kind_of?(Element) && node.name == 'html')
+       node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') if is_html_root
+       super node
+    end
+
+    # Returns "#document" followed by the dump of every child, each rendered
+    # two columns deeper than indent.
+    def printTree indent=0
+        dump = "#document"
+        childNodes.each do |child|
+            dump += child.printTree(indent + 2)
+        end
+        dump
+    end
+end
+
+# REXML-backed doctype node.
+class DocumentType < Node
+    # REXML class wrapped by this node type.
+    def self.rxclass
+        REXML::DocType
+    end
+
+    # Returns the one-line "<!DOCTYPE name>" entry for the tree dump.
+    def printTree indent=0
+        padding = ' ' * indent
+        "\n|%s<!DOCTYPE %s>" % [padding, name]
+    end
+end
+
+# Fragment container: an Element created with a nil name whose dump consists
+# solely of its children.
+class DocumentFragment < Element
+    def initialize
+        super nil
+    end
+
+    # Returns the concatenated dumps of all children, each rendered two
+    # columns deeper than indent; empty string when there are no children.
+    def printTree indent=0
+        childNodes.inject("") do |dump, child|
+            dump + child.printTree(indent+2)
+        end
+    end
+end
+
+# REXML-backed text node.
+class TextNode < Node
+    # Escapes &, < and > in data (in that order) and wraps the result in a
+    # REXML::Text created in raw mode; the inherited initializer is not invoked.
+    def initialize data
+        escaped = data.gsub('&','&amp;')
+        escaped = escaped.gsub('<','&lt;')
+        escaped = escaped.gsub('>','&gt;')
+        @rxobj = REXML::Text.new(escaped, true, nil, true)
+    end
+
+    # Returns the dump line for this node: its value in double quotes.
+    def printTree indent=0
+        padding = ' ' * indent
+        "\n|%s\"%s\"" % [padding, rxobj.value]
+    end
+end
+
+# REXML-backed comment node.
+class CommentNode < Node
+    # REXML class wrapped by this node type.
+    def self.rxclass
+        REXML::Comment
+    end
+
+    # Returns the dump line for this node: its text inside "<!-- -->".
+    def printTree indent=0
+        padding = ' ' * indent
+        "\n|%s<!-- %s -->" % [padding, rxobj.string]
+    end
+end
+
+# Tree builder that produces REXML objects.
+class TreeBuilder < Base::TreeBuilder
+    # Registers the node classes of this module with the builder.
+    def initialize
+        @documentClass, @doctypeClass = Document, DocumentType
+        @elementClass,  @commentClass = Element, CommentNode
+        @fragmentClass = DocumentFragment
+    end
+
+    # Returns the textual tree dump of node.
+    def testSerializer node
+        node.printTree
+    end
+
+    # Returns the REXML object wrapped by the current document.
+    def getDocument
+        @document.rxobj
+    end
+
+    # Stores the fragment produced by the inherited getFragment as the current
+    # document and returns the children of its wrapped REXML object.
+    def getFragment
+        fragment = super
+        @document = fragment
+        fragment.rxobj.children
+    end
+end
+
+end
+end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
@ -0,0 +1,178 @@
+require 'html5lib/treebuilders/base'
+
+module HTML5lib
+module TreeBuilders
+module SimpleTree
+
+# Plain-Ruby tree node: holds a name, an optional value and an attribute
+# hash, with children kept in childNodes.
+class Node < Base::Node
+    # Node representing an item in the tree.
+    # name - The tag name associated with the node
+    attr_accessor :name
+
+    # The value of the current node (applies to text nodes and
+    # comments)
+    attr_accessor :value
+
+    # a dict holding name, value pairs for attributes of the node
+    attr_accessor :attributes
+
+    # Runs the inherited initializer with the same argument, then starts with
+    # no value and an empty attribute hash.
+    def initialize name
+        super
+        @name = name
+        @value = nil
+        @attributes = {}
+    end
+
+    # Appends node as the last child.  When node and the current last child are
+    # both TextNodes, node's value is appended to the existing child instead.
+    def appendChild node
+        if node.kind_of? TextNode and
+          childNodes.length>0 and childNodes[-1].kind_of? TextNode
+            childNodes[-1].value += node.value
+        else
+            childNodes.push node
+        end
+        node.parent = self
+    end
+
+    # Removes node from childNodes and clears its parent.
+    def removeChild node
+       childNodes.delete node
+       node.parent = nil
+    end
+
+    # Returns a new node of the same class with the same name, attributes and
+    # value; children are not copied.
+    def cloneNode
+        newNode = self.class.new name
+        attributes.each {|attr_name, attr_value| newNode.attributes[attr_name] = attr_value}
+        newNode.value = value
+        newNode
+    end
+
+    # Inserts data as text: before the child `before` when given, otherwise at
+    # the end.
+    def insertText data, before=nil
+        if before
+            insertBefore TextNode.new(data), before
+        else
+            appendChild TextNode.new(data)
+        end
+    end
+
+    # Inserts node immediately before the existing child refNode.  A TextNode
+    # that would land directly after another TextNode is merged into it.
+    def insertBefore node, refNode
+        index = childNodes.index(refNode)
+        if node.kind_of? TextNode and index>0 and
+          childNodes[index-1].kind_of? TextNode
+            childNodes[index-1].value += node.value
+        else
+            childNodes.insert index, node
+        end
+        # Same bookkeeping as appendChild; without it an inserted node keeps a
+        # stale or nil parent.
+        node.parent = self
+    end
+
+    # Returns the dump of this node (its to_s) followed by every child rendered
+    # two columns deeper.
+    def printTree indent=0
+        tree = "\n|%s%s" % [' '* indent, self.to_s]
+        childNodes.each do |child|
+            tree += child.printTree(indent + 2)
+        end
+        return tree
+    end
+
+    # True when this node has at least one child.
+    def hasContent
+        return (childNodes.length > 0)
+    end
+end
+
+# Element node of the simple tree.
+class Element < Node
+    # Renders the element as "<name>".
+    def to_s
+       "<%s>" % name
+    end
+
+    # Returns the dump of this element: the tag line, one line per attribute,
+    # then every child, all rendered two columns deeper than the tag.
+    def printTree indent=0
+        dump = "\n|%s%s" % [' '* indent, self.to_s]
+        inner = indent + 2
+        attributes.each do |attr_name, attr_value|
+            dump += "\n|%s%s=\"%s\"" % [' ' * inner, attr_name, attr_value]
+        end
+        childNodes.each do |child|
+            dump += child.printTree(inner)
+        end
+        dump
+    end
+end
+
+# Root node of the simple tree.
+class Document < Node
+    # Fixed label used for the root in tree dumps.
+    def to_s
+       "#document"
+    end
+
+    def initialize
+        super nil
+    end
+
+    # Returns the root label followed by the dump of every child, each
+    # rendered two columns deeper than indent.
+    def printTree indent=0
+        childNodes.inject(to_s) do |dump, child|
+            dump + child.printTree(indent + 2)
+        end
+    end
+end
+
+# Doctype node of the simple tree.
+class DocumentType < Node
+    # Renders the doctype as "<!DOCTYPE name>".
+    def to_s
+       "<!DOCTYPE %s>" % name
+    end
+end
+
+# Fragment container: an Element created with a nil name whose dump consists
+# solely of its children.
+class DocumentFragment < Element
+    def initialize
+        super nil
+    end
+
+    # Returns the concatenated dumps of all children, each rendered two
+    # columns deeper than indent; empty string when there are no children.
+    def printTree indent=0
+        childNodes.inject("") do |dump, child|
+            dump + child.printTree(indent+2)
+        end
+    end
+end
+
+# Text node of the simple tree; the text is kept in value.
+class TextNode < Node
+    # value - the text held by this node (the node itself has no name).
+    def initialize value
+        super nil
+        self.value = value
+    end
+
+    # Renders the text wrapped in double quotes.
+    def to_s
+       '"%s"' % value
+    end
+end
+
+# Comment node of the simple tree; the comment text is kept in value.
+class CommentNode < Node
+    # value - the comment text (the node itself has no name).
+    def initialize value
+        super nil
+        self.value = value
+    end
+
+    # Renders the comment as "<!-- text -->".
+    def to_s
+        "<!-- %s -->" % value
+    end
+end
+
+# Tree builder that produces the plain-Ruby simple tree.
+class TreeBuilder < Base::TreeBuilder
+    # Registers the node classes of this module with the builder.
+    def initialize
+        @documentClass, @doctypeClass = Document, DocumentType
+        @elementClass,  @commentClass = Element, CommentNode
+        @fragmentClass = DocumentFragment
+    end
+
+    # Returns the textual tree dump of node.
+    def testSerializer node
+        node.printTree
+    end
+
+    # Stores the fragment produced by the inherited getFragment as the current
+    # document and returns its child list.
+    def getFragment
+        fragment = super
+        @document = fragment
+        fragment.childNodes
+    end
+end
+
+end
+end
+end
--- a/vendor/plugins/HTML5lib/tests/preamble.rb
+++ b/vendor/plugins/HTML5lib/tests/preamble.rb
@ -0,0 +1,11 @@
+require 'test/unit'
+
+HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
+
+$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
+
+$:.unshift File.dirname(__FILE__)
+
+# Returns the paths under HTML5LIB_BASE/tests/<subdirectory> that match the
+# glob "*.*" (i.e. entries whose name contains a dot).
+def html5lib_test_files(subdirectory)
+    pattern = File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')
+    Dir.glob(pattern)
+end
--- a/vendor/plugins/HTML5lib/tests/test_encoding.rb
+++ b/vendor/plugins/HTML5lib/tests/test_encoding.rb
@ -0,0 +1,36 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/inputstream'
+
+# Encoding-detection tests for HTMLInputStream.  Most test methods are
+# generated at class-load time from the data files returned by
+# html5lib_test_files('encoding').
+class Html5EncodingTestCase < Test::Unit::TestCase
+
+# test_chardet is only defined when the UniversalDetector library can be
+# loaded; a LoadError from either require skips the definition and prints a
+# notice instead.
+begin
+    require 'rubygems'
+    require 'UniversalDetector'
+
+    # Feeds a Big5 sample file to the stream with chardet enabled and expects
+    # the detected encoding to be 'big5' (case-insensitively).
+    def test_chardet
+        File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
+            stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
+            assert_equal 'big5', stream.charEncoding.downcase
+        end
+    end
+rescue LoadError
+    puts "chardet not found, skipping chardet tests"
+end
+
+    # One test method per "#data" section of each data file; the method name
+    # is built from the file name (".dat" and "-" removed) and the section index.
+    html5lib_test_files('encoding').each do |test_file|        
+        test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
+
+        File.read(test_file).split("#data\n").each_with_index do |data, index|
+            next if data.empty?
+            # Everything before "#encoding" is the input; the first word after
+            # it is the expected encoding name.
+            input, encoding = data.split(/\n#encoding\s+/, 2)
+            encoding = encoding.split[0]
+
+            define_method 'test_%s_%d' % [ test_name, index + 1 ] do
+                stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
+                assert_equal encoding.downcase, stream.charEncoding.downcase, input
+            end
+        end
+    end
+
+end
--- a/vendor/plugins/HTML5lib/tests/test_lxp.rb
+++ b/vendor/plugins/HTML5lib/tests/test_lxp.rb
@ -0,0 +1,212 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/liberalxmlparser'
+
+# Matches a start or empty-element tag carrying at least one double-quoted
+# attribute: $1 = tag name plus trailing whitespace, $2 = the attribute list,
+# $3 = the optional "/" of an empty-element tag.
+XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
+# NOTE: this is a single-quoted literal, so "#{...}" is never interpolated;
+# passed to gsub as a replacement it would substitute this literal text for
+# every matched tag.  It is left defined so existing references still resolve,
+# but sort_xml_attributes below performs the actual normalisation.
+SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
+
+# Returns xml with the whitespace-separated attribute tokens of every tag
+# matched by XMLELEM put in sorted order, so that two serialisations that
+# differ only in attribute order compare equal.
+def sort_xml_attributes(xml)
+    xml.gsub(XMLELEM) { "<#{$1}#{$2.split.sort.join(' ')}#{$3}>" }
+end
+
+# Parses input with parser and compares the serialised root element (single
+# quotes turned into double quotes) with the expectation.
+#
+# input    - markup to parse; a trailing newline is removed first.
+# expected - exact expected serialisation.  When omitted, the input itself is
+#            the expectation: decimal character references in it are replaced
+#            by the characters they denote and both sides are compared with
+#            their attributes sorted.
+# parser   - object responding to parse (defaults to HTML5lib::XMLParser).
+def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
+    document = parser.parse(input.chomp).root
+    actual = document.to_s.gsub(/'/,'"')
+    if expected
+        assert_equal(expected, actual)
+    else
+        expected = sort_xml_attributes(input.chomp)
+        expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
+        assert_equal(expected, sort_xml_attributes(actual))
+    end
+end
+
+# Same check as assert_xml_equal, with HTML5lib::XHTMLParser as the default
+# parser.
+def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
+      assert_xml_equal(input, expected, parser)
+end
+
+# XHTML parser checks on bare fragments: the expected output wraps the
+# content in html/head/body with the XHTML namespace.
+class BasicXhtml5Test < Test::Unit::TestCase
+
+  # Mis-nested </b></i> end tags are expected to come out properly nested.
+  def test_title_body_mismatched_close
+    assert_xhtml_equal(
+      '<title>Xhtml</title><b><i>content</b></i>',
+      '<html xmlns="http://www.w3.org/1999/xhtml">' +
+        '<head><title>Xhtml</title></head>' + 
+        '<body><b><i>content</i></b></body>' +
+      '</html>')
+  end
+
+  # "&mdash" without a terminating semicolon is expected to become U+2014.
+  def test_title_body_named_charref
+    assert_xhtml_equal(
+      '<title>mdash</title>A &mdash B',
+      '<html xmlns="http://www.w3.org/1999/xhtml">' +
+        '<head><title>mdash</title></head>' + 
+        '<body>A '+ [0x2014].pack('U') + ' B</body>' +
+      '</html>')
+  end
+end
+
+# Small round-trip checks for the liberal XML parser.
+class BasicXmlTest < Test::Unit::TestCase
+
+  # No explicit expectation: the input itself must round-trip.
+  def test_comment
+    assert_xml_equal("<x><!-- foo --></x>")
+  end
+
+  # A CDATA section is expected to serialise as plain text.
+  def test_cdata
+    assert_xml_equal("<x><![CDATA[foo]]></x>","<x>foo</x>")
+  end
+
+  def test_simple_text
+    assert_xml_equal("<p>foo</p>","<p>foo</p>")
+  end
+
+  # A missing end tag is expected to be supplied.
+  def test_optional_close
+    assert_xml_equal("<p>foo","<p>foo</p>")
+  end
+
+  # Mis-nested end tags are expected to come out properly nested.
+  def test_html_mismatched
+    assert_xml_equal("<b><i>foo</b></i>","<b><i>foo</i></b>")
+  end
+end
+
+# Liberal XML parser checks using OPML-style markup.
+class OpmlTest < Test::Unit::TestCase
+
+  # The mixed-case element name ownerName must round-trip unchanged.
+  def test_mixedCaseElement
+    assert_xml_equal(
+      '<opml version="1.0">' +
+        '<head><ownerName>Dave Winer</ownerName></head>' +
+      '</opml>')
+  end
+
+  # The mixed-case attribute name isComment must round-trip unchanged.
+  def test_mixedCaseAttribute
+    assert_xml_equal(
+      '<opml version="1.0">' +
+        '<body><outline isComment="true"/></body>' +
+      '</opml>')
+  end
+
+  # A bare "&" inside an attribute value is expected to come out as "&amp;".
+  def test_malformed
+    assert_xml_equal(
+      '<opml version="1.0">' +
+        '<body><outline text="Odds & Ends"/></body>' +
+      '</opml>',
+      '<opml version="1.0">' +
+        '<body><outline text="Odds &amp; Ends"/></body>' +
+      '</opml>')
+  end
+end
+
+# Round-trip checks for complete XHTML documents; every test passes only the
+# input, so the document itself is the expectation.
+class XhtmlTest < Test::Unit::TestCase
+
+  # Document embedding MathML, including decimal character references.
+  def test_mathml
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>MathML</title></head>
+<body>
+  <math xmlns="http://www.w3.org/1998/Math/MathML">
+    <mrow>
+      <mi>x</mi>
+      <mo>=</mo>
+
+      <mfrac>
+        <mrow>
+          <mrow>
+            <mo>-</mo>
+            <mi>b</mi>
+          </mrow>
+          <mo>&#177;</mo>
+          <msqrt>
+
+            <mrow>
+              <msup>
+                <mi>b</mi>
+                <mn>2</mn>
+              </msup>
+              <mo>-</mo>
+              <mrow>
+
+                <mn>4</mn>
+                <mo>&#8290;</mo>
+                <mi>a</mi>
+                <mo>&#8290;</mo>
+                <mi>c</mi>
+              </mrow>
+            </mrow>
+
+          </msqrt>
+        </mrow>
+        <mrow>
+          <mn>2</mn>
+          <mo>&#8290;</mo>
+          <mi>a</mi>
+        </mrow>
+      </mfrac>
+
+    </mrow>
+  </math>
+</body></html>
+EOX
+  end
+
+  # Document embedding SVG with multi-attribute elements.
+  def test_svg
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>SVG</title></head>
+<body>
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
+    <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
+             c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
+    </path>
+    <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
+    </circle>
+
+  </svg>
+</body></html>
+EOX
+  end
+
+  # SVG using a prefixed XLink namespace (l:href) and empty-element tags.
+  def test_xlink
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
+    <defs xmlns:l="http://www.w3.org/1999/xlink">
+      <radialGradient id="s1" fx=".4" fy=".2" r=".7">
+        <stop stop-color="#FE8"/>
+        <stop stop-color="#D70" offset="1"/>
+      </radialGradient>
+      <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
+      <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
+      <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
+    </defs>
+    <g stroke="#940">
+      <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
+      <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
+      <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
+
+      <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
+      <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
+      <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
+    </g>
+  </svg>
+</body></html>
+EOX
+  end
+
+  # An empty-element <br/> must round-trip.
+  def test_br
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+<br/>
+</body></html>
+EOX
+  end
+
+  # Not collected by Test::Unit: the name does not start with "test".
+  # NOTE(review): presumably disabled on purpose (empty <strong> element) --
+  # confirm before renaming.
+  def xtest_strong
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+<strong></strong>
+</body></html>
+EOX
+  end
+end
--- a/vendor/plugins/HTML5lib/tests/test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/test_parser.rb
@ -0,0 +1,108 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/treebuilders'
+require 'html5lib/html5parser'
+
+
+# Names of the tree builders every parser test is run against.
+$tree_types_to_test = ['simpletree', 'rexml']
+
+# The hpricot builder is added only when the hpricot library can be loaded;
+# a LoadError is ignored on purpose.
+begin
+    require 'hpricot'
+    $tree_types_to_test.push('hpricot')
+rescue LoadError
+end
+
+# When true, the generated tests also compare the number of parse errors.
+$CHECK_PARSER_ERRORS = false
+
+puts 'Testing: ' + $tree_types_to_test * ', '
+
+
+# Tree-construction tests.  One test method is generated at class-load time
+# for every "#data" section of every tree-construction data file and for
+# every tree builder named in $tree_types_to_test.
+class Html5ParserTestCase < Test::Unit::TestCase
+
+    # True when string b begins with a (the prefix is the FIRST argument).
+    def self.startswith?(a, b)
+        b[0... a.length] == a
+    end
+
+    # Splits one "#data" section into its parts.
+    #
+    # Lines are collected as input until "#errors", as errors until
+    # "#document" / "#document-fragment ...", and as expected output
+    # afterwards; output lines starting with "|" lose their first two
+    # characters.  Empty lines and other lines starting with one of the
+    # section markers are dropped.
+    #
+    # Returns [innerHTML, input, output, errors]: innerHTML is the text after
+    # "#document-fragment " (nil for whole-document tests), input and output
+    # are newline-joined strings, errors is an array of lines.
+    # Raises ArgumentError when "#document-fragment" names no context.
+    def self.parseTestcase(data)
+        innerHTML = nil
+        input = []
+        output = []
+        errors = []
+        currentList = input
+        data.split(/\n/).each do |line|
+            if !line.empty? and !startswith?("#errors", line) and
+              !startswith?("#document", line) and
+              !startswith?("#data", line) and
+              !startswith?("#document-fragment", line)
+
+                # Identity, not ==: while both lists are still empty, input and
+                # output compare equal by value.
+                if currentList.equal?(output) and startswith?("|", line)
+                    currentList.push(line[2..-1])
+                else
+                    currentList.push(line)
+                end
+            elsif line == "#errors"
+                currentList = errors
+            elsif line == "#document" or startswith?("#document-fragment", line)
+                if startswith?("#document-fragment", line)
+                    # "#document-fragment " is 19 characters long.
+                    innerHTML = line[19..-1]
+                    raise ArgumentError, "#document-fragment without a context element" unless innerHTML
+                end
+                currentList = output
+            end
+        end
+        return innerHTML, input.join("\n"), output.join("\n"), errors
+    end
+
+    # convert the output of str(document) to the format used in the testcases:
+    # the first line is dropped and lines starting with "|" lose their first
+    # three characters.
+    def convertTreeDump(treedump)
+        treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
+    end
+
+    # Sorts each run of consecutive, equally indented "name=..." lines so that
+    # attribute order does not influence the comparison.
+    def sortattrs(output)
+        output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
+    end
+
+    html5lib_test_files('tree-construction').each do |test_file|
+
+        test_name = File.basename(test_file).sub('.dat', '')
+
+        File.read(test_file).split("#data\n").each_with_index do |data, index|
+            next if data.empty?
+
+            innerHTML, input, expected_output, expected_errors = parseTestcase(data)
+
+            $tree_types_to_test.each do |tree_name|
+                define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
+
+                    parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
+
+                    if innerHTML
+                        parser.parseFragment(input, innerHTML)
+                    else
+                        parser.parse(input)
+                    end
+
+                    actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
+
+                    assert_equal sortattrs(expected_output), sortattrs(actual_output), [
+                        'Input:', input,
+                        'Expected:', expected_output,
+                        'Recieved:', actual_output
+                    ].join("\n")
+
+                    if $CHECK_PARSER_ERRORS
+                        actual_errors = parser.errors.map do |(line, col), message|
+                            'Line: %i Col: %i %s' % [line, col, message]
+                        end
+                        # assert_equal takes the expected value first.
+                        assert_equal expected_errors.length, parser.errors.length, [
+                            'Expected errors:', expected_errors.join("\n"),
+                            'Actual errors:', actual_errors.join("\n")
+                        ].join("\n")
+                    end
+
+                end
+            end
+        end
+    end
+
+end
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@ -0,0 +1,206 @@
+#!/usr/bin/env ruby
+
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/sanitizer'
+require 'html5lib/html5parser'
+require 'html5lib/liberalxmlparser'
+
+# Tests for HTMLSanitizer used as the tokenizer of the HTML and XHTML fragment
+# parsers.  Many tests are generated at class-load time from the sanitizer's
+# ALLOWED_* lists.
+class SanitizeTest < Test::Unit::TestCase
+  include HTML5lib
+
+  # Parses stream as an XHTML fragment through the sanitizer and returns the
+  # joined result with single quotes turned into double quotes.
+  def sanitize_xhtml stream
+    XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+  end
+
+  # Same as sanitize_xhtml, using the HTML parser.
+  def sanitize_html stream
+    HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+  end
+
+  # Allowed elements survive while an unknown <bad> child is escaped.
+  # 'image' is expected to come out as <img/>, and void elements as
+  # empty-element tags followed by the content.
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
+    define_method "test_should_allow_#{tag_name}_tag" do
+      if tag_name == 'image'
+        assert_equal "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
+          sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+      elsif VOID_ELEMENTS.include?(tag_name)
+        assert_equal "<#{tag_name} title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
+          sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+      else
+        assert_equal "<#{tag_name.downcase} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>",
+          sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+        assert_equal "<#{tag_name} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>",
+          sanitize_xhtml("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+      end
+    end
+  end
+
+  # Upper-cased element names are expected to be escaped rather than passed.
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+      assert_equal "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;",
+        sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
+    end
+  end
+
+  # Allowed attributes survive ('style' is skipped here and covered by the
+  # dedicated style tests further down).
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    next if attribute_name == 'style'
+    define_method "test_should_allow_#{attribute_name}_attribute" do
+      assert_equal "<p #{attribute_name.downcase}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
+        sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
+      assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
+        sanitize_xhtml("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
+    end
+  end
+
+  # Upper-cased attribute names are expected to be dropped.
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+      assert_equal "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
+        sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
+    end
+  end
+
+  # href values equal to an allowed protocol name pass, in lower case ...
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol}">foo</a>))
+    end
+  end
+
+  # ... and in upper case.
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
+    end
+  end
+
+  # onclick is dropped and the nested <script> is escaped.
+  def test_should_allow_anchors
+    assert_equal "<a href=\"foo\">&lt;script&gt;baz&lt;/script&gt;</a>",
+     sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
+  end
+
+  # RFC 3986, sec 4.2
+  def test_allow_colons_in_path_component
+    assert_equal "<a href=\"./this:that\">foo</a>",
+      sanitize_html("<a href=\"./this:that\">foo</a>")
+  end
+
+  %w(src width height alt).each do |img_attr|
+    define_method "test_should_allow_image_#{img_attr}_attribute" do
+      assert_equal "<img #{img_attr}=\"foo\"/>",
+        sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
+    end
+  end
+
+  def test_should_handle_non_html
+    assert_equal 'abc',  sanitize_html("abc")
+  end
+
+  def test_should_handle_blank_text
+    assert_equal '', sanitize_html('')
+  end
+
+  # URI attributes with a javascript: value are dropped, with or without a
+  # leading space; `close` is the expected tail for void vs. non-void tags.
+  [%w(img src), %w(a href)].each do |(tag, attr)|
+    close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo</#{tag}>"
+
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
+      assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
+    end
+
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
+      assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
+    end
+  end
+
+  # Obfuscated javascript: URLs in img src; each must reduce to a bare <img/>.
+  [%(<img src="javascript:alert('XSS');" />),
+   %(<img src=javascript:alert('XSS') />),
+   %(<img src="JaVaScRiPt:alert('XSS')" />),
+   %(<img src='javascript:alert(&quot;XSS&quot;)' />),
+   %(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
+   %(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
+   %(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
+   %(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
+   %(<img src="jav\tascript:alert('XSS');" />),
+   %(<img src="jav&#x09;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0A;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0D;ascript:alert('XSS');" />),
+   %(<img src=" &#14;  javascript:alert('XSS');" />),
+   %(<img src="&#x20;javascript:alert('XSS');" />),
+   %(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
+    define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
+      assert_equal "<img/>", sanitize_html(img_hack)
+    end
+  end
+
+  # The NUL byte is expected to come out as the bytes \357\277\275 (U+FFFD in
+  # UTF-8) inside the escaped tag.
+  def test_should_sanitize_tag_broken_up_by_null
+    assert_equal "&lt;scr\357\277\275ipt&gt;alert(\"XSS\")&lt;/scr\357\277\275ipt&gt;", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
+  end
+
+  def test_should_sanitize_invalid_script_tag
+    assert_equal "&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
+  end
+
+  def test_should_sanitize_script_tag_with_multiple_open_brackets
+    assert_equal "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;", sanitize_html(%(<<script>alert("XSS");//<</script>))
+    assert_equal %(&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
+  end
+
+  def test_should_sanitize_unclosed_script
+    assert_equal "&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
+  end
+
+  def test_should_sanitize_half_open_scripts
+    assert_equal  "<img/>", sanitize_html(%(<img src="javascript:alert('XSS')"))
+  end
+
+  # javascript: URL with a newline between every character.
+  def test_should_not_fall_for_ridiculous_hack
+    img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
+    assert_equal "<img/>", sanitize_html(img_hack)
+  end
+
+  # Inline style: only some declarations are expected to survive (position,
+  # left, top, z-index, background-image and background-repeat are absent
+  # from the expected output).
+  def test_platypus
+    assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
+       sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
+  end
+
+  # -moz-binding is expected to leave an empty style attribute.
+  def test_xul
+    assert_equal %(<p style="">fubar</p>),
+     sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
+  end
+
+  def test_input_image
+    assert_equal %(<input type="image"/>),
+      sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
+  end
+
+  def test_non_alpha_non_digit
+    assert_equal "&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
+      sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
+    assert_equal "<a>foo</a>",
+      sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
+    assert_equal "<img src=\"http://ha.ckers.org/xss.js\"/>",
+      sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
+  end
+
+  def test_img_dynsrc_lowsrc
+     assert_equal "<img/>",
+       sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
+     assert_equal "<img/>",
+       sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
+  end
+
+  # Backslash-escaped background-image value is expected to be removed.
+  def test_div_background_image_unicode_encoded
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
+  end
+
+  def test_div_expression
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
+  end
+
+  def test_img_vbscript
+     assert_equal '<img/>',
+       sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
+  end
+
+end
--- a/vendor/plugins/HTML5lib/tests/test_tokenizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_tokenizer.rb
@ -0,0 +1,78 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5lib/tokenizer'
+
+require 'tokenizer_test_parser'
+
+# Use the 'jsonx' library when it can be loaded; otherwise define a minimal
+# JSON.parse that is sufficient for the tokenizer test files.
+begin
+  require 'jsonx'
+rescue LoadError
+  class JSON
+    # Turns JSON text into Ruby data by rewriting it as a Ruby literal
+    # ("key": becomes "key"=>, \uXXXX escapes become the UTF-8 character) and
+    # evaluating the result.  The argument is left unmodified.
+    #
+    # SECURITY: eval executes the rewritten text as Ruby code.  This is only
+    # acceptable for the trusted test fixtures read by this file; never pass
+    # untrusted input.
+    def self.parse json
+      ruby_source = json.gsub(/"\s*:/, '"=>')
+      ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
+      eval ruby_source
+    end
+  end
+end 
+
+# Tokenizer tests.  One test method is generated at class-load time for every
+# entry of the 'tests' array of every tokenizer data file.
+class Html5TokenizerTestCase < Test::Unit::TestCase
+
+    # True when token is an array-style token whose first element is
+    # token_name; the bare string 'ParseError' never matches.
+    def type_of?(token_name, token)
+        return false if token == 'ParseError'
+        token_name == token.first
+    end
+
+    # Replaces, in place, the attribute pair list of every StartTag token by a
+    # hash.  The list is reversed first, so of two pairs with the same name
+    # the one listed first ends up in the hash.  Returns the tokens.
+    def convert_attribute_arrays_to_hashes(tokens)
+        tokens.map do |token|
+            if type_of?('StartTag', token)
+                token[2] = Hash[*token[2].reverse.flatten]
+            end
+            token
+        end
+    end
+    
+    # Returns the tokens with every run of consecutive Character tokens folded
+    # into the first token of the run (that token is modified in place).
+    def concatenate_consecutive_characters(tokens)
+        merged = []
+        tokens.each do |token|
+            previous = merged.last
+            if type_of?('Character', token) and previous and type_of?('Character', previous)
+                previous[1] = previous[1] + token[1]
+            else
+                merged << token
+            end
+        end
+        merged
+    end
+
+    # Runs one test description: for each content model flag (PCDATA when none
+    # is listed) the input is tokenized and the normalised token list is
+    # compared with the normalised expected output.
+    def tokenizer_test(data)
+        flags = data['contentModelFlags'] || [:PCDATA]
+        flags.each do |content_model_flag|
+            message = [
+                'Description:', data['description'],
+                'Input:', data['input'],
+                'Content Model Flag:', content_model_flag ].join("\n")
+
+            assert_nothing_raised message do
+                tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
+
+                tokenizer.contentModelFlag = content_model_flag.to_sym
+
+                if data.has_key?('lastStartTag')
+                    tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']}
+                end
+
+                tokens = TokenizerTestParser.new(tokenizer).parse
+
+                actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
+
+                expected = concatenate_consecutive_characters(data['output'])
+
+                assert_equal expected, actual, message
+            end
+        end
+    end
+
+    html5lib_test_files('tokenizer').each do |test_file|
+        test_name = File.basename(test_file).sub('.test', '')
+
+        tests = JSON.parse(File.read(test_file))['tests']
+
+        tests.each_with_index do |data, index|
+            method_name = 'test_%s_%d' % [test_name, index + 1]
+            define_method(method_name) { tokenizer_test data }
+        end
+    end
+
+end
+
--- a/vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
@ -0,0 +1,62 @@
+require 'html5lib/constants'
+
+class TokenizerTestParser
+    def initialize(tokenizer)
+        @tokenizer = tokenizer
+    end
+
+    def parse
+        @outputTokens = []
+
+        debug = nil
+        for token in @tokenizer
+            debug = token.inspect if token[:type] == :ParseError
+            send ('process' + token[:type].to_s), token
+        end
+
+        return @outputTokens
+    end
+
+    def processDoctype(token)
+        @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
+    end
+
+    def processStartTag(token)
+        @outputTokens.push(["StartTag", token[:name], token[:data]])
+    end
+
+    def processEmptyTag(token)
+        if not HTML5lib::VOID_ELEMENTS.include? token[:name]
+            @outputTokens.push("ParseError")
+        end
+        @outputTokens.push(["StartTag", token[:name], token[:data]])
+    end
+
+    def processEndTag(token)
+        if token[:data].length > 0
+            self.processParseError(token)
+        end
+        @outputTokens.push(["EndTag", token[:name]])
+    end
+
+    def processComment(token)
+        @outputTokens.push(["Comment", token[:data]])
+    end
+
+    def processCharacters(token)
+        @outputTokens.push(["Character", token[:data]])
+    end
+
+    alias processSpaceCharacters processCharacters
+
+    def processCharacters(token)
+        @outputTokens.push(["Character", token[:data]])
+    end
+
+    def processEOF(token)
+    end
+
+    def processParseError(token)
+        @outputTokens.push("ParseError")
+    end
+end
--- a/vendor/plugins/maruku/lib/maruku/defaults.rb
+++ b/vendor/plugins/maruku/lib/maruku/defaults.rb
@ -31,6 +31,9 @@ Globals = {
 	:maruku_signature => false,
 	:code_background_color => '#fef',
 	:code_show_spaces => false,
+	
+	:filter_html => false,
+	
 	:html_math_output_mathml => true, # also set :html_math_engine
 	:html_math_engine => 'itex2mml', #ritex, itex2mml, none
 	
--- a/vendor/plugins/maruku/lib/maruku/input/parse_block.rb
+++ b/vendor/plugins/maruku/lib/maruku/input/parse_block.rb
@ -477,7 +477,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
 		end
 		
 		id = match[1]; url = match[2]; title = match[3]; 
-		id = id.strip.downcase.gsub(' ','_')
+		id = sanitize_ref_id(id)
 		
 		hash = self.refs[id] = {:url=>url,:title=>title}
 		
--- a/vendor/plugins/maruku/lib/maruku/input/parse_span_better.rb
+++ b/vendor/plugins/maruku/lib/maruku/input/parse_span_better.rb
@ -287,7 +287,7 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
 	end

 	def extension_meta(src, con, break_on_chars)
-		if m = src.read_regexp(/([^\s\:]+):/)
+		if m = src.read_regexp(/([^\s\:\"\']+):/)
 			name = m[1]
 			al = read_attribute_list(src, con, break_on_chars)
 #			puts "#{name}=#{al.inspect}"
@ -581,9 +581,9 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
 			ref_id = read_ref_id(src,con)
 			if ref_id
 				if ref_id.size == 0
-					ref_id =  children.to_s.downcase.gsub(' ','_')
+					ref_id = sanitize_ref_id(children.to_s)
 				else
-					ref_id = ref_id.downcase
+					ref_id = sanitize_ref_id(ref_id)
 				end	
 				con.push_element md_link(children, ref_id)
 			else 
--- a/vendor/plugins/maruku/lib/maruku/input_textile2/t2_parser.rb
+++ b/vendor/plugins/maruku/lib/maruku/input_textile2/t2_parser.rb
@ -108,6 +108,7 @@ module MaRuKu
 		# Input is a LineSource
 		def t2_parse_blocks(src, output)
 			while src.cur_line
+				l = src.shift_line
 				
 				# ignore empty line
 				if l.t2_empty? then 
@ -115,7 +116,6 @@ module MaRuKu
 					next 
 				end
 				
-				l = src.shift_line
 				# TODO: lists
 				# TODO: xml
 				# TODO: `==`
--- a/vendor/plugins/maruku/lib/maruku/output/to_html.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/to_html.rb
@ -741,7 +741,17 @@ of the form `#ff00ff`.
 		return a
 	end

+=begin maruku_doc
+Attribute: filter_html
+Scope: document
+
+If true, raw HTML is discarded from the output.
+
+=end
+
 	def to_html_raw_html
+		return [] if get_setting(:filter_html)
+		
 		raw_html = self.raw_html
 		if rexml_doc = @parsed_html
 			root = rexml_doc.root
--- a/vendor/plugins/maruku/lib/maruku/output/to_latex.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/to_latex.rb
@ -152,7 +152,7 @@ end end
 module MaRuKu; module Out; module Latex
 	
 	def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end
-	def to_latex_linebreak; "\\linebreak " end
+	def to_latex_linebreak; "\\newline " end
 	
 	def to_latex_paragraph 
 		children_to_latex+"\n\n"
--- a/vendor/plugins/maruku/lib/maruku/string_utils.rb
+++ b/vendor/plugins/maruku/lib/maruku/string_utils.rb
@ -146,6 +146,10 @@ module MaRuKu; module Strings
 		s[0, i+1].strip
 	end
 	
+	def sanitize_ref_id(x)
+		x.downcase.gsub(' ','_').gsub(/[^\w]/,'')
+	end
+

 	# removes initial quote
 	def unquote(s)
--- a/vendor/plugins/maruku/lib/maruku/tests/new_parser.rb
+++ b/vendor/plugins/maruku/lib/maruku/tests/new_parser.rb
@ -155,7 +155,7 @@ module MaRuKu; module Tests
 		["[a]",   [ md_link(["a"],'a')], 'Empty link'],
 		["[a][]", ],
 		["[a][]b",   [ md_link(["a"],'a'),'b'], 'Empty link'],
-		["[a\\]][]", [ md_link(["a]"],'a]')], 'Escape inside link'],
+		["[a\\]][]", [ md_link(["a]"],'a')], 'Escape inside link (throw ?] away)'],
 		
 		["[a",  :throw,   'Link not closed'],
 		["[a][",  :throw,   'Ref not closed'],
--- a/vendor/plugins/maruku/lib/maruku/version.rb
+++ b/vendor/plugins/maruku/lib/maruku/version.rb
@ -19,7 +19,7 @@
 #++

 module MaRuKu
-	Version = '0.5.5'
+	Version = '0.5.6'
 	
 	MarukuURL = 'http://maruku.rubyforge.org/'