Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
2009-11-30 16:28:18 -06:00 · 2009-11-30 16:28:18 -06:00 · a6429f8c22
commit a6429f8c22
parent 79c8572053
142 changed files with 519 additions and 843 deletions
--- a/attic/vendor/plugins/HTML5lib/test/preamble.rb
+++ b/attic/vendor/plugins/HTML5lib/test/preamble.rb
@ -0,0 +1,70 @@
+require 'test/unit'
+
+# Directory three levels above this file; the test data is looked for under
+# <HTML5_BASE>/ruby/testdata first.
+HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
+
+# File.exist? instead of File.exists?: the plural spelling is deprecated and
+# was removed in Ruby 3.2, while File.exist? works on 1.8 and 1.9 alike.
+if File.exist?(File.join(HTML5_BASE, 'ruby', 'testdata'))
+  TESTDATA_DIR = File.join(HTML5_BASE, 'ruby', 'testdata')
+else
+  # Fall back to a testdata directory two levels above this file.
+  HTML5_BASE_RUBY = File.dirname(File.dirname(File.expand_path(__FILE__)))
+  TESTDATA_DIR = File.join(HTML5_BASE_RUBY, 'testdata')
+end
+
+$:.unshift File.join(File.dirname(File.dirname(__FILE__)), 'lib')
+$:.unshift File.dirname(__FILE__)
+
+def html5_test_files(subdirectory)
+  Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
+end
+
+require 'rubygems'
+require 'json'
+
+module HTML5
+  module TestSupport
+    # Converts a serialized tree dump to the layout used by the test cases:
+    # the first line is dropped, and every remaining line that is longer than
+    # two characters and starts with '|' loses its first three characters.
+    def convertTreeDump(treedump)
+      dump_lines = treedump.split(/\n/)[1..-1]
+      converted = dump_lines.map do |line|
+        if line.length > 2 && line[0] == ?|
+          line[3..-1]
+        else
+          line
+        end
+      end
+      converted.join("\n")
+    end
+
+    # Sorts every run of two or more consecutive, identically indented
+    # name=value lines, so attribute order does not affect a comparison.
+    def sortattrs(output)
+      attribute_run = /^(\s+)\w+=.*(\n\1\w+=.*)+/
+      output.gsub(attribute_run) { |run| run.split("\n").sort.join("\n") }
+    end
+
+    # Reads a test file made of "#heading" lines, each followed by the text
+    # of that section.
+    #
+    # Enumerating yields one Array per record, holding the section texts in
+    # the order of the +sections+ list given to the constructor (nil for a
+    # section the record does not contain).
+    class TestData
+      include Enumerable
+
+      # filename:: path of the test file to read
+      # sections:: heading names (without the leading '#') that are
+      #            recognised; the first one marks the start of a new record
+      def initialize(filename, sections)
+        @filename = filename
+        @sections = sections
+      end
+
+      # Yields each record of the file in turn.
+      #
+      # The file is opened on every call and closed by the block form of
+      # File.open, so no handle stays open and the data can be enumerated
+      # more than once.
+      def each
+        data = {}
+        key = nil
+        File.open(@filename) do |file|
+          file.each_line do |line|
+            # line[1..-2] drops the leading '#' and the last character
+            # (the newline) of a heading line.
+            if line[0] == ?# && @sections.include?(line[1..-2])
+              heading = line[1..-2]
+              if data.any? && heading == @sections[0]
+                # Remove one trailing newline here; normaliseOutput removes
+                # another (presumably the blank line between records --
+                # confirm against the data files).
+                data[key].chomp!
+                yield normaliseOutput(data)
+                data = {}
+              end
+              key = heading
+              data[key] = ""
+            elsif key
+              data[key] << line
+            end
+          end
+        end
+        # A file without any recognised heading has no record to flush;
+        # yielding here would hand the caller an Array of nils.
+        yield normaliseOutput(data) unless data.empty?
+      end
+
+      # Strips one trailing newline from every collected section and returns
+      # the section texts in the order of @sections.
+      def normaliseOutput(data)
+        data.keys.each { |key| data[key].chomp! }
+        @sections.map { |heading| data[heading] }
+      end
+    end
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_cli.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_cli.rb
@ -0,0 +1,16 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+require "html5/cli"
+
+class TestCli < Test::Unit::TestCase
+  def test_open_input
+    assert_equal $stdin, HTML5::CLI.open_input('-')
+    assert_kind_of StringIO, HTML5::CLI.open_input('http://whatwg.org/')
+    assert_kind_of File, HTML5::CLI.open_input('testdata/sites/google-results.htm')
+  end
+  
+  def test_parse_opts
+    HTML5::CLI.parse_opts [] # TODO test defaults
+    assert_equal 'hpricot', HTML5::CLI.parse_opts(['-b', 'hpricot']).treebuilder
+    assert_equal 'hpricot', HTML5::CLI.parse_opts(['--treebuilder', 'hpricot']).treebuilder
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_encoding.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_encoding.rb
@ -0,0 +1,35 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/inputstream'
+
+class Html5EncodingTestCase < Test::Unit::TestCase
+  include HTML5
+  include TestSupport
+
+  # Define the chardet test only when the optional UniversalDetector library
+  # can be loaded.
+  begin
+    require 'rubygems'
+    require 'UniversalDetector'
+
+    def test_chardet #TODO: can we get rid of this?
+      path = File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt')
+      # Block form so the handle is closed once the assertion has run.
+      File.open(path, 'r') do |file|
+        stream = HTML5::HTMLInputStream.new(file, :chardet => true)
+        assert_equal 'big5', stream.char_encoding.downcase
+      end
+    end
+  rescue LoadError
+    # This rescue has to belong to the begin block rather than to the method
+    # body: the LoadError comes from the require lines, which run while the
+    # class is being loaded, not while the test method executes.
+    puts "chardet not found, skipping chardet tests"
+  end
+
+  html5_test_files('encoding').each do |test_file|        
+    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
+
+    TestData.new(test_file, %w(data encoding)).
+      each_with_index do |(input, encoding), index|
+
+      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
+        stream = HTML5::HTMLInputStream.new(input, :chardet => false)
+        assert_equal encoding.downcase, stream.char_encoding.downcase, input
+      end
+    end
+  end
+
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_input_stream.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_input_stream.rb
@ -0,0 +1,26 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+require "test/unit"
+require "html5/inputstream"
+
+class TestHtml5Inputstream < Test::Unit::TestCase
+  def test_newline_in_queue
+    stream = HTML5::HTMLInputStream.new("\nfoo")
+    stream.unget(stream.char)
+    assert_equal [1, 0], stream.position
+  end
+  
+  def test_buffer_boundary
+    stream = HTML5::HTMLInputStream.new("abcdefghijklmnopqrstuvwxyz" * 50, :encoding => 'windows-1252')
+    1022.times{stream.char}
+    assert_equal "i", stream.char
+  end
+  
+  def test_chars_until
+    stream = HTML5::HTMLInputStream.new("aaaaaaab")
+    assert_equal "aaaaaaa", stream.chars_until("b")
+
+    stream = HTML5::HTMLInputStream.new("aaaaaaab")
+    assert_equal "aaaaaaab", stream.chars_until("c")
+    
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_lxp.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_lxp.rb
@ -0,0 +1,283 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/liberalxmlparser'
+
+XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
+
+def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
+  sortattrs = proc {"<#{$1+$2.split.sort.join(' ')+$3}>"}
+  document = parser.parse(input.chomp, :lowercase_attr_name => false, :lowercase_element_name => false).root
+  if not expected
+    expected = input.chomp.gsub(XMLELEM,&sortattrs)
+    if expected.respond_to? :force_encoding
+      expected = expected.gsub(/&#(\d+);/) {$1.to_i.chr('utf-8')}
+    else
+      expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
+    end
+    output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,&sortattrs)
+    assert_equal(expected, output)
+  else
+    assert_equal(expected, document.to_s.gsub(/'/,'"'))
+  end
+end
+
+def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
+  assert_xml_equal(input, expected, parser)
+end
+
+class BasicXhtml5Test < Test::Unit::TestCase
+
+  def test_title_body_mismatched_close
+    assert_xhtml_equal(
+      '<title>Xhtml</title><b><i>content</b></i>',
+      '<html xmlns="http://www.w3.org/1999/xhtml">' +
+      '<head><title>Xhtml</title></head>' + 
+      '<body><b><i>content</i></b></body>' +
+      '</html>')
+  end
+
+  def test_title_body_named_charref
+    assert_xhtml_equal(
+      '<title>ntilde</title>A &ntilde B',
+      '<html xmlns="http://www.w3.org/1999/xhtml">' +
+      '<head><title>ntilde</title></head>' + 
+      '<body>A '+ [0xF1].pack('U') + ' B</body>' +
+      '</html>')
+  end
+end
+
+class BasicXmlTest < Test::Unit::TestCase
+
+  def test_comment
+    assert_xml_equal("<x><!-- foo --></x>")
+  end
+
+  def test_cdata
+    assert_xml_equal("<x><![CDATA[foo]]></x>","<x>foo</x>")
+  end
+
+  def test_simple_text
+    assert_xml_equal("<p>foo</p>","<p>foo</p>")
+  end
+
+  def test_optional_close
+    assert_xml_equal("<p>foo","<p>foo</p>")
+  end
+
+  def test_html_mismatched
+    assert_xml_equal("<b><i>foo</b></i>","<b><i>foo</i></b>")
+  end
+end
+
+class OpmlTest < Test::Unit::TestCase
+
+  def test_mixedCaseElement
+    assert_xml_equal(
+      '<opml version="1.0">' +
+      '<head><ownerName>Dave Winer</ownerName></head>' +
+      '</opml>')
+  end
+
+  def test_mixedCaseAttribute
+    assert_xml_equal(
+      '<opml version="1.0">' +
+      '<body><outline isComment="true"/></body>' +
+      '</opml>')
+  end
+
+  def test_malformed
+    assert_xml_equal(
+      '<opml version="1.0">' +
+      '<body><outline text="Odds & Ends"/></body>' +
+      '</opml>',
+      '<opml version="1.0">' +
+      '<body><outline text="Odds &amp; Ends"/></body>' +
+      '</opml>')
+  end
+end
+
+class XhtmlTest < Test::Unit::TestCase
+
+  def test_mathml
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>MathML</title></head>
+<body>
+  <math xmlns="http://www.w3.org/1998/Math/MathML">
+  <mrow>
+    <mi>x</mi>
+    <mo>=</mo>
+
+    <mfrac>
+    <mrow>
+      <mrow>
+      <mo>-</mo>
+      <mi>b</mi>
+      </mrow>
+      <mo>&#177;</mo>
+      <msqrt>
+
+      <mrow>
+        <msup>
+        <mi>b</mi>
+        <mn>2</mn>
+        </msup>
+        <mo>-</mo>
+        <mrow>
+
+        <mn>4</mn>
+        <mo>&#8290;</mo>
+        <mi>a</mi>
+        <mo>&#8290;</mo>
+        <mi>c</mi>
+        </mrow>
+      </mrow>
+
+      </msqrt>
+    </mrow>
+    <mrow>
+      <mn>2</mn>
+      <mo>&#8290;</mo>
+      <mi>a</mi>
+    </mrow>
+    </mfrac>
+
+  </mrow>
+  </math>
+</body></html>
+EOX
+  end
+
+  def test_svg
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>SVG</title></head>
+<body>
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
+  <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
+       c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
+  </path>
+  <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
+  </circle>
+
+  </svg>
+</body></html>
+EOX
+  end
+
+  def test_xlink
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
+  <defs xmlns:l="http://www.w3.org/1999/xlink">
+    <radialGradient id="s1" fx=".4" fy=".2" r=".7">
+    <stop stop-color="#FE8"/>
+    <stop stop-color="#D70" offset="1"/>
+    </radialGradient>
+    <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
+    <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
+    <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
+  </defs>
+  <g stroke="#940">
+    <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
+    <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
+    <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
+
+    <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
+    <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
+    <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
+  </g>
+  </svg>
+</body></html>
+EOX
+  end
+
+  def test_br
+    assert_xhtml_equal <<EOX1
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>BR</title></head>
+<body>
+<br/>
+</body></html>
+EOX1
+  end
+
+  def test_strong
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>STRONG</title></head>
+<body>
+<strong></strong>
+</body></html>
+EOX
+  end
+
+  def test_script
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>SCRIPT</title></head>
+<body>
+<script>1 &lt; 2 &amp; 3</script>
+</body></html>
+EOX
+  end
+
+  def test_script_src
+    assert_xhtml_equal <<EOX1, <<EOX2.strip
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>SCRIPT</title><script src="http://example.com"/></head>
+<body>
+<script>1 &lt; 2 &amp; 3</script>
+</body></html>
+EOX1
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>SCRIPT</title><script src="http://example.com"></script></head>
+<body>
+<script>1 &lt; 2 &amp; 3</script>
+</body></html>
+EOX2
+  end
+
+  def test_title
+    assert_xhtml_equal <<EOX
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>1 &lt; 2 &amp; 3</title></head>
+<body>
+</body></html>
+EOX
+  end
+
+  def test_prolog
+    assert_xhtml_equal <<EOX1, <<EOX2.strip
+<?xml version="1.0" encoding="UTF-8" ?>
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>PROLOG</title></head>
+<body>
+</body></html>
+EOX1
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>PROLOG</title></head>
+<body>
+</body></html>
+EOX2
+  end
+
+  def test_tagsoup
+    assert_xhtml_equal <<EOX1, <<EOX2.strip
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>TAGSOUP</title></head>
+<body>
+<u><blockquote><p></u>
+</body></html>
+EOX1
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>TAGSOUP</title></head>
+<body>
+<u/><blockquote><u/><p><u/>
+</p></blockquote></body></html>
+EOX2
+  end
+
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_parser.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_parser.rb
@ -0,0 +1,63 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/treebuilders'
+require 'html5/html5parser'
+require 'html5/cli'
+
+$tree_types_to_test = ['simpletree', 'rexml']
+
+begin
+  require 'hpricot'
+  $tree_types_to_test.push('hpricot')
+rescue LoadError
+end
+
+class Html5ParserTestCase < Test::Unit::TestCase
+  include HTML5
+  include TestSupport
+
+  html5_test_files('tree-construction').each do |test_file|
+
+    test_name = File.basename(test_file).sub('.dat', '')
+
+    TestData.new(test_file, %w(data errors document-fragment document)).each_with_index do |(input, errors, inner_html, expected), index|
+
+      errors = errors.split("\n")
+      expected = expected.gsub("\n| ","\n")[2..-1]
+
+      $tree_types_to_test.each do |tree_name|
+        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
+
+          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
+
+          if inner_html
+            parser.parse_fragment(input, inner_html)
+          else
+            parser.parse(input)
+          end
+
+          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
+
+          assert_equal sortattrs(expected), sortattrs(actual_output), [
+            '', 'Input:', input,
+            '', 'Expected:', expected,
+            '', 'Recieved:', actual_output
+          ].join("\n")
+
+          actual_errors = parser.errors.map do |(line, col), message, datavars|
+            message = CLI::PythonicTemplate.new(E[message]).to_s(datavars)
+            "Line: #{line} Col: #{col} #{message}"
+          end
+
+          assert_equal errors, actual_errors, [
+            '', 'Input', input,
+            '', "Expected errors (#{errors.length}):", errors.join("\n"),
+            '', "Actual errors (#{actual_errors.length}):",
+                 actual_errors.join("\n") + "\n"
+          ].join("\n")
+        end
+      end
+    end
+  end
+
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_sanitizer.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_sanitizer.rb
@ -0,0 +1,179 @@
+#!/usr/bin/env ruby
+
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/html5parser'
+require 'html5/liberalxmlparser'
+require 'html5/treewalkers'
+require 'html5/serializer'
+require 'html5/sanitizer'
+
+class SanitizeTest < Test::Unit::TestCase
+  include HTML5
+
+  def sanitize_xhtml stream
+    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+  end
+
+  def sanitize_html stream
+    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+  end
+
+  def sanitize_rexml stream
+    require 'rexml/document'
+    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
+    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
+    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
+      :quote_char => "'",
+      :inject_meta_charset => false,
+      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+  rescue REXML::ParseException
+    return "Ill-formed XHTML!"
+  end
+
+  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    assert_equal htmloutput, sanitize_html(input)
+    assert_equal xhtmloutput, sanitize_xhtml(input)
+    assert_equal rexmloutput, sanitize_rexml(input)
+  end
+
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_allow_#{tag_name}_tag" do
+      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
+      htmloutput  = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
+      rexmloutput = xhtmloutput
+
+      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'col'
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<col title='1' />"
+      elsif tag_name == 'table'
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'image'
+        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
+      elsif VOID_ELEMENTS.include?(tag_name)
+        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        htmloutput += '<br/>' if tag_name == 'br'
+        rexmloutput =  "<#{tag_name} title='1' />"
+      end
+      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    end
+  end
+
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
+      check_sanitization(input, output, output, output)
+    end
+  end
+
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    next if attribute_name == 'style'
+    define_method "test_should_allow_#{attribute_name}_attribute" do
+      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
+      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      rexmloutput = attribute_name.include?(':') && !(attribute_name =~ /^xml(ns)?:/) ? "Ill-formed XHTML!" : output
+      check_sanitization(input, htmloutput, output, rexmloutput)
+    end
+  end
+
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
+      output =  "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      rexmloutput = attribute_name.include?(':') ? "Ill-formed XHTML!" : output
+      check_sanitization(input, output, output, rexmloutput)
+    end
+  end
+
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      input = %(<a href="#{protocol}">foo</a>)
+      output = "<a href='#{protocol}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      input = %(<a href="#{protocol.upcase}">foo</a>)
+      output = "<a href='#{protocol.upcase}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+
+  HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+    next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)
+    define_method "test_#{tag_name}_should_allow_local_href" do
+      input = %(<#{tag_name} xlink:href="#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='#foo'/>"
+      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
+      rexmloutput = "Ill-formed XHTML!"
+      check_sanitization(input, output, xhtmloutput, rexmloutput)
+    end
+
+    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\n#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
+      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
+      rexmloutput = "Ill-formed XHTML!"
+      check_sanitization(input, output, xhtmloutput, rexmloutput)
+    end
+
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
+      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}/>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      rexmloutput = "Ill-formed XHTML!"
+      check_sanitization(input, output, xhtmloutput, rexmloutput)
+    end
+
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}/>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      rexmloutput = "Ill-formed XHTML!"
+      check_sanitization(input, output, xhtmloutput, rexmloutput)
+    end
+  end
+
+  def test_should_handle_astral_plane_characters
+    input = "<p>&#x1d4b5; &#x1d538;</p>"
+    output = "<p>\360\235\222\265 \360\235\224\270</p>"
+    check_sanitization(input, output, output, output)
+
+    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
+    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
+    check_sanitization(input, output, output, output)
+  end
+
+# This affects only NS4. Is it worth fixing?
+#  def test_javascript_includes
+#    input = %(<div size="&{alert('XSS')}">foo</div>)
+#    output = "<div>foo</div>"    
+#    check_sanitization(input, output, output, output)
+#  end
+
+  # Data-driven cases: one test method per entry of every JSON file returned
+  # by html5_test_files('sanitizer'). The 'xhtml' and 'rexml' expectations
+  # fall back to 'output' when an entry does not provide them.
+  html5_test_files('sanitizer').each do |filename|
+    # File.read closes the file again; open(filename).read left the handle
+    # open until garbage collection.
+    JSON::parse(File.read(filename)).each do |test|
+      define_method "test_#{test['name']}" do
+        check_sanitization(
+          test['input'],
+          test['output'],
+          test['xhtml'] || test['output'],
+          test['rexml'] || test['output']
+        )
+      end
+    end
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_serializer.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_serializer.rb
@ -0,0 +1,67 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/html5parser'
+require 'html5/serializer'
+require 'html5/treewalkers'
+
+#Run the serialize error checks
+checkSerializeErrors = false
+
+# Tree walker over the token lists stored in the JSON serializer fixtures.
+# Each entry of @tree is an Array whose first element names the token type
+# and whose remaining elements are that token's arguments.
+class JsonWalker < HTML5::TreeWalkers::Base
+  # Yields the walker token built for every fixture entry; raises for an
+  # entry whose type is not known.
+  def each
+    @tree.each do |token|
+      kind, first, second, third = token
+      case kind
+      when 'StartTag' then yield start_tag(first, second)
+      when 'EndTag'   then yield end_tag(first)
+      when 'EmptyTag' then yield empty_tag(first, second)
+      when 'Comment'  then yield comment(first)
+      when 'Characters', 'SpaceCharacters'
+        # text produces its tokens through a block; pass each one on.
+        text(first) { |text_token| yield text_token }
+      when 'Doctype'  then yield doctype(first, second, third)
+      else
+        raise "Unknown token type: " + kind
+      end
+    end
+  end
+end
+
+# Data-driven serializer tests: one test method is generated for every entry
+# of the "tests" list in each JSON file returned by
+# html5_test_files('serializer').
+class Html5SerializeTestcase < Test::Unit::TestCase
+  # Passes when +result+ is one of the acceptable serializations listed in
+  # +expected+. With a single candidate assert_equal is used, so a failure
+  # also reports the test description.
+  def assert_serialized_as(expected, result, description)
+    if expected.length == 1
+      assert_equal(expected[0], result, description)
+    elsif !expected.include?(result)
+      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+    end
+  end
+  private :assert_serialized_as
+
+  html5_test_files('serializer').each do |filename|
+    test_name = File.basename(filename).sub('.test', '')
+    # File.read closes the file again; open(filename).read left the handle
+    # open until garbage collection.
+    tests = JSON::parse(File.read(filename))
+    tests['tests'].each_with_index do |test, index|
+
+      define_method "test_#{test_name}_#{index+1}" do
+        # JSON parsing produces String keys; the encoding is additionally
+        # stored under the Symbol key :encoding.
+        if test["options"] and test["options"]["encoding"]
+          test["options"][:encoding] = test["options"]["encoding"]
+        end
+
+        result = HTML5::HTMLSerializer.
+          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+        assert_serialized_as(test["expected"], result, test["description"])
+
+        # The XHTML comparison is skipped for the optionaltags file.
+        next if test_name == 'optionaltags'
+
+        result = HTML5::XHTMLSerializer.
+          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+        assert_serialized_as(test["xhtml"] || test["expected"], result, test["description"])
+      end
+
+    end
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_sniffer.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_sniffer.rb
@ -0,0 +1,27 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+require "html5/sniffer"
+
+class TestFeedTypeSniffer < Test::Unit::TestCase
+  include HTML5
+  include TestSupport
+  include Sniffer
+  
+  html5_test_files('sniffer').each do |test_file|
+    test_name = File.basename(test_file).sub('.test', '')
+
+    tests = JSON.parse(File.read(test_file))
+
+    tests.each_with_index do |data, index|
+      define_method('test_%s_%d' % [test_name, index + 1]) do
+        assert_equal data['type'], html_or_feed(data['input'])
+      end
+    end
+  end
+  # each_with_index do |t, i|
+  #     define_method "test_#{i}" do
+  #       assert_equal t[0], sniff_feed_type(t[1])
+  #     end
+  #   end
+  
+
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_stream.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_stream.rb
@ -0,0 +1,71 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/inputstream'
+
+class HTMLInputStreamTest < Test::Unit::TestCase
+  include HTML5
+
+  def getc stream
+    if String.method_defined? :force_encoding
+      stream.char.force_encoding('binary')
+    else
+      stream.char
+    end
+  end
+
+  def test_char_ascii
+    stream = HTMLInputStream.new("'", :encoding=>'ascii')
+    assert_equal('ascii', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+
+  def test_char_null
+    stream = HTMLInputStream.new("\x00")
+    assert_equal("\xef\xbf\xbd", getc(stream))
+  end
+
+  def test_char_utf8
+    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("\xe2\x80\x98", getc(stream))
+  end
+
+  def test_char_win1252
+    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
+    assert_equal('windows-1252', stream.char_encoding)
+    assert_equal("\xc2\xa2", getc(stream))
+    assert_equal("\xc3\x85", getc(stream))
+    assert_equal("\xc3\xb1", getc(stream))
+    assert_equal("\xe2\x80\x99", getc(stream))
+    assert_equal("\xe2\x80\xa0", getc(stream))
+  end
+
+  def test_bom
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+
+  begin
+    require 'iconv'
+
+    # Feeds 1025 spaces converted to UTF-16 through the stream and checks
+    # both the detected encoding and that all characters come back.
+    def test_utf_16
+      input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
+      stream = HTMLInputStream.new(input)
+      # This used to be assert('utf-16-le', stream.char_encoding), which only
+      # tests that the string literal is truthy and therefore never failed.
+      # NOTE(review): both byte orders are accepted on the assumption that
+      # the generic 'utf-16' iconv target may pick either one -- tighten to
+      # 'utf-16-le' if that is guaranteed on the supported platforms.
+      assert(%w(utf-16-le utf-16-be).include?(stream.char_encoding),
+             "unexpected encoding: #{stream.char_encoding.inspect}")
+      assert_equal(1025, stream.chars_until(' ', true).length)
+    end
+  rescue LoadError
+    puts "iconv not found, skipping iconv tests"
+  end
+
+  def test_newlines
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
+    assert_equal([1,0], stream.position)
+    assert_equal("a\nbb\n", stream.chars_until('c'))
+    assert_equal([3,0], stream.position)
+    assert_equal("ccc\ndddd", stream.chars_until('x'))
+    assert_equal([4,4], stream.position)
+    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
+  end
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_tokenizer.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_tokenizer.rb
@ -0,0 +1,94 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/tokenizer'
+
+require 'tokenizer_test_parser'
+
+class Html5TokenizerTestCase < Test::Unit::TestCase
+
+  # Asserts that the received token list matches the expected one.
+  #
+  # When ignoreErrorOrder is set, the lists are compared as two groups
+  # (every token that is not "ParseError", then the "ParseError" entries),
+  # so the position of parse errors relative to other tokens is ignored.
+  # Otherwise the two lists must be equal element for element.
+  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
+    if ignoreErrorOrder
+      # partition keeps the relative order inside each of the two groups.
+      expected = expectedTokens.partition { |token| token != "ParseError" }
+      received = receivedTokens.partition { |token| token != "ParseError" }
+      assert_equal expected, received, message
+    else
+      # This branch used to return the comparison result without asserting
+      # it, so an ordered mismatch could never fail a test.
+      assert_equal expectedTokens, receivedTokens, message
+    end
+  end
+
+  def type_of?(token_name, token)
+    token != 'ParseError' and token_name == token.first
+  end
+
+  def convert_attribute_arrays_to_hashes(tokens)
+    tokens.inject([]) do |tokens, token|
+      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
+      tokens << token
+    end
+  end
+  
+  def concatenate_consecutive_characters(tokens)
+    tokens.inject([]) do |tokens, token|
+      if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
+        tokens.last[1] = tokens.last[1] + token[1]
+        next tokens
+      end
+      tokens << token
+    end
+  end
+
+  def tokenizer_test(data)
+    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
+      message = [
+        '', 'Description:', data['description'],
+        '', 'Input:', data['input'],
+        '', 'Content Model Flag:', content_model_flag,
+        '' ] * "\n"
+
+      assert_nothing_raised message do
+        tokenizer = HTML5::HTMLTokenizer.new(data['input'])
+
+        tokenizer.content_model_flag = content_model_flag.to_sym
+
+        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
+
+        tokens = TokenizerTestParser.new(tokenizer).parse
+
+        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
+
+        expected = concatenate_consecutive_characters(data['output'])
+
+        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
+      end
+    end 
+  end
+
+  html5_test_files('tokenizer').each do |test_file|
+    test_name = File.basename(test_file).sub('.test', '')
+
+    tests = JSON.parse(File.read(test_file))['tests']
+
+    tests.each_with_index do |data, index|
+      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
+    end
+  end
+
+end
+
--- a/attic/vendor/plugins/HTML5lib/test/test_treewalkers.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_treewalkers.rb
@ -0,0 +1,135 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5/html5parser'
+require 'html5/treewalkers'
+require 'html5/treebuilders'
+
+$tree_types_to_test = {
+  'simpletree' =>
+    {:builder => HTML5::TreeBuilders['simpletree'],
+     :walker  => HTML5::TreeWalkers['simpletree']},
+  'rexml' =>
+    {:builder => HTML5::TreeBuilders['rexml'],
+     :walker  => HTML5::TreeWalkers['rexml']},
+  'hpricot' =>
+    {:builder => HTML5::TreeBuilders['hpricot'],
+     :walker  => HTML5::TreeWalkers['hpricot']},
+}
+
+puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
+
+class TestTreeWalkers < Test::Unit::TestCase
+  include HTML5::TestSupport
+
+  def concatenateCharacterTokens(tokens)
+    charactersToken = nil
+    for token in tokens
+        type = token[:type]
+        if [:Characters, :SpaceCharacters].include?(type)
+            if charactersToken == nil
+                charactersToken = {:type => :Characters, :data => token[:data]}
+            else
+                charactersToken[:data] += token[:data]
+            end
+        else
+            if charactersToken != nil
+                yield charactersToken
+                charactersToken = nil
+            end
+            yield token
+        end
+    end
+    yield charactersToken if charactersToken != nil
+  end
+
+  def convertTokens(tokens)
+    output = []
+    indent = 0
+    concatenateCharacterTokens(tokens) do |token|
+      case token[:type]
+      when :StartTag, :EmptyTag
+        output << "#{' '*indent}<#{token[:name]}>"
+        indent += 2
+        for name, value in token[:data].to_a.sort
+          next if name=='xmlns'
+          output << "#{' '*indent}#{name}=\"#{value}\""
+        end
+        indent -= 2 if token[:type] == :EmptyTag
+      when :EndTag
+        indent -= 2
+      when :Comment
+        output << "#{' '*indent}<!-- #{token[:data]} -->"
+      when :Doctype
+        if token[:name] and token[:name].any?
+          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
+        else
+          output << "#{' '*indent}<!DOCTYPE >"
+        end
+      when :Characters, :SpaceCharacters
+        output << "#{' '*indent}\"#{token[:data]}\""
+      end
+    end
+    output.join("\n")
+  end
+
+  html5_test_files('tree-construction').each do |test_file|
+
+    test_name = File.basename(test_file).sub('.dat', '')
+    next if test_name == 'tests5' # TODO
+
+    TestData.new(test_file, %w(data errors document-fragment document)).
+      each_with_index do |(input, errors, inner_html, expected), index|
+
+      expected = expected.gsub("\n| ","\n")[2..-1]
+
+      $tree_types_to_test.each do |tree_name, tree_class|
+
+        define_method "test_#{test_name}_#{index}_#{tree_name}" do
+
+          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
+
+          if inner_html
+            parser.parse_fragment(input, inner_html)
+          else
+            parser.parse(input)
+          end
+
+          document = parser.tree.get_document
+
+          begin
+            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
+            expected = sortattrs(expected)
+            assert_equal expected, output, [
+              '', 'Input:', input,
+              '', 'Expected:', expected,
+              '', 'Recieved:', output
+            ].join("\n")
+          rescue NotImplementedError
+            # Amnesty for those that confess...
+          end
+        end
+      end
+   end
+  end
+
+  def test_all_tokens
+    expected = [
+        {:data => [], :type => :StartTag, :name => 'html'},
+        {:data => [], :type => :StartTag, :name => 'head'},
+        {:data => [], :type => :EndTag,   :name => 'head'},
+        {:data => [], :type => :StartTag, :name => 'body'},
+        {:data => [], :type => :EndTag,   :name => 'body'},
+        {:data => [], :type => :EndTag,   :name => 'html'}]
+    for treeName, tree_class in $tree_types_to_test
+      p = HTML5::HTMLParser.new(:tree => tree_class[:builder])
+      document = p.parse("<html></html>")
+      # document = tree_class.get(:adapter)(document)
+      output = tree_class[:walker].new(document)
+      expected.zip(output) do |expected_token, output_token|
+        assert_equal(expected_token, output_token)
+      end
+    end
+  end
+
+
+end
--- a/attic/vendor/plugins/HTML5lib/test/test_validator.rb
+++ b/attic/vendor/plugins/HTML5lib/test/test_validator.rb
@ -0,0 +1,31 @@
+#!/usr/bin/env ruby -wKU
+
+require File.join(File.dirname(__FILE__), 'preamble')
+
+require 'html5'
+require 'html5/filters/validator'
+
+class TestValidator < Test::Unit::TestCase
+  def run_validator_test(test)
+    p = HTML5::HTMLParser.new(:tokenizer => HTMLConformanceChecker)
+    p.parse(test['input'])
+    errorCodes = p.errors.collect{|e| e[1]}
+    if test.has_key?('fail-if')
+      assert !errorCodes.include?(test['fail-if'])
+    end
+    if test.has_key?('fail-unless')
+      assert errorCodes.include?(test['fail-unless'])
+    end
+  end
+
+  for filename in html5_test_files('validator')
+    tests    = JSON.load(open(filename))
+    testName = File.basename(filename).sub(".test", "")
+    tests['tests'].each_with_index do |test, index|
+      define_method "test_#{testName}_#{index}" do
+        run_validator_test(test)
+      end
+    end
+  end
+end
+
--- a/attic/vendor/plugins/HTML5lib/test/tokenizer_test_parser.rb
+++ b/attic/vendor/plugins/HTML5lib/test/tokenizer_test_parser.rb
@ -0,0 +1,63 @@
+require 'html5/constants'
+
+class TokenizerTestParser
+  def initialize(tokenizer)
+    @tokenizer = tokenizer
+  end
+
+  def parse
+    @outputTokens = []
+
+    debug = nil
+    for token in @tokenizer
+      debug = token.inspect if token[:type] == :ParseError
+      send(('process' + token[:type].to_s), token)
+    end
+
+    return @outputTokens
+  end
+
+  def processDoctype(token)
+    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
+      token[:systemId], token[:correct]])
+  end
+
+  def processStartTag(token)
+    @outputTokens.push(["StartTag", token[:name], token[:data]])
+  end
+
+  def processEmptyTag(token)
+    if not HTML5::VOID_ELEMENTS.include? token[:name]
+      @outputTokens.push("ParseError")
+    end
+    @outputTokens.push(["StartTag", token[:name], token[:data]])
+  end
+
+  def processEndTag(token)
+    if token[:data].length > 0
+      self.processParseError(token)
+    end
+    @outputTokens.push(["EndTag", token[:name]])
+  end
+
+  def processComment(token)
+    @outputTokens.push(["Comment", token[:data]])
+  end
+
+  def processCharacters(token)
+    @outputTokens.push(["Character", token[:data]])
+  end
+
+  alias processSpaceCharacters processCharacters
+
+  def processCharacters(token)
+    @outputTokens.push(["Character", token[:data]])
+  end
+
+  def process_eof(token)
+  end
+
+  def processParseError(token)
+    @outputTokens.push("ParseError")
+  end
+end