diff --git a/lib/chunks/chunk.rb b/lib/chunks/chunk.rb index 46382c76..90b4939a 100644 --- a/lib/chunks/chunk.rb +++ b/lib/chunks/chunk.rb @@ -9,6 +9,10 @@ require 'uri/common' module Chunk class Abstract +# Rails's default utf-8 support causes problems here. So, for Chunk::Abstract class, turn off +# multibyte character support. + $KCODE = 'iso-8859-1' + # automatically construct the array of derivatives of Chunk::Abstract @derivatives = [] diff --git a/vendor/plugins/HTML5lib/Manifest.txt b/vendor/plugins/HTML5lib/Manifest.txt index 8a8a1bca..082b2b0b 100644 --- a/vendor/plugins/HTML5lib/Manifest.txt +++ b/vendor/plugins/HTML5lib/Manifest.txt @@ -2,12 +2,18 @@ History.txt Manifest.txt README Rakefile.rb +bin/html5 +lib/core_ext/string.rb lib/html5.rb lib/html5/constants.rb lib/html5/filters/base.rb lib/html5/filters/inject_meta_charset.rb +lib/html5/filters/iso639codes.rb lib/html5/filters/optionaltags.rb +lib/html5/filters/rfc2046.rb +lib/html5/filters/rfc3987.rb lib/html5/filters/sanitizer.rb +lib/html5/filters/validator.rb lib/html5/filters/whitespace.rb lib/html5/html5parser.rb lib/html5/html5parser/after_body_phase.rb @@ -34,6 +40,7 @@ lib/html5/sanitizer.rb lib/html5/serializer.rb lib/html5/serializer/htmlserializer.rb lib/html5/serializer/xhtmlserializer.rb +lib/html5/sniffer.rb lib/html5/tokenizer.rb lib/html5/treebuilders.rb lib/html5/treebuilders/base.rb @@ -46,14 +53,65 @@ lib/html5/treewalkers/hpricot.rb lib/html5/treewalkers/rexml.rb lib/html5/treewalkers/simpletree.rb lib/html5/version.rb -parse.rb +testdata/encoding/chardet/test_big5.txt +testdata/encoding/test-yahoo-jp.dat +testdata/encoding/tests1.dat +testdata/encoding/tests2.dat +testdata/sanitizer/tests1.dat +testdata/serializer/core.test +testdata/serializer/injectmeta.test +testdata/serializer/optionaltags.test +testdata/serializer/options.test +testdata/serializer/whitespace.test +testdata/sites/google-results.htm +testdata/sites/python-ref-import.htm +testdata/sites/web-apps-old.htm 
+testdata/sites/web-apps.htm +testdata/sniffer/htmlOrFeed.json +testdata/tokenizer/contentModelFlags.test +testdata/tokenizer/entities.test +testdata/tokenizer/escapeFlag.test +testdata/tokenizer/test1.test +testdata/tokenizer/test2.test +testdata/tokenizer/test3.test +testdata/tokenizer/test4.test +testdata/tree-construction/tests1.dat +testdata/tree-construction/tests2.dat +testdata/tree-construction/tests3.dat +testdata/tree-construction/tests4.dat +testdata/tree-construction/tests5.dat +testdata/tree-construction/tests6.dat +testdata/validator/attributes.test +testdata/validator/base-href-attribute.test +testdata/validator/base-target-attribute.test +testdata/validator/blockquote-cite-attribute.test +testdata/validator/classattribute.test +testdata/validator/contenteditableattribute.test +testdata/validator/contextmenuattribute.test +testdata/validator/dirattribute.test +testdata/validator/draggableattribute.test +testdata/validator/html-xmlns-attribute.test +testdata/validator/idattribute.test +testdata/validator/inputattributes.test +testdata/validator/irrelevantattribute.test +testdata/validator/langattribute.test +testdata/validator/li-value-attribute.test +testdata/validator/link-href-attribute.test +testdata/validator/link-hreflang-attribute.test +testdata/validator/link-rel-attribute.test +testdata/validator/ol-start-attribute.test +testdata/validator/starttags.test +testdata/validator/style-scoped-attribute.test +testdata/validator/tabindexattribute.test tests/preamble.rb tests/test_encoding.rb tests/test_lxp.rb tests/test_parser.rb tests/test_sanitizer.rb tests/test_serializer.rb +tests/test_sniffer.rb tests/test_stream.rb tests/test_tokenizer.rb tests/test_treewalkers.rb +tests/test_validator.rb tests/tokenizer_test_parser.rb diff --git a/vendor/plugins/HTML5lib/Rakefile.rb b/vendor/plugins/HTML5lib/Rakefile.rb index 65b20295..49324fcb 100644 --- a/vendor/plugins/HTML5lib/Rakefile.rb +++ b/vendor/plugins/HTML5lib/Rakefile.rb @@ -18,16 +18,16 @@ end 
require 'rcov/rcovtask' -namespace :test do +namespace :test do namespace :coverage do desc "Delete aggregate coverage data." task(:clean) { rm_f "coverage.data" } end desc 'Aggregate code coverage for unit, functional and integration tests' Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |t| - t.libs << "tests" - t.test_files = FileList["tests/test_*.rb"] - t.output_dir = "tests/coverage/" + t.libs << "test" + t.test_files = FileList["test/test_*.rb"] + t.output_dir = "test/coverage/" t.verbose = true end end \ No newline at end of file diff --git a/vendor/plugins/HTML5lib/bin/html5 b/vendor/plugins/HTML5lib/bin/html5 index e16e9248..c74e780b 100755 --- a/vendor/plugins/HTML5lib/bin/html5 +++ b/vendor/plugins/HTML5lib/bin/html5 @@ -1,217 +1,5 @@ #!/usr/bin/env ruby -require 'core_ext/string' -$:.unshift File.dirname(__FILE__), 'lib' +require 'html5/cli' -def parse(opts, args) - encoding = nil - - f = args[-1] - if f - begin - if f[0..6] == 'http://' - require 'open-uri' - f = URI.parse(f).open - encoding = f.charset - elsif f == '-' - f = $stdin - else - f = open(f) - end - rescue - end - else - $stderr.write("No filename provided. 
Use -h for help\n") - exit(1) - end - - require 'html5/treebuilders' - treebuilder = HTML5::TreeBuilders[opts.treebuilder] - - if opts.output == :xml - require 'html5/liberalxmlparser' - p = HTML5::XMLParser.new(:tree=>treebuilder) - else - require 'html5/html5parser' - p = HTML5::HTMLParser.new(:tree=>treebuilder) - end - - if opts.parsemethod == :parse - args = [f, encoding] - else - args = [f, (opts.container || 'div'), encoding] - end - - if opts.profile - require 'profiler' - Profiler__::start_profile - p.send(opts.parsemethod, *args) - Profiler__::stop_profile - Profiler__::print_profile($stderr) - elsif opts.time - require 'time' # TODO: switch to benchmark - t0 = Time.new - document = p.send(opts.parsemethod, *args) - t1 = Time.new - print_output(p, document, opts) - t2 = Time.new - puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1] - else - document = p.send(opts.parsemethod, *args) - print_output(p, document, opts) - end -end - -def print_output(parser, document, opts) - puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding - - case opts.output - when :xml - print document - when :html - require 'html5/treewalkers' - tokens = HTML5::TreeWalkers[opts.treebuilder].new(document) - require 'html5/serializer' - puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer) - when :hilite - print document.hilite - when :tree - document = [document] unless document.respond_to?(:each) - document.each {|fragment| puts parser.tree.testSerializer(fragment)} - end - - if opts.error - errList=[] - for pos, errorcode, datavars in parser.errors - errList << "Line #{pos[0]} Col #{pos[1]} " + (HTML5::E[errorcode] || "Unknown error \"#{errorcode}\"") % datavars - end - $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n") - end -end - -require 'ostruct' -options = OpenStruct.new -options.profile = false -options.time = false -options.output = :html -options.treebuilder = 'simpletree' -options.error = false -options.encoding = 
false -options.parsemethod = :parse -options.serializer = { - :encoding => 'utf-8', - :omit_optional_tags => false, - :inject_meta_charset => false -} - -require 'optparse' -opts = OptionParser.new do |opts| - opts.separator "" - opts.separator "Parse Options:" - - opts.on("-b", "--treebuilder NAME") do |treebuilder| - options.treebuilder = treebuilder - end - - opts.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container| - options.parsemethod = :parse_fragment - options.container = container if container - end - - opts.separator "" - opts.separator "Filter Options:" - - opts.on("--[no-]inject-meta-charset", "inject ") do |inject| - options.serializer[:inject_meta_charset] = inject - end - - opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip| - options.serializer[:strip_whitespace] = strip - end - - opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize| - options.serializer[:sanitize] = sanitize - end - - opts.separator "" - opts.separator "Output Options:" - - opts.on("--tree", "output as debug tree") do |tree| - options.output = :tree - end - - opts.on("-x", "--xml", "output as xml") do |xml| - options.output = :xml - options.treebuilder = "rexml" - end - - opts.on("--[no-]html", "Output as html") do |html| - options.output = (html ? 
:html : nil) - end - - opts.on("--hilite", "Output as formatted highlighted code.") do |hilite| - options.output = :hilite - end - - opts.on("-e", "--error", "Print a list of parse errors") do |error| - options.error = error - end - - opts.separator "" - opts.separator "Serialization Options:" - - opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit| - options.serializer[:omit_optional_tags] = omit - end - - opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote| - options.serializer[:quote_attr_values] = quote - end - - opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best| - options.serializer[:use_best_quote_char] = best - end - - opts.on("--quote-char C", "Use specified quote character") do |c| - options.serializer[:quote_char] = c - end - - opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min| - options.serializer[:minimize_boolean_attributes] = min - end - - opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash| - options.serializer[:use_trailing_solidus] = slash - end - - opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt| - options.serializer[:escape_lt_in_attrs] = lt - end - - opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata| - options.serializer[:escape_rcdata] = rcdata - end - - opts.separator "" - opts.separator "Other Options:" - - opts.on("-p", "--[no-]profile", "Profile the run") do |profile| - options.profile = profile - end - - opts.on("-t", "--[no-]time", "Time the run") do |time| - options.time = time - end - - opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| - options.encoding = encoding - end - - opts.on_tail("-h", "--help", "Show this message") do - puts opts - exit - end -end - -opts.parse!(ARGV) -parse options, ARGV +HTML5::CLI.run \ No newline at end of file diff --git a/vendor/plugins/HTML5lib/lib/html5.rb 
b/vendor/plugins/HTML5lib/lib/html5.rb index 7ca2ee61..68bd6b16 100644 --- a/vendor/plugins/HTML5lib/lib/html5.rb +++ b/vendor/plugins/HTML5lib/lib/html5.rb @@ -8,6 +8,6 @@ module HTML5 end def self.parse_fragment(stream, options={}) - HTMLParser.parse(stream, options) + HTMLParser.parse_fragment(stream, options) end end diff --git a/vendor/plugins/HTML5lib/lib/html5/cli.rb b/vendor/plugins/HTML5lib/lib/html5/cli.rb new file mode 100644 index 00000000..ef49d4c6 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5/cli.rb @@ -0,0 +1,231 @@ +$:.unshift File.dirname(__FILE__), 'lib' +require 'html5' +require 'core_ext/string' +require 'ostruct' +require 'optparse' + +module HTML5::CLI + + def self.parse_opts argv + options = OpenStruct.new + options.profile = false + options.time = false + options.output = :html + options.treebuilder = 'simpletree' + options.error = false + options.encoding = false + options.parsemethod = :parse + options.serializer = { + :encoding => 'utf-8', + :omit_optional_tags => false, + :inject_meta_charset => false + } + + opts = OptionParser.new do |opts| + opts.separator "" + opts.separator "Parse Options:" + + opts.on("-b", "--treebuilder NAME") do |treebuilder| + options.treebuilder = treebuilder + end + + opts.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container| + options.parsemethod = :parse_fragment + options.container = container if container + end + + opts.separator "" + opts.separator "Filter Options:" + + opts.on("--[no-]inject-meta-charset", "inject ") do |inject| + options.serializer[:inject_meta_charset] = inject + end + + opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip| + options.serializer[:strip_whitespace] = strip + end + + opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize| + options.serializer[:sanitize] = sanitize + end + + opts.separator "" + opts.separator "Output Options:" + + opts.on("--tree", "output as debug tree") do |tree| + options.output = :tree + end + + 
opts.on("-x", "--xml", "output as xml") do |xml| + options.output = :xml + options.treebuilder = "rexml" + end + + opts.on("--[no-]html", "Output as html") do |html| + options.output = (html ? :html : nil) + end + + opts.on("--hilite", "Output as formatted highlighted code.") do |hilite| + options.output = :hilite + end + + opts.on("-e", "--error", "Print a list of parse errors") do |error| + options.error = error + end + + opts.separator "" + opts.separator "Serialization Options:" + + opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit| + options.serializer[:omit_optional_tags] = omit + end + + opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote| + options.serializer[:quote_attr_values] = quote + end + + opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best| + options.serializer[:use_best_quote_char] = best + end + + opts.on("--quote-char C", "Use specified quote character") do |c| + options.serializer[:quote_char] = c + end + + opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min| + options.serializer[:minimize_boolean_attributes] = min + end + + opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash| + options.serializer[:use_trailing_solidus] = slash + end + + opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt| + options.serializer[:escape_lt_in_attrs] = lt + end + + opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata| + options.serializer[:escape_rcdata] = rcdata + end + + opts.separator "" + opts.separator "Other Options:" + + opts.on("-p", "--[no-]profile", "Profile the run") do |profile| + options.profile = profile + end + + opts.on("-t", "--[no-]time", "Time the run") do |time| + options.time = time + end + + opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| + options.encoding = encoding + end + + opts.on_tail("-h", "--help", "Show this message") do + puts 
opts + exit + end + + + end + opts.parse!(argv) + options + end + + def self.open_input f + if f + begin + if f[0..6] == 'http://' + require 'open-uri' + f = URI.parse(f).open + encoding = f.charset + elsif f == '-' + f = $stdin + else + f = open(f) + end + rescue + end + else + $stderr.write("No filename provided. Use -h for help\n") + exit(1) + end + f + end + + def self.parse(opts, args) + encoding = nil + + f = open_input args.last + + require 'html5/treebuilders' + treebuilder = HTML5::TreeBuilders[opts.treebuilder] + + if opts.output == :xml + require 'html5/liberalxmlparser' + p = HTML5::XMLParser.new(:tree=>treebuilder) + else + require 'html5/html5parser' + p = HTML5::HTMLParser.new(:tree=>treebuilder) + end + + if opts.parsemethod == :parse + args = [f, encoding] + else + args = [f, (opts.container || 'div'), encoding] + end + + if opts.profile + require 'profiler' + Profiler__::start_profile + p.send(opts.parsemethod, *args) + Profiler__::stop_profile + Profiler__::print_profile($stderr) + elsif opts.time + require 'time' # TODO: switch to benchmark + t0 = Time.new + document = p.send(opts.parsemethod, *args) + t1 = Time.new + print_output(p, document, opts) + t2 = Time.new + puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1] + else + document = p.send(opts.parsemethod, *args) + print_output(p, document, opts) + end + end + + def self.print_output(parser, document, opts) + puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding + + case opts.output + when :xml + print document + when :html + require 'html5/treewalkers' + tokens = HTML5::TreeWalkers[opts.treebuilder].new(document) + require 'html5/serializer' + puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer) + when :hilite + print document.hilite + when :tree + document = [document] unless document.respond_to?(:each) + document.each {|fragment| puts parser.tree.testSerializer(fragment)} + end + + if opts.error + errList=[] + for pos, errorcode, datavars 
in parser.errors + errList << "Line #{pos[0]} Col #{pos[1]} " + (HTML5::E[errorcode] || "Unknown error \"#{errorcode}\"") % datavars + end + $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n") + end + end + + def self.run + options = parse_opts ARGV + parse options, ARGV + end +end \ No newline at end of file diff --git a/vendor/plugins/HTML5lib/lib/html5/constants.rb b/vendor/plugins/HTML5lib/lib/html5/constants.rb index ed34b086..53baa8e8 100755 --- a/vendor/plugins/HTML5lib/lib/html5/constants.rb +++ b/vendor/plugins/HTML5lib/lib/html5/constants.rb @@ -908,7 +908,7 @@ module HTML5 "eof-in-bogus-doctype" => _("Unexpected end of file in bogus doctype."), "eof-in-innerhtml" => - _("XXX innerHTML EOF"), + _("Unexpected EOF in inner html mode."), "unexpected-doctype" => _("Unexpected DOCTYPE. Ignored."), "non-html-root" => @@ -1040,7 +1040,8 @@ module HTML5 _("Unexpected end tag (%(name))" + ". Expected end of file."), "unexpected-end-table-in-caption" => - _("Unexpected end table tag in caption. Generates implied end caption.") + _("Unexpected end table tag in caption. 
Generates implied end caption."), + "end-html-in-innerhtml" => _("Unexpected html end tag in inner html mode.") } end diff --git a/vendor/plugins/HTML5lib/lib/html5/filters.rb b/vendor/plugins/HTML5lib/lib/html5/filters.rb deleted file mode 100644 index 74c7f0e0..00000000 --- a/vendor/plugins/HTML5lib/lib/html5/filters.rb +++ /dev/null @@ -1 +0,0 @@ -require 'html5/filters/optionaltags' diff --git a/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb index a55e4701..2cba741b 100644 --- a/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb @@ -25,7 +25,7 @@ module HTML5 def endTagHtml(name) if @parser.inner_html - parse_error + parse_error "end-html-in-innerhtml" else # XXX: This may need to be done, not sure # Don't set last_phase to the current phase but to the inBody phase diff --git a/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb index 8b8ed02e..f592c57b 100644 --- a/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb @@ -51,34 +51,22 @@ module HTML5 super(parser, tree) # for special handling of whitespace in
-      if $-w
-        $-w = false
-        class << self; alias processSpaceCharactersNonPre processSpaceCharacters; end
-        $-w = true
-      else
-        class << self; alias processSpaceCharactersNonPre processSpaceCharacters; end
+      silence do
+        class << self
+          alias processSpaceCharactersNonPre processSpaceCharacters
+        end
       end
     end
 
     def processSpaceCharactersDropNewline(data)
       # #Sometimes (start of <pre> blocks) we want to drop leading newlines
 
-      if $-w
-        $-w = false
-        class << self
-          silence do
-            alias processSpaceCharacters processSpaceCharactersNonPre
-          end
-        end
-        $-w = true
-      else
-        class << self
-          silence do
-            alias processSpaceCharacters processSpaceCharactersNonPre
-          end
+      class << self
+        silence do
+          alias processSpaceCharacters processSpaceCharactersNonPre
         end
       end
-
+      
       if (data.length > 0 and data[0] == ?\n && 
         %w[pre textarea].include?(@tree.open_elements.last.name) && !@tree.open_elements.last.hasContent)
         data = data[1..-1]
@@ -376,16 +364,6 @@ module HTML5
     end
 
     def endTagBlock(name)
-      #Put us back in the right whitespace handling mode
-      if name == 'pre'
-        class << self; 
-          silence do
-            alias processSpaceCharacters processSpaceCharactersNonPre;
-          end
-        end
-      end
-      
-
       @tree.generateImpliedEndTags if in_scope?(name)
 
       unless @tree.open_elements.last.name == name
diff --git a/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb
index 237ae7d1..b6ea65b3 100644
--- a/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb
@@ -144,7 +144,7 @@ module HTML5
 
     def remove_open_elements_until(name=nil)
       finished = false
-      until finished
+      until finished || @tree.open_elements.length == 0
         element = @tree.open_elements.pop
         finished = name.nil? ? yield(element) : element.name == name
       end
diff --git a/vendor/plugins/HTML5lib/parse.rb b/vendor/plugins/HTML5lib/parse.rb
deleted file mode 100755
index ba0d9071..00000000
--- a/vendor/plugins/HTML5lib/parse.rb
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env ruby
-# 
-# Parse a document to a simpletree tree, with optional profiling
-
-$:.unshift File.dirname(__FILE__),'lib'
-
-def parse(opts, args)
-  encoding = nil
-
-  f = args[-1]
-  if f
-    begin
-      if f[0..6] == 'http://'
-        require 'open-uri'
-        f = URI.parse(f).open
-        encoding = f.charset
-      elsif f == '-'
-        f = $stdin
-      else
-        f = open(f)
-      end
-    rescue
-    end
-  else
-    $stderr.write("No filename provided. Use -h for help\n")
-    exit(1)
-  end
-
-  require 'html5/treebuilders'
-  treebuilder = HTML5::TreeBuilders[opts.treebuilder]
-
-  if opts.output == :xml
-    require 'html5/liberalxmlparser'
-    p = HTML5::XHTMLParser.new(:tree=>treebuilder)
-  else
-    require 'html5/html5parser'
-    p = HTML5::HTMLParser.new(:tree=>treebuilder)
-  end
-
-  if opts.parsemethod == :parse
-    args = [f, encoding]
-  else
-    args = [f, 'div', encoding]
-  end
-
-  if opts.profile
-    require 'profiler'
-    Profiler__::start_profile
-    p.send(opts.parsemethod, *args)
-    Profiler__::stop_profile
-    Profiler__::print_profile($stderr)
-  elsif opts.time
-    require 'time'
-    t0 = Time.new
-    document = p.send(opts.parsemethod, *args)
-    t1 = Time.new
-    printOutput(p, document, opts)
-    t2 = Time.new
-    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
-  else
-    document = p.send(opts.parsemethod, *args)
-    printOutput(p, document, opts)
-  end
-end
-
-def printOutput(parser, document, opts)
-  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding
-
-  case opts.output
-  when :xml
-    print document
-  when :html
-    require 'html5/treewalkers'
-    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
-    require 'html5/serializer'
-    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
-  when :hilite
-    print document.hilite
-  when :tree
-    document = [document] unless document.respond_to?(:each)
-    document.each {|fragment| puts parser.tree.testSerializer(fragment)}
-  end
-
-  if opts.error
-    errList=[]
-    for pos, message in parser.errors
-        errList << ("Line %i Col %i"%pos + " " + message)
-    end
-    $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n")
-  end
-end
-
-require 'ostruct'
-options = OpenStruct.new
-options.profile = false
-options.time = false
-options.output = :html
-options.treebuilder = 'simpletree'
-options.error = false
-options.encoding = false
-options.parsemethod = :parse
-options.serializer = {
-  :encoding => 'utf-8',
-  :omit_optional_tags => false,
-  :inject_meta_charset => false
-}
-
-require 'optparse'
-opts = OptionParser.new do |opts|
-  opts.separator ""
-  opts.separator "Parse Options:"
-
-  opts.on("-b", "--treebuilder NAME") do |treebuilder|
-    options.treebuilder = treebuilder
-  end
-
-  opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
-    options.parsemethod = :parseFragment
-  end
-
-  opts.separator ""
-  opts.separator "Filter Options:"
-
-  opts.on("--[no-]inject-meta-charset", "inject ") do |inject|
-    options.serializer[:inject_meta_charset] = inject
-  end
-
-  opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
-    options.serializer[:strip_whitespace] = strip
-  end
-
-  opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
-    options.serializer[:sanitize] = sanitize
-  end
-
-  opts.separator ""
-  opts.separator "Output Options:"
-
-  opts.on("--tree", "output as debug tree") do |tree|
-    options.output = :tree
-  end
-  
-  opts.on("-x", "--xml", "output as xml") do |xml|
-    options.output = :xml
-    options.treebuilder = "rexml"
-  end
-  
-  opts.on("--[no-]html", "Output as html") do |html|
-    options.output = (html ? :html : nil)
-  end
-  
-  opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
-    options.output = :hilite
-  end
-  
-  opts.on("-e", "--error", "Print a list of parse errors") do |error|
-    options.error = error
-  end
-
-  opts.separator ""
-  opts.separator "Serialization Options:"
-
-  opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
-    options.serializer[:omit_optional_tags] = omit
-  end
-
-  opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
-    options.serializer[:quote_attr_values] = quote
-  end
-
-  opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
-    options.serializer[:use_best_quote_char] = best
-  end
-
-  opts.on("--quote-char C", "Use specified quote character") do |c|
-    options.serializer[:quote_char] = c
-  end
-
-  opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
-    options.serializer[:minimize_boolean_attributes] = min
-  end
-
-  opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
-    options.serializer[:use_trailing_solidus] = slash
-  end
-
-  opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
-    options.serializer[:escape_lt_in_attrs] = lt
-  end
-
-  opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
-    options.serializer[:escape_rcdata] = rcdata
-  end
-
-  opts.separator ""
-  opts.separator "Other Options:"
-
-  opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
-    options.profile = profile
-  end
-    
-  opts.on("-t", "--[no-]time", "Time the run") do |time|
-    options.time = time
-  end
-    
-  opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
-    options.encoding = encoding
-  end
-
-  opts.on_tail("-h", "--help", "Show this message") do
-    puts opts
-    exit
-  end
-end
-
-opts.parse!(ARGV)
-parse options, ARGV
diff --git a/vendor/plugins/HTML5lib/tests/preamble.rb b/vendor/plugins/HTML5lib/test/preamble.rb
similarity index 86%
rename from vendor/plugins/HTML5lib/tests/preamble.rb
rename to vendor/plugins/HTML5lib/test/preamble.rb
index f38a581a..ce4b1297 100644
--- a/vendor/plugins/HTML5lib/tests/preamble.rb
+++ b/vendor/plugins/HTML5lib/test/preamble.rb
@@ -2,15 +2,14 @@ require 'test/unit'
 
 HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__)))) 
 
-if File.exists?(File.join(HTML5_BASE, 'testdata'))
-  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
+if File.exists?(File.join(HTML5_BASE, 'ruby', 'testdata'))
+  TESTDATA_DIR = File.join(HTML5_BASE, 'ruby', 'testdata')
 else
-  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
+  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
 end
 
-# $:.unshift File.join(File.dirname(File.dirname(__FILE__)), 'lib')
-
-# $:.unshift File.dirname(__FILE__)
+$:.unshift File.join(File.dirname(File.dirname(__FILE__)), 'lib')
+$:.unshift File.dirname(__FILE__)
 
 require 'core_ext/string'
 
diff --git a/vendor/plugins/HTML5lib/test/test_cli.rb b/vendor/plugins/HTML5lib/test/test_cli.rb
new file mode 100644
index 00000000..1725ffc4
--- /dev/null
+++ b/vendor/plugins/HTML5lib/test/test_cli.rb
@@ -0,0 +1,16 @@
+require File.join(File.dirname(__FILE__), 'preamble')
+require "html5/cli"
+
+class TestCli < Test::Unit::TestCase
+  def test_open_input
+    assert_equal $stdin, HTML5::CLI.open_input('-')
+    assert_kind_of StringIO, HTML5::CLI.open_input('http://whatwg.org/')
+    assert_kind_of File, HTML5::CLI.open_input('testdata/sites/google-results.htm')
+  end
+  
+  def test_parse_opts
+    HTML5::CLI.parse_opts [] # TODO test defaults
+    assert_equal 'hpricot', HTML5::CLI.parse_opts(['-b', 'hpricot']).treebuilder
+    assert_equal 'hpricot', HTML5::CLI.parse_opts(['--treebuilder', 'hpricot']).treebuilder
+  end
+end
\ No newline at end of file
diff --git a/vendor/plugins/HTML5lib/tests/test_encoding.rb b/vendor/plugins/HTML5lib/test/test_encoding.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_encoding.rb
rename to vendor/plugins/HTML5lib/test/test_encoding.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_input_stream.rb b/vendor/plugins/HTML5lib/test/test_input_stream.rb
similarity index 67%
rename from vendor/plugins/HTML5lib/tests/test_input_stream.rb
rename to vendor/plugins/HTML5lib/test/test_input_stream.rb
index 00cbbac6..6a7d855f 100644
--- a/vendor/plugins/HTML5lib/tests/test_input_stream.rb
+++ b/vendor/plugins/HTML5lib/test/test_input_stream.rb
@@ -14,4 +14,13 @@ class TestHtml5Inputstream < Test::Unit::TestCase
     1022.times{stream.char}
     assert_equal "i", stream.char
   end
+  
+  def test_chars_until
+    stream = HTML5::HTMLInputStream.new("aaaaaaab")
+    assert_equal "aaaaaaa", stream.chars_until("b")
+
+    stream = HTML5::HTMLInputStream.new("aaaaaaab")
+    assert_equal "aaaaaaab", stream.chars_until("c")
+    
+  end
 end
\ No newline at end of file
diff --git a/vendor/plugins/HTML5lib/tests/test_lxp.rb b/vendor/plugins/HTML5lib/test/test_lxp.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_lxp.rb
rename to vendor/plugins/HTML5lib/test/test_lxp.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_parser.rb b/vendor/plugins/HTML5lib/test/test_parser.rb
similarity index 83%
rename from vendor/plugins/HTML5lib/tests/test_parser.rb
rename to vendor/plugins/HTML5lib/test/test_parser.rb
index b3c042dc..15764c52 100644
--- a/vendor/plugins/HTML5lib/tests/test_parser.rb
+++ b/vendor/plugins/HTML5lib/test/test_parser.rb
@@ -12,11 +12,6 @@ begin
 rescue LoadError
 end
 
-$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO
-
-puts 'Testing tree builders: ' + $tree_types_to_test * ', '
-
-
 class Html5ParserTestCase < Test::Unit::TestCase
   include HTML5
   include TestSupport
@@ -25,8 +20,7 @@ class Html5ParserTestCase < Test::Unit::TestCase
 
     test_name = File.basename(test_file).sub('.dat', '')
 
-    TestData.new(test_file, %w(data errors document-fragment document)).
-      each_with_index do |(input, errors, inner_html, expected), index|
+    TestData.new(test_file, %w(data errors document-fragment document)).each_with_index do |(input, errors, inner_html, expected), index|
 
       errors = errors.split("\n")
       expected = expected.gsub("\n| ","\n")[2..-1]
@@ -35,13 +29,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
         define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
 
           parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
-        
+
           if inner_html
             parser.parse_fragment(input, inner_html)
           else
             parser.parse(input)
           end
-        
+
           actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
 
           assert_equal sortattrs(expected), sortattrs(actual_output), [
@@ -53,13 +47,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
           actual_errors = parser.errors.map do |(line, col), message, datavars|
             'Line: %i Col: %i %s' % [line, col, E[message] % datavars]
           end
-          assert_equal errors.length, parser.errors.length, [
+
+          assert_equal errors, actual_errors, [
             '', 'Input', input,
             '', "Expected errors (#{errors.length}):", errors.join("\n"),
             '', "Actual errors (#{actual_errors.length}):",
-                 actual_errors.join("\n")
+                 actual_errors.join("\n") + "\n"
           ].join("\n")
-          
         end
       end
     end
diff --git a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb b/vendor/plugins/HTML5lib/test/test_sanitizer.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_sanitizer.rb
rename to vendor/plugins/HTML5lib/test/test_sanitizer.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_serializer.rb b/vendor/plugins/HTML5lib/test/test_serializer.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_serializer.rb
rename to vendor/plugins/HTML5lib/test/test_serializer.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_sniffer.rb b/vendor/plugins/HTML5lib/test/test_sniffer.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_sniffer.rb
rename to vendor/plugins/HTML5lib/test/test_sniffer.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_stream.rb b/vendor/plugins/HTML5lib/test/test_stream.rb
similarity index 88%
rename from vendor/plugins/HTML5lib/tests/test_stream.rb
rename to vendor/plugins/HTML5lib/test/test_stream.rb
index 2ce4e560..40955bd7 100755
--- a/vendor/plugins/HTML5lib/tests/test_stream.rb
+++ b/vendor/plugins/HTML5lib/test/test_stream.rb
@@ -42,9 +42,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
     require 'iconv'
 
     def test_utf_16
-      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
-      assert(stream.char_encoding, 'utf-16-le')
-      assert_equal(1025, stream.chars_until(' ',true).length)
+      input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
+      stream = HTMLInputStream.new(input)
+      assert('utf-16-le', stream.char_encoding)
+      assert_equal(1025, stream.chars_until(' ', true).length)
     end
   rescue LoadError
     puts "iconv not found, skipping iconv tests"
diff --git a/vendor/plugins/HTML5lib/tests/test_tokenizer.rb b/vendor/plugins/HTML5lib/test/test_tokenizer.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_tokenizer.rb
rename to vendor/plugins/HTML5lib/test/test_tokenizer.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_treewalkers.rb b/vendor/plugins/HTML5lib/test/test_treewalkers.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_treewalkers.rb
rename to vendor/plugins/HTML5lib/test/test_treewalkers.rb
diff --git a/vendor/plugins/HTML5lib/tests/test_validator.rb b/vendor/plugins/HTML5lib/test/test_validator.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/test_validator.rb
rename to vendor/plugins/HTML5lib/test/test_validator.rb
diff --git a/vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb b/vendor/plugins/HTML5lib/test/tokenizer_test_parser.rb
similarity index 100%
rename from vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
rename to vendor/plugins/HTML5lib/test/tokenizer_test_parser.rb
diff --git a/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat b/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
index 53de66c5..73de161a 100644
--- a/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
+++ b/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
@@ -433,9 +433,9 @@
   {
     "name": "uri_ref_with_space_in svg_attribute",
     "input": "",
-    "rexml": "",
-    "xhtml": "",
-    "output": ""
+    "rexml": "",
+    "xhtml": "",
+    "output": ""
   },
 
   {
diff --git a/vendor/plugins/HTML5lib/testdata/tree-construction/tests4.dat b/vendor/plugins/HTML5lib/testdata/tree-construction/tests4.dat
index 26785058..f65133c7 100644
--- a/vendor/plugins/HTML5lib/testdata/tree-construction/tests4.dat
+++ b/vendor/plugins/HTML5lib/testdata/tree-construction/tests4.dat
@@ -41,7 +41,7 @@ plaintext
 #data
 setting html's innerHTML
 #errors
-Line: 1 Col: 24 XXX innerHTML EOF
+Line: 1 Col: 24 Unexpected EOF in inner html mode.
 #document-fragment
 html
 #document
diff --git a/vendor/plugins/HTML5lib/testdata/tree-construction/tests6.dat b/vendor/plugins/HTML5lib/testdata/tree-construction/tests6.dat
index 24115123..bf8a0a39 100644
--- a/vendor/plugins/HTML5lib/testdata/tree-construction/tests6.dat
+++ b/vendor/plugins/HTML5lib/testdata/tree-construction/tests6.dat
@@ -608,4 +608,25 @@ Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE.
 | 
 |   
 |   
-| 
\ No newline at end of file
+| 
+
+#data
+
+#document-fragment
+html
+#errors
+Line: 1 Col: 20 Unexpected html end tag in inner html mode.
+Line: 1 Col: 20 Unexpected EOF in inner html mode.
+#document
+| 
+| 
+
+#data
+ 
+#errors
+Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE.
+#document
+| 
+|   
+|   
+|   " "