HTML5lib is Back.

Synced with the latest version of HTML5lib, which fixes a problem with astral-plane characters. I should really do some tests, but the HTML5lib sanitizer seems to be 2-5 times slower than the old sanitizer.
2007-05-30 10:45:52 -05:00 · 2007-05-30 10:45:52 -05:00 · 4dd70af5ae
commit 4dd70af5ae
parent e1a6827f1f
39 changed files with 4843 additions and 5576 deletions
--- a/vendor/plugins/HTML5lib/tests/preamble.rb
+++ b/vendor/plugins/HTML5lib/tests/preamble.rb
@ -7,5 +7,17 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
 $:.unshift File.dirname(__FILE__)

 def html5lib_test_files(subdirectory)
-    Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
+  Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
+end
+
+begin
+  require 'jsonx'
+rescue LoadError
+  class JSON
+    def self.parse json
+      json.gsub! /"\s*:/, '"=>'
+      json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
+      eval json
+    end
+  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_encoding.rb
+++ b/vendor/plugins/HTML5lib/tests/test_encoding.rb
@ -11,7 +11,7 @@ begin
    def test_chardet
        File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
            stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
-            assert_equal 'big5', stream.charEncoding.downcase
+            assert_equal 'big5', stream.char_encoding.downcase
        end
    end
 rescue LoadError
@ -28,7 +28,7 @@ end

            define_method 'test_%s_%d' % [ test_name, index + 1 ] do
                stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
-                assert_equal encoding.downcase, stream.charEncoding.downcase, input
+                assert_equal encoding.downcase, stream.char_encoding.downcase, input
            end
        end
    end
--- a/vendor/plugins/HTML5lib/tests/test_lxp.rb
+++ b/vendor/plugins/HTML5lib/tests/test_lxp.rb
@ -6,19 +6,19 @@ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
 SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'

 def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
-    document = parser.parse(input.chomp).root
-    if not expected
-        expected = input.chomp.gsub(XMLELEM,SORTATTRS)
-        expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
-        output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
-        assert_equal(expected, output)
-    else
-        assert_equal(expected, document.to_s.gsub(/'/,'"'))
-    end
+  document = parser.parse(input.chomp).root
+  if not expected
+    expected = input.chomp.gsub(XMLELEM,SORTATTRS)
+    expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
+    output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
+    assert_equal(expected, output)
+  else
+    assert_equal(expected, document.to_s.gsub(/'/,'"'))
+  end
 end

 def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
-      assert_xml_equal(input, expected, parser)
+  assert_xml_equal(input, expected, parser)
 end

 class BasicXhtml5Test < Test::Unit::TestCase
@ -27,8 +27,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
    assert_xhtml_equal(
      '<title>Xhtml</title><b><i>content</b></i>',
      '<html xmlns="http://www.w3.org/1999/xhtml">' +
-        '<head><title>Xhtml</title></head>' + 
-        '<body><b><i>content</i></b></body>' +
+      '<head><title>Xhtml</title></head>' + 
+      '<body><b><i>content</i></b></body>' +
      '</html>')
  end

@ -36,8 +36,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
    assert_xhtml_equal(
      '<title>mdash</title>A &mdash B',
      '<html xmlns="http://www.w3.org/1999/xhtml">' +
-        '<head><title>mdash</title></head>' + 
-        '<body>A '+ [0x2014].pack('U') + ' B</body>' +
+      '<head><title>mdash</title></head>' + 
+      '<body>A '+ [0x2014].pack('U') + ' B</body>' +
      '</html>')
  end
 end
@ -70,24 +70,24 @@ class OpmlTest < Test::Unit::TestCase
  def test_mixedCaseElement
    assert_xml_equal(
      '<opml version="1.0">' +
-        '<head><ownerName>Dave Winer</ownerName></head>' +
+      '<head><ownerName>Dave Winer</ownerName></head>' +
      '</opml>')
  end

  def test_mixedCaseAttribute
    assert_xml_equal(
      '<opml version="1.0">' +
-        '<body><outline isComment="true"/></body>' +
+      '<body><outline isComment="true"/></body>' +
      '</opml>')
  end

  def test_malformed
    assert_xml_equal(
      '<opml version="1.0">' +
-        '<body><outline text="Odds & Ends"/></body>' +
+      '<body><outline text="Odds & Ends"/></body>' +
      '</opml>',
      '<opml version="1.0">' +
-        '<body><outline text="Odds &amp; Ends"/></body>' +
+      '<body><outline text="Odds &amp; Ends"/></body>' +
      '</opml>')
  end
 end
@ -100,45 +100,45 @@ class XhtmlTest < Test::Unit::TestCase
 <head><title>MathML</title></head>
 <body>
  <math xmlns="http://www.w3.org/1998/Math/MathML">
+  <mrow>
+    <mi>x</mi>
+    <mo>=</mo>
+
+    <mfrac>
    <mrow>
-      <mi>x</mi>
-      <mo>=</mo>
+      <mrow>
+      <mo>-</mo>
+      <mi>b</mi>
+      </mrow>
+      <mo>&#177;</mo>
+      <msqrt>

-      <mfrac>
+      <mrow>
+        <msup>
+        <mi>b</mi>
+        <mn>2</mn>
+        </msup>
+        <mo>-</mo>
        <mrow>
-          <mrow>
-            <mo>-</mo>
-            <mi>b</mi>
-          </mrow>
-          <mo>&#177;</mo>
-          <msqrt>

-            <mrow>
-              <msup>
-                <mi>b</mi>
-                <mn>2</mn>
-              </msup>
-              <mo>-</mo>
-              <mrow>
-
-                <mn>4</mn>
-                <mo>&#8290;</mo>
-                <mi>a</mi>
-                <mo>&#8290;</mo>
-                <mi>c</mi>
-              </mrow>
-            </mrow>
-
-          </msqrt>
+        <mn>4</mn>
+        <mo>&#8290;</mo>
+        <mi>a</mi>
+        <mo>&#8290;</mo>
+        <mi>c</mi>
        </mrow>
-        <mrow>
-          <mn>2</mn>
-          <mo>&#8290;</mo>
-          <mi>a</mi>
-        </mrow>
-      </mfrac>
+      </mrow>

+      </msqrt>
    </mrow>
+    <mrow>
+      <mn>2</mn>
+      <mo>&#8290;</mo>
+      <mi>a</mi>
+    </mrow>
+    </mfrac>
+
+  </mrow>
  </math>
 </body></html>
 EOX
@ -150,11 +150,11 @@ EOX
 <head><title>SVG</title></head>
 <body>
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
-    <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
-             c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
-    </path>
-    <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
-    </circle>
+  <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
+       c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
+  </path>
+  <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
+  </circle>

  </svg>
 </body></html>
@ -167,24 +167,24 @@ EOX
 <head><title>XLINK</title></head>
 <body>
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
-    <defs xmlns:l="http://www.w3.org/1999/xlink">
-      <radialGradient id="s1" fx=".4" fy=".2" r=".7">
-        <stop stop-color="#FE8"/>
-        <stop stop-color="#D70" offset="1"/>
-      </radialGradient>
-      <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
-      <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
-      <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
-    </defs>
-    <g stroke="#940">
-      <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
-      <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
-      <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
+  <defs xmlns:l="http://www.w3.org/1999/xlink">
+    <radialGradient id="s1" fx=".4" fy=".2" r=".7">
+    <stop stop-color="#FE8"/>
+    <stop stop-color="#D70" offset="1"/>
+    </radialGradient>
+    <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
+    <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
+    <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
+  </defs>
+  <g stroke="#940">
+    <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
+    <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
+    <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>

-      <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
-      <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
-      <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
-    </g>
+    <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
+    <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
+    <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
+  </g>
  </svg>
 </body></html>
 EOX
--- a/vendor/plugins/HTML5lib/tests/test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/test_parser.rb
@ -7,8 +7,8 @@ require 'html5lib/html5parser'
 $tree_types_to_test = ['simpletree', 'rexml']

 begin
-    require 'hpricot'
-    $tree_types_to_test.push('hpricot')
+  require 'hpricot'
+  $tree_types_to_test.push('hpricot')
 rescue LoadError
 end

@ -19,90 +19,90 @@ puts 'Testing: ' + $tree_types_to_test * ', '

 class Html5ParserTestCase < Test::Unit::TestCase

-    def self.startswith?(a, b)
-        b[0... a.length] == a
-    end
+  def self.startswith?(a, b)
+    b[0... a.length] == a
+  end

-    def self.parseTestcase(data)
-        innerHTML = nil
-        input = []
-        output = []
-        errors = []
-        currentList = input
-        data.split(/\n/).each do |line|
-            if !line.empty? and !startswith?("#errors", line) and
-              !startswith?("#document", line) and
-              !startswith?("#data", line) and
-              !startswith?("#document-fragment", line)
+  def self.parseTestcase(data)
+    innerHTML = nil
+    input = []
+    output = []
+    errors = []
+    currentList = input
+    data.split(/\n/).each do |line|
+      if !line.empty? and !startswith?("#errors", line) and
+        !startswith?("#document", line) and
+        !startswith?("#data", line) and
+        !startswith?("#document-fragment", line)

-                if currentList == output and startswith?("|", line)
-                    currentList.push(line[2..-1])
-                else
-                    currentList.push(line)
-                end
-            elsif line == "#errors"
-                currentList = errors
-            elsif line == "#document" or startswith?("#document-fragment", line)
-                if startswith?("#document-fragment", line)
-                    innerHTML = line[19..-1]
-                    raise AssertionError unless innerHTML
-                end
-                currentList = output
-            end
+        if currentList == output and startswith?("|", line)
+          currentList.push(line[2..-1])
+        else
+          currentList.push(line)
        end
-        return innerHTML, input.join("\n"), output.join("\n"), errors
-    end
-    
-    # convert the output of str(document) to the format used in the testcases
-    def convertTreeDump(treedump)
-        treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
-    end
-
-    def sortattrs(output)
-        output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
-    end
-
-    html5lib_test_files('tree-construction').each do |test_file|
-
-        test_name = File.basename(test_file).sub('.dat', '')
-
-        File.read(test_file).split("#data\n").each_with_index do |data, index|
-            next if data.empty?
-       
-            innerHTML, input, expected_output, expected_errors = parseTestcase(data)
-
-            $tree_types_to_test.each do |tree_name|
-                define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
-
-                    parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
-                
-                    if innerHTML
-                        parser.parseFragment(input, innerHTML)
-                    else
-                        parser.parse(input)
-                    end
-                
-                    actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
-
-                    assert_equal sortattrs(expected_output), sortattrs(actual_output), [
-                        'Input:', input,
-                        'Expected:', expected_output,
-                        'Recieved:', actual_output
-                    ].join("\n")
-
-                    if $CHECK_PARSER_ERRORS
-                        actual_errors = parser.errors.map do |(line, col), message|
-                            'Line: %i Col: %i %s' % [line, col, message]
-                        end
-                        assert_equal parser.errors.length, expected_errors.length, [
-                            'Expected errors:', expected_errors.join("\n"),
-                            'Actual errors:', actual_errors.join("\n") 
-                        ].join("\n")
-                    end
-                    
-                end
-            end
+      elsif line == "#errors"
+        currentList = errors
+      elsif line == "#document" or startswith?("#document-fragment", line)
+        if startswith?("#document-fragment", line)
+          innerHTML = line[19..-1]
+          raise AssertionError unless innerHTML
        end
+        currentList = output
+      end
    end
+    return innerHTML, input.join("\n"), output.join("\n"), errors
+  end
+  
+  # convert the output of str(document) to the format used in the testcases
+  def convertTreeDump(treedump)
+    treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
+  end
+
+  def sortattrs(output)
+    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
+  end
+
+  html5lib_test_files('tree-construction').each do |test_file|
+
+    test_name = File.basename(test_file).sub('.dat', '')
+
+    File.read(test_file).split("#data\n").each_with_index do |data, index|
+      next if data.empty?
+     
+      innerHTML, input, expected_output, expected_errors = parseTestcase(data)
+
+      $tree_types_to_test.each do |tree_name|
+        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
+
+          parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
+        
+          if innerHTML
+            parser.parseFragment(input, innerHTML)
+          else
+            parser.parse(input)
+          end
+        
+          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
+
+          assert_equal sortattrs(expected_output), sortattrs(actual_output), [
+            'Input:', input,
+            'Expected:', expected_output,
+            'Recieved:', actual_output
+          ].join("\n")
+
+          if $CHECK_PARSER_ERRORS
+            actual_errors = parser.errors.map do |(line, col), message|
+              'Line: %i Col: %i %s' % [line, col, message]
+            end
+            assert_equal parser.errors.length, expected_errors.length, [
+              'Expected errors:', expected_errors.join("\n"),
+              'Actual errors:', actual_errors.join("\n") 
+            ].join("\n")
+          end
+          
+        end
+      end
+    end
+  end

 end
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
       sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
  end

+  def test_should_handle_astral_plane_characters
+    assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
+      sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
+  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_tokenizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_tokenizer.rb
@ -4,75 +4,63 @@ require 'html5lib/tokenizer'

 require 'tokenizer_test_parser'

-begin
-  require 'jsonx'
-rescue LoadError
-  class JSON
-    def self.parse json
-      json.gsub! /"\s*:/, '"=>'
-      json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
-      eval json
-    end
-  end
-end 
-
 class Html5TokenizerTestCase < Test::Unit::TestCase

-    def type_of?(token_name, token)
-        token != 'ParseError' and token_name == token.first
+  def type_of?(token_name, token)
+    token != 'ParseError' and token_name == token.first
+  end
+
+  def convert_attribute_arrays_to_hashes(tokens)
+    tokens.inject([]) do |tokens, token|
+      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
+      tokens << token
    end
-
-    def convert_attribute_arrays_to_hashes(tokens)
-        tokens.inject([]) do |tokens, token|
-            token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
-            tokens << token
-        end
+  end
+  
+  def concatenate_consecutive_characters(tokens)
+    tokens.inject([]) do |tokens, token|
+      if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
+        tokens.last[1] = tokens.last[1] + token[1]
+        next tokens
+      end
+      tokens << token
    end
-    
-    def concatenate_consecutive_characters(tokens)
-        tokens.inject([]) do |tokens, token|
-            if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
-                tokens.last[1] = tokens.last[1] + token[1]
-                next tokens
-            end
-            tokens << token
-        end
-    end
-
-    def tokenizer_test(data)
-        (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
-            message = [
-                'Description:', data['description'],
-                'Input:', data['input'],
-                'Content Model Flag:', content_model_flag ] * "\n"
-
-            assert_nothing_raised message do
-                tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
-
-                tokenizer.contentModelFlag = content_model_flag.to_sym
-                
-                tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
-
-                tokens = TokenizerTestParser.new(tokenizer).parse
-
-                actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
-
-                expected = concatenate_consecutive_characters(data['output'])
-
-                assert_equal expected, actual, message
-            end
-        end 
-    end
-
-    html5lib_test_files('tokenizer').each do |test_file|
-        test_name = File.basename(test_file).sub('.test', '')
-
-        tests = JSON.parse(File.read(test_file))['tests']
-
-        tests.each_with_index do |data, index|
-            define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
-        end
+  end
+
+  def tokenizer_test(data)
+    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
+      message = [
+        'Description:', data['description'],
+        'Input:', data['input'],
+        'Content Model Flag:', content_model_flag ] * "\n"
+
+      assert_nothing_raised message do
+        tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
+
+        tokenizer.contentModelFlag = content_model_flag.to_sym
+
+        tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
+
+        tokens = TokenizerTestParser.new(tokenizer).parse
+
+        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
+
+        expected = concatenate_consecutive_characters(data['output'])
+
+        assert_equal expected, actual, message
+      end
+    end 
+  end
+
+  html5lib_test_files('tokenizer').each do |test_file|
+    test_name = File.basename(test_file).sub('.test', '')
+
+    tests = JSON.parse(File.read(test_file))['tests']
+
+    tests.each_with_index do |data, index|
+      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
+  end

 end

--- a/vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
+++ b/vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
@ -1,62 +1,62 @@
 require 'html5lib/constants'

 class TokenizerTestParser
-    def initialize(tokenizer)
-        @tokenizer = tokenizer
+  def initialize(tokenizer)
+    @tokenizer = tokenizer
+  end
+
+  def parse
+    @outputTokens = []
+
+    debug = nil
+    for token in @tokenizer
+      debug = token.inspect if token[:type] == :ParseError
+      send ('process' + token[:type].to_s), token
    end

-    def parse
-        @outputTokens = []
+    return @outputTokens
+  end

-        debug = nil
-        for token in @tokenizer
-            debug = token.inspect if token[:type] == :ParseError
-            send ('process' + token[:type].to_s), token
-        end
+  def processDoctype(token)
+    @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
+  end

-        return @outputTokens
+  def processStartTag(token)
+    @outputTokens.push(["StartTag", token[:name], token[:data]])
+  end
+
+  def processEmptyTag(token)
+    if not HTML5lib::VOID_ELEMENTS.include? token[:name]
+      @outputTokens.push("ParseError")
    end
+    @outputTokens.push(["StartTag", token[:name], token[:data]])
+  end

-    def processDoctype(token)
-        @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
+  def processEndTag(token)
+    if token[:data].length > 0
+      self.processParseError(token)
    end
+    @outputTokens.push(["EndTag", token[:name]])
+  end

-    def processStartTag(token)
-        @outputTokens.push(["StartTag", token[:name], token[:data]])
-    end
+  def processComment(token)
+    @outputTokens.push(["Comment", token[:data]])
+  end

-    def processEmptyTag(token)
-        if not HTML5lib::VOID_ELEMENTS.include? token[:name]
-            @outputTokens.push("ParseError")
-        end
-        @outputTokens.push(["StartTag", token[:name], token[:data]])
-    end
+  def processCharacters(token)
+    @outputTokens.push(["Character", token[:data]])
+  end

-    def processEndTag(token)
-        if token[:data].length > 0
-            self.processParseError(token)
-        end
-        @outputTokens.push(["EndTag", token[:name]])
-    end
+  alias processSpaceCharacters processCharacters

-    def processComment(token)
-        @outputTokens.push(["Comment", token[:data]])
-    end
+  def processCharacters(token)
+    @outputTokens.push(["Character", token[:data]])
+  end

-    def processCharacters(token)
-        @outputTokens.push(["Character", token[:data]])
-    end
+  def processEOF(token)
+  end

-    alias processSpaceCharacters processCharacters
-
-    def processCharacters(token)
-        @outputTokens.push(["Character", token[:data]])
-    end
-
-    def processEOF(token)
-    end
-
-    def processParseError(token)
-        @outputTokens.push("ParseError")
-    end
+  def processParseError(token)
+    @outputTokens.push("ParseError")
+  end
 end