HTML5lib is Back.
Synced with the latest version of HTML5lib, which fixes a problem with astral-plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer.
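A quick way to poke at both claims by hand (a sketch, assuming the sanitize_html helper exercised by the sanitizer tests below is available in the session; the "old sanitizer" call is a hypothetical placeholder, not something this commit defines):

require 'benchmark'

# The astral-plane fix: U+1D4B5 (a four-byte UTF-8 character) should survive sanitization intact.
puts sanitize_html("<p>\360\235\222\265</p>")

# A rough timing of the HTML5lib-based sanitizer; uncomment the second report
# and point it at the previous sanitizer to reproduce the 2-5x comparison.
sample = "<p>\360\235\222\265 <b>bold</b> <i>italic</i></p>" * 50

Benchmark.bm(20) do |bm|
  bm.report('HTML5lib sanitizer') { 20.times { sanitize_html(sample) } }
  # bm.report('old sanitizer')   { 20.times { old_sanitize(sample) } }  # hypothetical name
end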
commit 4dd70af5ae (parent e1a6827f1f)
39 changed files with 4843 additions and 5576 deletions
vendor/plugins/HTML5lib/tests/preamble.rb (vendored, 14 changes)
@@ -7,5 +7,17 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)

def html5lib_test_files(subdirectory)
  Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
end
+
+begin
+  require 'jsonx'
+rescue LoadError
+  class JSON
+    def self.parse json
+      json.gsub! /"\s*:/, '"=>'
+      json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
+      eval json
+    end
+  end
+end
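The fallback JSON shim added above is just a pair of gsubs plus eval, good enough for the trusted test fixtures it loads. A small illustrative call, assuming the jsonx library is absent so this fallback class is the one in scope:

# '"key":' becomes '"key"=>', \uXXXX escapes become UTF-8 characters, then the
# result is eval'd as a Ruby literal -- so only ever feed it trusted input.
JSON.parse('{"description": "caf\u00e9 test", "output": ["a", "b"]}')
# => {"description"=>"café test", "output"=>["a", "b"]}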
@@ -11,7 +11,7 @@ begin
  def test_chardet
    File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
      stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
-     assert_equal 'big5', stream.charEncoding.downcase
+     assert_equal 'big5', stream.char_encoding.downcase
    end
  end
rescue LoadError

@@ -28,7 +28,7 @@ end
      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
        stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
-       assert_equal encoding.downcase, stream.charEncoding.downcase, input
+       assert_equal encoding.downcase, stream.char_encoding.downcase, input
      end
    end
  end
vendor/plugins/HTML5lib/tests/test_lxp.rb (vendored, 144 changes)
@@ -6,19 +6,19 @@ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'

def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
  document = parser.parse(input.chomp).root
  if not expected
    expected = input.chomp.gsub(XMLELEM,SORTATTRS)
    expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
    output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
    assert_equal(expected, output)
  else
    assert_equal(expected, document.to_s.gsub(/'/,'"'))
  end
end

def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
  assert_xml_equal(input, expected, parser)
end

class BasicXhtml5Test < Test::Unit::TestCase
@@ -27,8 +27,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
    assert_xhtml_equal(
      '<title>Xhtml</title><b><i>content</b></i>',
      '<html xmlns="http://www.w3.org/1999/xhtml">' +
      '<head><title>Xhtml</title></head>' +
      '<body><b><i>content</i></b></body>' +
      '</html>')
  end

@@ -36,8 +36,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
    assert_xhtml_equal(
      '<title>mdash</title>A &mdash B',
      '<html xmlns="http://www.w3.org/1999/xhtml">' +
      '<head><title>mdash</title></head>' +
      '<body>A '+ [0x2014].pack('U') + ' B</body>' +
      '</html>')
  end
end
@@ -70,24 +70,24 @@ class OpmlTest < Test::Unit::TestCase
  def test_mixedCaseElement
    assert_xml_equal(
      '<opml version="1.0">' +
      '<head><ownerName>Dave Winer</ownerName></head>' +
      '</opml>')
  end

  def test_mixedCaseAttribute
    assert_xml_equal(
      '<opml version="1.0">' +
      '<body><outline isComment="true"/></body>' +
      '</opml>')
  end

  def test_malformed
    assert_xml_equal(
      '<opml version="1.0">' +
      '<body><outline text="Odds & Ends"/></body>' +
      '</opml>',
      '<opml version="1.0">' +
      '<body><outline text="Odds &amp; Ends"/></body>' +
      '</opml>')
  end
end
@@ -100,45 +100,45 @@ class XhtmlTest < Test::Unit::TestCase
<head><title>MathML</title></head>
<body>
<math xmlns="http://www.w3.org/1998/Math/MathML">
  <mrow>
    <mi>x</mi>
    <mo>=</mo>

    <mfrac>
      <mrow>
        <mrow>
          <mo>-</mo>
          <mi>b</mi>
        </mrow>
        <mo>±</mo>
        <msqrt>

          <mrow>
            <msup>
              <mi>b</mi>
              <mn>2</mn>
            </msup>
            <mo>-</mo>
            <mrow>

              <mn>4</mn>
              <mo>⁢</mo>
              <mi>a</mi>
              <mo>⁢</mo>
              <mi>c</mi>
            </mrow>
          </mrow>

        </msqrt>
      </mrow>
      <mrow>
        <mn>2</mn>
        <mo>⁢</mo>
        <mi>a</mi>
      </mrow>
    </mfrac>

  </mrow>
</math>
</body></html>
EOX
@@ -150,11 +150,11 @@ EOX
<head><title>SVG</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
  <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
           c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
  </path>
  <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
  </circle>

</svg>
</body></html>
@@ -167,24 +167,24 @@ EOX
<head><title>XLINK</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
  <defs xmlns:l="http://www.w3.org/1999/xlink">
    <radialGradient id="s1" fx=".4" fy=".2" r=".7">
      <stop stop-color="#FE8"/>
      <stop stop-color="#D70" offset="1"/>
    </radialGradient>
    <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
    <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
    <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
  </defs>
  <g stroke="#940">
    <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
    <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
    <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>

    <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
    <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
    <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
  </g>
</svg>
</body></html>
EOX
vendor/plugins/HTML5lib/tests/test_parser.rb (vendored, 164 changes)
@@ -7,8 +7,8 @@ require 'html5lib/html5parser'
$tree_types_to_test = ['simpletree', 'rexml']

begin
  require 'hpricot'
  $tree_types_to_test.push('hpricot')
rescue LoadError
end

@@ -19,90 +19,90 @@ puts 'Testing: ' + $tree_types_to_test * ', '

class Html5ParserTestCase < Test::Unit::TestCase

  def self.startswith?(a, b)
    b[0... a.length] == a
  end

  def self.parseTestcase(data)
    innerHTML = nil
    input = []
    output = []
    errors = []
    currentList = input
    data.split(/\n/).each do |line|
      if !line.empty? and !startswith?("#errors", line) and
        !startswith?("#document", line) and
        !startswith?("#data", line) and
        !startswith?("#document-fragment", line)

        if currentList == output and startswith?("|", line)
          currentList.push(line[2..-1])
        else
          currentList.push(line)
        end
      elsif line == "#errors"
        currentList = errors
      elsif line == "#document" or startswith?("#document-fragment", line)
        if startswith?("#document-fragment", line)
          innerHTML = line[19..-1]
          raise AssertionError unless innerHTML
        end
        currentList = output
      end
    end
    return innerHTML, input.join("\n"), output.join("\n"), errors
  end

  # convert the output of str(document) to the format used in the testcases
  def convertTreeDump(treedump)
    treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
  end

  def sortattrs(output)
    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
  end

  html5lib_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')

    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?

      innerHTML, input, expected_output, expected_errors = parseTestcase(data)

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

          parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))

          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          assert_equal sortattrs(expected_output), sortattrs(actual_output), [
            'Input:', input,
            'Expected:', expected_output,
            'Recieved:', actual_output
          ].join("\n")

          if $CHECK_PARSER_ERRORS
            actual_errors = parser.errors.map do |(line, col), message|
              'Line: %i Col: %i %s' % [line, col, message]
            end
            assert_equal parser.errors.length, expected_errors.length, [
              'Expected errors:', expected_errors.join("\n"),
              'Actual errors:', actual_errors.join("\n")
            ].join("\n")
          end

        end
      end
    end
  end

end
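For context, the parseTestcase method above consumes html5lib's tree-construction .dat fixtures, which are blocks in roughly this shape (an illustrative fragment, not one of the shipped files): the #data section is raw markup, #errors lists the expected parse errors one per line, and #document gives the expected tree with each node on a "| "-prefixed line.

#data
<p>One<p>Two
#errors
(expected parse errors, one per line)
#document
| <html>
|   <head>
|   <body>
|     <p>
|       "One"
|     <p>
|       "Two"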
@@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
    sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
  end

+  def test_should_handle_astral_plane_characters
+    assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
+      sanitize_html("<p>𝒵 𝔸</p>")
+  end
end
vendor/plugins/HTML5lib/tests/test_tokenizer.rb (vendored, 116 changes)
@@ -4,75 +4,63 @@ require 'html5lib/tokenizer'

require 'tokenizer_test_parser'

-begin
-  require 'jsonx'
-rescue LoadError
-  class JSON
-    def self.parse json
-      json.gsub! /"\s*:/, '"=>'
-      json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
-      eval json
-    end
-  end
-end
-
class Html5TokenizerTestCase < Test::Unit::TestCase

  def type_of?(token_name, token)
    token != 'ParseError' and token_name == token.first
  end

  def convert_attribute_arrays_to_hashes(tokens)
    tokens.inject([]) do |tokens, token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      tokens << token
    end
  end

  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |tokens, token|
      if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
        tokens.last[1] = tokens.last[1] + token[1]
        next tokens
      end
      tokens << token
    end
  end

  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        'Description:', data['description'],
        'Input:', data['input'],
        'Content Model Flag:', content_model_flag ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])

        tokenizer.contentModelFlag = content_model_flag.to_sym

        tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_equal expected, actual, message
      end
    end
  end

  html5lib_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
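The tokenizer fixtures read by JSON.parse above are .test files containing a JSON object with a "tests" array; each entry carries the keys the test method reads (description, input, output, and optionally contentModelFlags and lastStartTag). An illustrative entry, not a shipped fixture:

{"tests": [
  {"description": "simple start tag followed by text",
   "input": "<h1 class=x>Hi",
   "output": [["StartTag", "h1", {"class": "x"}], ["Character", "Hi"]]}
]}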
@@ -1,62 +1,62 @@
require 'html5lib/constants'

class TokenizerTestParser
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  def parse
    @outputTokens = []

    debug = nil
    for token in @tokenizer
      debug = token.inspect if token[:type] == :ParseError
      send ('process' + token[:type].to_s), token
    end

    return @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  def processEmptyTag(token)
    if not HTML5lib::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  def processEndTag(token)
    if token[:data].length > 0
      self.processParseError(token)
    end
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  def processEOF(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end