Fix Unicode bug

Fix Diego Restrepo's bug (see Rev 184).
Update to latest HTML5lib.
This commit is contained in:
Jacques Distler 2007-12-17 03:17:43 -06:00
parent 18da1a1d71
commit 0f6889e09f
29 changed files with 380 additions and 498 deletions

View file

@ -0,0 +1,71 @@
require 'test/unit'
# Three directory levels above this file.
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Directory holding the test fixtures: HTML5_BASE/ruby/testdata when that
# exists, otherwise HTML5_BASE/testdata.
# File.exist? is used because the File.exists? alias was removed in Ruby 3.2.
TESTDATA_DIR =
  if File.exist?(File.join(HTML5_BASE, 'ruby', 'testdata'))
    File.join(HTML5_BASE, 'ruby', 'testdata')
  else
    File.join(HTML5_BASE, 'testdata')
  end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)), 'lib')
$:.unshift File.dirname(__FILE__)
require 'core_ext/string'
# Returns the paths matching '*.*' directly inside +subdirectory+ of
# TESTDATA_DIR (entries without a dot in their name are not matched).
def html5_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
require 'rubygems'
require 'json'
module HTML5
  # Helpers shared by the test cases: tree-dump normalisation and a reader for
  # the sectioned fixture files.
  module TestSupport
    # convert the output of str(document) to the format used in the testcases:
    # the first line is dropped, and every remaining line that is longer than
    # two characters and starts with "|" loses its first three characters.
    def convertTreeDump(treedump)
      treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=value" lines so
    # that attribute order does not influence comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

    # Enumerates the records of a fixture file. A record is made of sections;
    # a section starts with a line "#<name>" where <name> is one of +sections+,
    # and a new record starts whenever the first section name appears again.
    class TestData
      include Enumerable

      # filename:: path of the fixture file
      # sections:: section names, in the order their values are yielded
      def initialize(filename, sections)
        # Read everything up front inside a block so that the file handle is
        # closed immediately instead of staying open for the object's lifetime.
        @lines = File.open(filename) { |file| file.readlines }
        @sections = sections
      end

      # Yields one array per record, holding the section texts in the order of
      # +sections+ (nil for a section the record does not contain).
      def each
        data = {}
        key = nil
        @lines.each do |line|
          if line[0] == ?# and @sections.include?(line[1..-2])
            heading = line[1..-2]
            if data.any? and heading == @sections[0]
              # Extra chomp for the last section of a record that is followed
              # by another record (normaliseOutput chomps once more).
              data[key].chomp! #Remove trailing newline
              yield normaliseOutput(data)
              data = {}
            end
            key = heading
            data[key] = ''.dup
          elsif key
            data[key] << line
          end
        end
        # Nothing was collected for an empty file or one without any known
        # section header; do not yield a record made only of nils.
        yield normaliseOutput(data) unless data.empty?
      end

      # Chomps every collected section once and returns the values ordered
      # like +sections+.
      def normaliseOutput(data)
        #Remove trailing newlines
        data.keys.each { |key| data[key].chomp! }
        @sections.map {|heading| data[heading]}
      end
    end
  end
end

View file

@ -0,0 +1,16 @@
require File.join(File.dirname(__FILE__), 'preamble')
require "html5/cli"
# Exercises HTML5::CLI.open_input and HTML5::CLI.parse_opts.
class TestCli < Test::Unit::TestCase
  # '-' must give back $stdin; a URL must give a StringIO and a path a File.
  def test_open_input
    assert_equal $stdin, HTML5::CLI.open_input('-')
    [
      [StringIO, 'http://whatwg.org/'],
      [File, 'testdata/sites/google-results.htm']
    ].each do |klass, source|
      assert_kind_of klass, HTML5::CLI.open_input(source)
    end
  end

  # The short and the long switch must both select the tree builder.
  def test_parse_opts
    HTML5::CLI.parse_opts [] # TODO test defaults
    %w(-b --treebuilder).each do |switch|
      assert_equal 'hpricot', HTML5::CLI.parse_opts([switch, 'hpricot']).treebuilder
    end
  end
end

35
vendor/plugins/HTML5lib/test/test_encoding.rb vendored Executable file
View file

@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/inputstream'
# Checks the encoding detected by HTML5::HTMLInputStream.
class Html5EncodingTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  # The chardet test is only defined when UniversalDetector can be required.
  # The rescue must belong to this begin block (not to the method body) so
  # that a missing library prints the notice instead of aborting the load of
  # the whole file with an unhandled LoadError.
  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet #TODO: can we get rid of this?
      path = File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt')
      # Block form closes the fixture once the assertion has run
      File.open(path, 'r') do |file|
        stream = HTML5::HTMLInputStream.new(file, :chardet => true)
        assert_equal 'big5', stream.char_encoding.downcase
      end
    end
  rescue LoadError
    puts "chardet not found, skipping chardet tests"
  end

  # Defines one test per record of every fixture in the 'encoding' directory:
  # the "data" section is the input, the "encoding" section the expected
  # encoding (compared case-insensitively).
  html5_test_files('encoding').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
    TestData.new(test_file, %w(data encoding)).
      each_with_index do |(input, encoding), index|
        define_method 'test_%s_%d' % [ test_name, index + 1 ] do
          stream = HTML5::HTMLInputStream.new(input, :chardet => false)
          assert_equal encoding.downcase, stream.char_encoding.downcase, input
        end
      end
  end
end

View file

@ -0,0 +1,26 @@
require File.join(File.dirname(__FILE__), 'preamble')
require "test/unit"
require "html5/inputstream"
# Position tracking and buffering checks for HTML5::HTMLInputStream.
class TestHtml5Inputstream < Test::Unit::TestCase
  # Reading a leading newline and pushing it back must report position [1, 0].
  def test_newline_in_queue
    stream = HTML5::HTMLInputStream.new("\nfoo")
    first_char = stream.char
    stream.unget(first_char)
    assert_equal [1, 0], stream.position
  end

  # After consuming 1022 characters of the repeated alphabet the next one is "i".
  def test_buffer_boundary
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    stream = HTML5::HTMLInputStream.new(alphabet * 50, :encoding => 'windows-1252')
    1022.times { stream.char }
    assert_equal "i", stream.char
  end

  # chars_until stops before the given character, or returns everything when
  # that character never occurs.
  def test_chars_until
    [["b", "aaaaaaa"], ["c", "aaaaaaab"]].each do |stop_char, consumed|
      stream = HTML5::HTMLInputStream.new("aaaaaaab")
      assert_equal consumed, stream.chars_until(stop_char)
    end
  end
end

279
vendor/plugins/HTML5lib/test/test_lxp.rb vendored Executable file
View file

@ -0,0 +1,279 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/liberalxmlparser'
# Matches a start (or self-closing) tag carrying at least one double-quoted
# attribute: group 1 is the tag name with trailing whitespace, group 2 the
# attribute list, group 3 an optional "/".
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/

# Parses the chomped +input+ with +parser+ (element and attribute names keep
# their case) and checks the serialised root, with single quotes turned into
# double quotes.
#
# With +expected+ given, the serialisation must equal it exactly. Without it,
# the input itself is the expectation: numeric character references are
# decoded and attributes are sorted on both sides before comparing.
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
  # Rewrites every tag matched by XMLELEM with its attributes in sorted order
  sort_attributes = lambda do |markup|
    markup.gsub(XMLELEM) { "<#{$1}#{$2.split.sort.join(' ')}#{$3}>" }
  end

  source = input.chomp
  root = parser.parse(source, :lowercase_attr_name => false, :lowercase_element_name => false).root
  serialised = root.to_s.gsub(/'/, '"')

  if expected
    assert_equal(expected, serialised)
  else
    wanted = sort_attributes.call(source).gsub(/&#(\d+);/) { [$1.to_i].pack('U') }
    assert_equal(wanted, sort_attributes.call(serialised))
  end
end
# Delegates to assert_xml_equal with the same arguments; the only difference
# is that the default parser is HTML5::XHTMLParser.
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
assert_xml_equal(input, expected, parser)
end
# Fragments without html/head/body wrappers, parsed with the XHTML parser.
class BasicXhtml5Test < Test::Unit::TestCase
  # Mis-nested </b></i> end tags come back properly nested inside a full document.
  def test_title_body_mismatched_close
    markup = '<title>Xhtml</title><b><i>content</b></i>'
    document = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>Xhtml</title></head>',
      '<body><b><i>content</i></b></body>',
      '</html>'
    ].join
    assert_xhtml_equal(markup, document)
  end

  # "&ntilde" without the closing semicolon becomes the character U+00F1.
  def test_title_body_named_charref
    n_tilde = [0xF1].pack('U')
    document = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>ntilde</title></head>',
      "<body>A #{n_tilde} B</body>",
      '</html>'
    ].join
    assert_xhtml_equal('<title>ntilde</title>A &ntilde B', document)
  end
end
# Small XML snippets run through assert_xml_equal.
class BasicXmlTest < Test::Unit::TestCase
  # No expected value: the comment must survive the round trip unchanged.
  def test_comment
    assert_xml_equal("<x><!-- foo --></x>")
  end

  # test name suffix => [input, expected serialisation]
  {
    'cdata'           => ["<x><![CDATA[foo]]></x>", "<x>foo</x>"],
    'simple_text'     => ["<p>foo</p>", "<p>foo</p>"],
    'optional_close'  => ["<p>foo", "<p>foo</p>"],
    'html_mismatched' => ["<b><i>foo</b></i>", "<b><i>foo</i></b>"]
  }.each do |suffix, (input, expected)|
    define_method("test_#{suffix}") { assert_xml_equal(input, expected) }
  end
end
# OPML snippets: mixed-case names must be preserved and a bare "&" escaped.
class OpmlTest < Test::Unit::TestCase
  def test_mixedCaseElement
    assert_xml_equal(opml('<head><ownerName>Dave Winer</ownerName></head>'))
  end

  def test_mixedCaseAttribute
    assert_xml_equal(opml('<body><outline isComment="true"/></body>'))
  end

  # The unescaped ampersand in the attribute value is expected back as &amp;
  def test_malformed
    assert_xml_equal(
      opml('<body><outline text="Odds & Ends"/></body>'),
      opml('<body><outline text="Odds &amp; Ends"/></body>'))
  end

  private

  # Wraps +inner+ in the <opml version="1.0"> root element used by every test.
  def opml(inner)
    '<opml version="1.0">' + inner + '</opml>'
  end
end
# Whole XHTML documents run through assert_xhtml_equal. Tests that pass a
# single heredoc have no explicit expectation, so the input document itself is
# what the parser output is compared against; tests that pass two heredocs
# compare against the second one (stripped).
class XhtmlTest < Test::Unit::TestCase
# Embedded MathML, including numeric character references (&#177;, &#8290;).
def test_mathml
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>MathML</title></head>
<body>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mrow>
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mrow>
<mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mrow>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>-</mo>
<mrow>
<mn>4</mn>
<mo>&#8290;</mo>
<mi>a</mi>
<mo>&#8290;</mo>
<mi>c</mi>
</mrow>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
</math>
</body></html>
EOX
end
# Embedded SVG with a mixed-case attribute (viewBox) and an attribute value
# that spans two lines.
def test_svg
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SVG</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
</path>
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
</circle>
</svg>
</body></html>
EOX
end
# SVG using a namespace prefix declaration (xmlns:l) and prefixed l:href attributes.
def test_xlink
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<defs xmlns:l="http://www.w3.org/1999/xlink">
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
<stop stop-color="#FE8"/>
<stop stop-color="#D70" offset="1"/>
</radialGradient>
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
</defs>
<g stroke="#940">
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
</g>
</svg>
</body></html>
EOX
end
# A self-closing <br/> must round-trip as written.
def test_br
assert_xhtml_equal <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>BR</title></head>
<body>
<br/>
</body></html>
EOX1
end
# An empty <strong></strong> pair must round-trip as written.
def test_strong
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>STRONG</title></head>
<body>
<strong></strong>
</body></html>
EOX
end
# Escaped &lt; and &amp; inside <script> must round-trip as written.
def test_script
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX
end
# A self-closing <script src=.../> in the input is expected back with an
# explicit </script> end tag.
def test_script_src
assert_xhtml_equal <<EOX1, <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"/></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"></script></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX2
end
# Escaped &lt; and &amp; inside <title> must round-trip as written.
def test_title
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>1 &lt; 2 &amp; 3</title></head>
<body>
</body></html>
EOX
end
# The XML declaration of the input is absent from the expected output.
def test_prolog
assert_xhtml_equal <<EOX1, <<EOX2.strip
<?xml version="1.0" encoding="UTF-8" ?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX2
end
# Mis-nested <u>, <blockquote> and <p> with missing end tags; the expected
# output repeats an empty <u/> at each nesting level and closes p and blockquote.
def test_tagsoup
assert_xhtml_equal <<EOX1, <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>TAGSOUP</title></head>
<body>
<u><blockquote><p></u>
</body></html>
EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>TAGSOUP</title></head>
<body>
<u/><blockquote><u/><p><u/>
</p></blockquote></body></html>
EOX2
end
end

View file

@ -0,0 +1,62 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/treebuilders'
require 'html5/html5parser'
# Names of the tree builders exercised by the parser tests. 'hpricot' is only
# added when the library can be required.
$tree_types_to_test = %w(simpletree rexml)
begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot is not available: keep the two tree types listed above
end
# Tree-construction tests: for every record of every fixture file and every
# entry of $tree_types_to_test, a test is generated that parses the input and
# compares both the serialised tree and the reported parse errors.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    sections = %w(data errors document-fragment document)

    TestData.new(test_file, sections).each_with_index do |(input, error_text, inner_html, document), index|
      expected_errors = error_text.split("\n")
      # Drop the "| " prefix of every line (the first line loses its two
      # leading characters through the slice).
      expected = document.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        define_method "test_#{test_name}_#{index + 1}_#{tree_name}" do
          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
          # A document-fragment section switches to fragment parsing
          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
          tree_report = ['', 'Input:', input, '', 'Expected:', expected, '', 'Recieved:', actual_output] * "\n"
          assert_equal sortattrs(expected), sortattrs(actual_output), tree_report

          actual_errors = parser.errors.map do |(line, col), message, datavars|
            'Line: %i Col: %i %s' % [line, col, E[message] % datavars]
          end
          error_report = [
            '', 'Input', input,
            '', "Expected errors (#{expected_errors.length}):", expected_errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n") + "\n"
          ] * "\n"
          assert_equal expected_errors, actual_errors, error_report
        end
      end
    end
  end
end

View file

@ -0,0 +1,173 @@
#!/usr/bin/env ruby
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/html5parser'
require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/serializer'
require 'html5/sanitizer'
# Sanitizer tests. Every input is sanitized three ways (HTML parser, XHTML
# parser, REXML tree + XHTML serializer) and each result is compared with its
# own expected string.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Sanitizes +stream+ as a fragment with the XHTML parser.
  def sanitize_xhtml stream
    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Sanitizes +stream+ as a fragment with the HTML parser.
  def sanitize_html stream
    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Wraps +stream+ in an XHTML <div>, parses it with REXML, serializes it with
  # sanitizing switched on and strips the wrapper again. Input REXML cannot
  # parse gives the marker string "Ill-formed XHTML!".
  def sanitize_rexml stream
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    return "Ill-formed XHTML!"
  end

  # Asserts the three sanitizer outputs for +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # Allowed elements are kept while the unknown <bad> element inside them is
  # escaped. The branches below list the elements whose expected output
  # differs from the generic case.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # The upper-cased spelling of an allowed element is expected to be escaped.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
      check_sanitization(input, output, output, output)
    end
  end

  # Allowed attributes are kept ('style' is skipped here).
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # The upper-cased spelling of an allowed attribute is expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  # An href consisting of an allowed protocol name is kept.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Same as above with the protocol name upper-cased.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # xlink:href on these elements is kept for local ("#...") references and
  # dropped for absolute URLs, with or without a leading newline in the value.
  HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
    next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)

    define_method "test_#{tag_name}_should_allow_local_href" do
      input = %(<#{tag_name} xlink:href="#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\n#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end
  end

  # Characters outside the Basic Multilingual Plane, given as numeric
  # references or as raw UTF-8 bytes, must come out as UTF-8 bytes.
  def test_should_handle_astral_plane_characters
    input = "<p>&#x1d4b5; &#x1d538;</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # One test per entry of the JSON fixtures in the 'sanitizer' directory. The
  # 'xhtml' and 'rexml' keys override 'output' for the respective pipeline.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the fixture again, unlike open(filename).read
    JSON::parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end

View file

@ -0,0 +1,68 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/html5parser'
require 'html5/serializer'
require 'html5/treewalkers'
# Switch for running the serialize error checks (off).
# NOTE(review): no other visible line reads this local - confirm whether it is still needed.
checkSerializeErrors = false
# Tree walker over a list of token arrays (as stored in the serializer
# fixtures). The first element of each array names the token type, the
# following elements are handed to the matching token constructor.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields the walker token(s) for every entry of @tree; raises on an
  # unrecognised token type.
  def each
    @tree.each do |token|
      kind = token[0]
      if kind == 'StartTag'
        yield start_tag(token[1], token[2])
      elsif kind == 'EndTag'
        yield end_tag(token[1])
      elsif kind == 'EmptyTag'
        yield empty_tag(token[1], token[2])
      elsif kind == 'Comment'
        yield comment(token[1])
      elsif kind == 'Characters' || kind == 'SpaceCharacters'
        # text may produce several tokens; pass each of them on
        text(token[1]) { |text_token| yield text_token }
      elsif kind == 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        raise "Unknown token type: " + token[0]
      end
    end
  end
end
# Serializer tests generated from the JSON fixtures in the 'serializer'
# directory: each test serializes the fixture's token list with the HTML
# serializer and, except for the 'optionaltags' file, with the XHTML one.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the fixture again, unlike open(filename).read
    tests = JSON::parse(File.read(filename))

    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        # The serializers are handed the encoding under a symbol key as well
        if test["options"] and test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["expected"], result, test["description"])

        unless test_name == 'optionaltags'
          result = HTML5::XHTMLSerializer.
            serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
          # An 'xhtml' key overrides the expectation for the XHTML serializer
          assert_serialization(test["xhtml"] || test["expected"], result, test["description"])
        end
      end
    end
  end

  private

  # +expected+ is a list of acceptable outputs. A single entry is compared
  # with assert_equal (using +description+ as the message); with several
  # entries the test fails unless +result+ is one of them.
  def assert_serialization(expected, result, description)
    if expected.length == 1
      assert_equal(expected[0], result, description)
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end
end

View file

@ -0,0 +1,27 @@
require File.join(File.dirname(__FILE__), 'preamble')
require "html5/sniffer"
# Sniffer tests generated from the JSON fixtures in the 'sniffer' directory:
# html_or_feed(input) must equal the 'type' recorded for each entry.
class TestFeedTypeSniffer < Test::Unit::TestCase
  include HTML5
  include TestSupport
  include Sniffer

  html5_test_files('sniffer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')
    JSON.parse(File.read(test_file)).each_with_index do |data, index|
      define_method("test_#{test_name}_#{index + 1}") do
        assert_equal data['type'], html_or_feed(data['input'])
      end
    end
  end

  # Disabled code, kept as found:
  # each_with_index do |t, i|
  # define_method "test_#{i}" do
  # assert_equal t[0], sniff_feed_type(t[1])
  # end
  # end
end

63
vendor/plugins/HTML5lib/test/test_stream.rb vendored Executable file
View file

@ -0,0 +1,63 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/inputstream'
# Character decoding, BOM handling and line accounting of HTMLInputStream.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is replaced by U+FFFD (UTF-8 bytes EF BF BD).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Without an explicit encoding these bytes are treated as windows-1252 and
  # each one comes back as the UTF-8 sequence listed below.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # Only defined when iconv can be required.
  begin
    require 'iconv'

    def test_utf_16
      input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
      stream = HTMLInputStream.new(input)
      # This used to be assert('utf-16-le', stream.char_encoding), which can
      # never fail: the literal is truthy and the second argument is only the
      # failure message. The check is a pattern rather than an exact string
      # because the exact spelling and byte order reported for this input are
      # not visible from here - NOTE(review): tighten to assert_equal once the
      # value returned by char_encoding is confirmed.
      assert_match(/\Autf-?16/i, stream.char_encoding.to_s)
      assert_equal(1025, stream.chars_until(' ', true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and a lone "\r" both come back as "\n"; position is [line, column].
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end

View file

@ -0,0 +1,94 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/tokenizer'
require 'tokenizer_test_parser'
# Tokenizer tests generated from the JSON fixtures in the 'tokenizer' directory.
class Html5TokenizerTestCase < Test::Unit::TestCase
  # Asserts that the received tokens match the expected ones.
  #
  # When +ignoreErrorOrder+ is false/nil the two lists must be equal as they
  # are. (This branch used to return the result of == without asserting
  # anything, so such tests could never fail.) Otherwise both lists are split
  # into non-"ParseError" tokens and "ParseError" tokens, and the two groups
  # are compared, which ignores where the parse errors sit between the tokens.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    if !ignoreErrorOrder
      assert_equal expectedTokens, receivedTokens, message
    else
      #Sort the tokens into two groups; non-parse errors and parse errors
      expected = expectedTokens.partition { |token| token != "ParseError" }
      received = receivedTokens.partition { |token| token != "ParseError" }
      assert_equal expected, received, message
    end
  end

  # True when +token+ is an array token whose first element is +token_name+
  # (the bare string 'ParseError' never matches).
  def type_of?(token_name, token)
    token != 'ParseError' and token_name == token.first
  end

  # Turns the attribute list (third element) of every StartTag token into a
  # Hash, in place. The list is reversed first, so when a name occurs more
  # than once the earliest pair is the one that ends up in the Hash.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      token
    end
  end

  # Merges runs of adjacent Character tokens into a single Character token.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
        next merged
      end
      merged << token
    end
  end

  # Runs one fixture entry once per content model flag (:PCDATA when the
  # entry lists none) and compares the normalised token streams.
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])
        tokenizer.content_model_flag = content_model_flag.to_sym
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse
        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')
    tests = JSON.parse(File.read(test_file))['tests']
    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end
end

View file

@ -0,0 +1,135 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5/html5parser'
require 'html5/treewalkers'
require 'html5/treebuilders'
# Tree builder / tree walker pairs exercised by the walker tests, keyed by
# tree type name.
$tree_types_to_test = {}
%w(simpletree rexml hpricot).each do |tree_name|
  $tree_types_to_test[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker  => HTML5::TreeWalkers[tree_name]
  }
end

puts "Testing tree walkers: #{$tree_types_to_test.keys.join(', ')}"
# Tree walker tests: documents are built by each tree builder in
# $tree_types_to_test, walked with the matching walker, and the token stream
# is rendered as an indented dump that is compared with the expected tree of
# the tree-construction fixtures.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields the tokens of +tokens+ with every run of adjacent :Characters /
  # :SpaceCharacters tokens merged into a single :Characters token.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken.nil?
          charactersToken = {:type => :Characters, :data => token[:data]}
        else
          charactersToken[:data] += token[:data]
        end
      else
        # A non-text token ends the current run of text
        unless charactersToken.nil?
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken unless charactersToken.nil?
  end

  # Renders a token stream as one line per node, indented two spaces per
  # nesting level; attributes are listed sorted below their element and
  # 'xmlns' attributes are left out.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # String#any? only exists on Ruby 1.8 (where String is Enumerable);
        # an explicit emptiness check gives the same answer on every version.
        if token[:name] and !token[:name].to_s.empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      end
    end
    output.join("\n")
  end

  # One test per fixture record and tree type ('tests5' is skipped). A walker
  # that raises NotImplementedError is tolerated.
  html5_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|
        expected = expected.gsub("\n| ","\n")[2..-1]

        $tree_types_to_test.each do |tree_name, tree_class|
          define_method "test_#{test_name}_#{index}_#{tree_name}" do
            parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
            if inner_html
              parser.parse_fragment(input, inner_html)
            else
              parser.parse(input)
            end
            document = parser.tree.get_document

            begin
              output = sortattrs(convertTokens(tree_class[:walker].new(document)))
              expected = sortattrs(expected)
              assert_equal expected, output, [
                '', 'Input:', input,
                '', 'Expected:', expected,
                '', 'Recieved:', output
              ].join("\n")
            rescue NotImplementedError
              # Amnesty for those that confess...
            end
          end
        end
      end
  end

  # Walking "<html></html>" must produce start and end tags for html, head and
  # body, in this order, for every tree type.
  def test_all_tokens
    expected = [
      {:data => [], :type => :StartTag, :name => 'html'},
      {:data => [], :type => :StartTag, :name => 'head'},
      {:data => [], :type => :EndTag, :name => 'head'},
      {:data => [], :type => :StartTag, :name => 'body'},
      {:data => [], :type => :EndTag, :name => 'body'},
      {:data => [], :type => :EndTag, :name => 'html'}]
    $tree_types_to_test.each do |treeName, tree_class|
      parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
      document = parser.parse("<html></html>")
      # document = tree_class.get(:adapter)(document)
      output = tree_class[:walker].new(document)
      expected.zip(output) do |expected_token, output_token|
        assert_equal(expected_token, output_token)
      end
    end
  end
end

View file

@ -0,0 +1,31 @@
#!/usr/bin/env ruby -wKU
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5'
require 'html5/filters/validator'
# Validator tests generated from the JSON fixtures in the 'validator' directory.
class TestValidator < Test::Unit::TestCase
  # Parses test['input'] with HTMLConformanceChecker as the tokenizer and
  # checks the reported error codes (second element of each error entry):
  # the 'fail-if' code must be absent, the 'fail-unless' code must be present.
  def run_validator_test(test)
    parser = HTML5::HTMLParser.new(:tokenizer => HTMLConformanceChecker)
    parser.parse(test['input'])
    error_codes = parser.errors.collect { |error| error[1] }
    if test.has_key?('fail-if')
      assert !error_codes.include?(test['fail-if'])
    end
    if test.has_key?('fail-unless')
      assert error_codes.include?(test['fail-unless'])
    end
  end

  html5_test_files('validator').each do |filename|
    # Block form closes the fixture once it has been parsed
    tests = File.open(filename) { |file| JSON.load(file) }
    test_name = File.basename(filename).sub(".test", "")
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index}" do
        run_validator_test(test)
      end
    end
  end
end

View file

@ -0,0 +1,63 @@
require 'html5/constants'
# Converts the tokens produced by a tokenizer into the array/string notation
# used by the tokenizer fixtures.
class TokenizerTestParser
  # tokenizer:: any object whose #each yields token hashes with a :type key
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Dispatches every token to the method named "process" + its :type and
  # returns the collected output list.
  def parse
    @outputTokens = []
    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end
    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # Reported as a StartTag; preceded by a "ParseError" entry when the element
  # is not listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    if not HTML5::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag whose :data is not empty is preceded by a "ParseError" entry.
  def processEndTag(token)
    if token[:data].length > 0
      self.processParseError(token)
    end
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Space characters are reported exactly like other characters.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end
  alias processSpaceCharacters processCharacters

  # NOTE(review): parse builds method names as "process" + type, so this
  # snake_case name is only reached for a token type of :_eof - confirm
  # whether it is meant to be called at all.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end