REXML Trees

Synced with latest HTML5lib.
Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
Jacques Distler 2007-06-05 16:34:49 -05:00
parent 4dd70af5ae
commit bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions

View file

@ -21,3 +21,53 @@ rescue LoadError
end
end
end
module HTML5lib
module TestSupport
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
match.split("\n").sort.join("\n")
end
end
end
end

View file

@ -4,33 +4,33 @@ require 'html5lib/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase
begin
begin
require 'rubygems'
require 'UniversalDetector'
def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
end
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
end
end
rescue LoadError
rescue LoadError
puts "chardet not found, skipping chardet tests"
end
end
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
end
end

View file

@ -14,53 +14,12 @@ end
$CHECK_PARSER_ERRORS = false
puts 'Testing: ' + $tree_types_to_test * ', '
puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
include HTML5lib
include TestSupport
html5lib_test_files('tree-construction').each do |test_file|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
innerHTML, input, expected_output, expected_errors =
TestSupport.parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
if innerHTML
parser.parseFragment(input, innerHTML)

View file

@ -2,9 +2,11 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/sanitizer'
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib

View file

@ -0,0 +1,52 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/serializer'
require 'html5lib/treewalkers'
#Run the serialize error checks
checkSerializeErrors = false
class JsonWalker < HTML5lib::TreeWalkers::Base
def each
@tree.each do |token|
case token[0]
when 'StartTag'
yield startTag(token[1], token[2])
when 'EndTag'
yield endTag(token[1])
when 'EmptyTag'
yield emptyTag(token[1], token[2])
when 'Comment'
yield comment(token[1])
when 'Characters', 'SpaceCharacters'
text(token[1]) {|textToken| yield textToken}
when 'Doctype'
yield doctype(token[1])
else
raise ValueError("Unknown token type: " + type)
end
end
end
end
class Html5SerializeTestcase < Test::Unit::TestCase
html5lib_test_files('serializer').each do |filename|
test_name = File.basename(filename).sub('.test', '')
tests = JSON::parse(open(filename).read)
tests['tests'].each_with_index do |test, index|
define_method "test_#{test_name}_#{index+1}" do
result = HTML5lib::HTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["expected"]
if expected.length == 1
assert_equal(expected[0], result, test["description"])
elsif !expected.include?(result)
flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
end
end
end
end
end

54
vendor/plugins/HTML5lib/tests/test_stream.rb vendored Executable file
View file

@ -0,0 +1,54 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib
def test_char_ascii
stream = HTMLInputStream.new("'")
assert_equal('ascii', stream.char_encoding)
assert_equal("'", stream.char)
end
def test_char_null
stream = HTMLInputStream.new("\x00")
assert_equal("\xef\xbf\xbd", stream.char)
end
def test_char_utf8
stream = HTMLInputStream.new("\xe2\x80\x98")
assert_equal('utf-8', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_char_win1252
stream = HTMLInputStream.new("\x91")
assert_equal('windows-1252', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_bom
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
assert_equal('utf-8', stream.char_encoding)
assert_equal("'", stream.char)
end
def test_utf_16
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
assert(stream.char_encoding, 'utf-16-le')
assert_equal(1025, stream.chars_until(' ',true).length)
end
def test_newlines
stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
assert_equal(0, stream.instance_eval {@tell})
assert_equal("a\nbb\n", stream.chars_until('c'))
assert_equal(6, stream.instance_eval {@tell})
assert_equal([3,1], stream.position)
assert_equal("ccc\ndddd", stream.chars_until('x'))
assert_equal(14, stream.instance_eval {@tell})
assert_equal([4,5], stream.position)
assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
end
end

View file

@ -0,0 +1,110 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
$tree_types_to_test = {
'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']},
'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']},
# 'hpricot' =>
# {:builder => HTML5lib::TreeBuilders['hpricot'],
# :walker => HTML5lib::TreeWalkers['hpricot']},
}
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
class TestTreeWalkers < Test::Unit::TestCase
include HTML5lib::TestSupport
def concatenateCharacterTokens(tokens)
charactersToken = nil
for token in tokens
type = token[:type]
if [:Characters, :SpaceCharacters].include?(type)
if charactersToken == nil
charactersToken = {:type => :Characters, :data => token[:data]}
else
charactersToken[:data] += token[:data]
end
else
if charactersToken != nil
yield charactersToken
charactersToken = nil
end
yield token
end
end
yield charactersToken if charactersToken != nil
end
def convertTokens(tokens)
output = []
indent = 0
concatenateCharacterTokens(tokens) do |token|
case token[:type]
when :StartTag, :EmptyTag
output << "#{' '*indent}<#{token[:name]}>"
indent += 2
for name, value in token[:data].to_a.sort
next if name=='xmlns'
output << "#{' '*indent}#{name}=\"#{value}\""
end
indent -= 2 if token[:type] == :EmptyTag
when :EndTag
indent -= 2
when :Comment
output << "#{' '*indent}<!-- #{token[:data]} -->"
when :Doctype
output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
when :Characters, :SpaceCharacters
output << "#{' '*indent}\"#{token[:data]}\""
else
# TODO: what to do with errors?
end
end
return output.join("\n")
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors =
HTML5lib::TestSupport::parseTestcase(data)
rexml = $tree_types_to_test['rexml']
$tree_types_to_test.each do |tree_name, treeClass|
define_method "test_#{test_name}_#{index}_#{tree_name}" do
parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
document = parser.tree.getDocument
begin
output = sortattrs(convertTokens(treeClass[:walker].new(document)))
expected = sortattrs(expected_output)
errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n"
assert_equal(expected, output, errorMsg)
rescue NotImplementedError
# Amnesty for those that confess...
end
end
end
end
end
end