REXML Trees
Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
parent
4dd70af5ae
commit
bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions
50
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
50
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
|
@ -21,3 +21,53 @@ rescue LoadError
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
module HTML5lib
  # Helpers shared by the HTML5lib test cases: parsing of the ".dat" style
  # test data (sections introduced by "#data", "#errors", "#document" and
  # "#document-fragment") and normalisation of tree dumps before comparison.
  #
  # startswith? and parseTestcase are module-level methods; convertTreeDump
  # and sortattrs are instance methods made available by including the module.
  module TestSupport
    # Line prefixes that introduce a section and are never stored as content.
    # "#document" also covers "#document-fragment".
    SECTION_MARKERS = ["#errors", "#document", "#data"]

    # Returns true when the string +b+ begins with the prefix +a+.
    # Note the argument order: prefix first, examined string second.
    def self.startswith?(a, b)
      b[0...a.length] == a
    end

    # Splits the text of a single test case into its sections.
    #
    # data:: the test case text, one section marker or content line per line.
    #
    # Returns four values:
    # * the fragment context name taken from a "#document-fragment NAME"
    #   line, or nil when the test has a plain "#document" section;
    # * the input lines joined with "\n";
    # * the expected tree lines joined with "\n", with the leading "| "
    #   removed from every line that starts with "|";
    # * the array of expected error lines.
    #
    # Empty lines are dropped. Raises ArgumentError when a
    # "#document-fragment" marker carries no context name.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input

      data.split(/\n/).each do |line|
        is_marker = SECTION_MARKERS.any? { |marker| startswith?(marker, line) }

        if !line.empty? && !is_marker
          # Identity check on purpose: with == two still-empty arrays compare
          # equal, which stripped the "| " prefix from input lines as well.
          if currentList.equal?(output) && startswith?("|", line)
            currentList.push(line[2..-1].to_s)
          else
            currentList.push(line)
          end
        elsif line == "#errors"
          currentList = errors
        elsif line == "#document" || startswith?("#document-fragment", line)
          if startswith?("#document-fragment", line)
            # Everything after "#document-fragment " is the context element.
            innerHTML = line[19..-1]
            if innerHTML.nil? || innerHTML.empty?
              raise ArgumentError, "#document-fragment requires a context element name"
            end
          end
          currentList = output
        end
      end

      return innerHTML, input.join("\n"), output.join("\n"), errors
    end

    # Converts a tree dump to the format used in the test cases: the first
    # line is discarded and the first three characters are removed from every
    # remaining line that is longer than two characters and starts with "|".
    # An empty dump yields an empty string.
    def convertTreeDump(treedump)
      lines = treedump.split(/\n/)[1..-1] || []
      lines.map { |line| (line.length > 2 && line[0, 1] == "|") ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, identically indented name=value lines so
    # that the order in which attributes were dumped does not affect
    # comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

  end
end
|
||||
|
|
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
|
@ -4,33 +4,33 @@ require 'html5lib/inputstream'
|
|||
|
||||
class Html5EncodingTestCase < Test::Unit::TestCase
|
||||
|
||||
begin
|
||||
begin
|
||||
require 'rubygems'
|
||||
require 'UniversalDetector'
|
||||
|
||||
def test_chardet
|
||||
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
|
||||
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
||||
assert_equal 'big5', stream.char_encoding.downcase
|
||||
end
|
||||
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
|
||||
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
||||
assert_equal 'big5', stream.char_encoding.downcase
|
||||
end
|
||||
end
|
||||
rescue LoadError
|
||||
rescue LoadError
|
||||
puts "chardet not found, skipping chardet tests"
|
||||
end
|
||||
end
|
||||
|
||||
html5lib_test_files('encoding').each do |test_file|
|
||||
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||
html5lib_test_files('encoding').each do |test_file|
|
||||
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||
|
||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
input, encoding = data.split(/\n#encoding\s+/, 2)
|
||||
encoding = encoding.split[0]
|
||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
input, encoding = data.split(/\n#encoding\s+/, 2)
|
||||
encoding = encoding.split[0]
|
||||
|
||||
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
||||
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
||||
end
|
||||
end
|
||||
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
||||
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
|
52
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
52
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
|
@ -14,53 +14,12 @@ end
|
|||
|
||||
$CHECK_PARSER_ERRORS = false
|
||||
|
||||
puts 'Testing: ' + $tree_types_to_test * ', '
|
||||
puts 'Testing tree builders: ' + $tree_types_to_test * ', '
|
||||
|
||||
|
||||
class Html5ParserTestCase < Test::Unit::TestCase
|
||||
|
||||
def self.startswith?(a, b)
|
||||
b[0... a.length] == a
|
||||
end
|
||||
|
||||
def self.parseTestcase(data)
|
||||
innerHTML = nil
|
||||
input = []
|
||||
output = []
|
||||
errors = []
|
||||
currentList = input
|
||||
data.split(/\n/).each do |line|
|
||||
if !line.empty? and !startswith?("#errors", line) and
|
||||
!startswith?("#document", line) and
|
||||
!startswith?("#data", line) and
|
||||
!startswith?("#document-fragment", line)
|
||||
|
||||
if currentList == output and startswith?("|", line)
|
||||
currentList.push(line[2..-1])
|
||||
else
|
||||
currentList.push(line)
|
||||
end
|
||||
elsif line == "#errors"
|
||||
currentList = errors
|
||||
elsif line == "#document" or startswith?("#document-fragment", line)
|
||||
if startswith?("#document-fragment", line)
|
||||
innerHTML = line[19..-1]
|
||||
raise AssertionError unless innerHTML
|
||||
end
|
||||
currentList = output
|
||||
end
|
||||
end
|
||||
return innerHTML, input.join("\n"), output.join("\n"), errors
|
||||
end
|
||||
|
||||
# convert the output of str(document) to the format used in the testcases
|
||||
def convertTreeDump(treedump)
|
||||
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
|
||||
end
|
||||
|
||||
def sortattrs(output)
|
||||
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
|
||||
end
|
||||
include HTML5lib
|
||||
include TestSupport
|
||||
|
||||
html5lib_test_files('tree-construction').each do |test_file|
|
||||
|
||||
|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
|||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
|
||||
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
|
||||
innerHTML, input, expected_output, expected_errors =
|
||||
TestSupport.parseTestcase(data)
|
||||
|
||||
$tree_types_to_test.each do |tree_name|
|
||||
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
||||
|
||||
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
|
||||
parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
|
||||
|
||||
if innerHTML
|
||||
parser.parseFragment(input, innerHTML)
|
||||
|
|
|
@ -2,9 +2,11 @@
|
|||
|
||||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/liberalxmlparser'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5lib/serializer'
|
||||
require 'html5lib/sanitizer'
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
include HTML5lib
|
||||
|
|
52
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
Normal file
52
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
Normal file
|
@ -0,0 +1,52 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/serializer'
|
||||
require 'html5lib/treewalkers'
|
||||
|
||||
#Run the serialize error checks
|
||||
checkSerializeErrors = false
|
||||
|
||||
# Tree walker that replays the token list of a serializer test case.
#
# Each entry of @tree is an array whose first element names the token type
# ('StartTag', 'EndTag', 'EmptyTag', 'Comment', 'Characters',
# 'SpaceCharacters' or 'Doctype') and whose remaining elements are the
# arguments for that token. The token-building helpers (startTag, endTag,
# emptyTag, comment, text, doctype) are presumably supplied by
# HTML5lib::TreeWalkers::Base -- they are not defined in this file.
class JsonWalker < HTML5lib::TreeWalkers::Base
  # Yields one walker token per entry of @tree; text entries may yield
  # several tokens because text() hands its tokens to a block.
  #
  # Raises ArgumentError when an entry has an unrecognised type name.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield startTag(token[1], token[2])
      when 'EndTag'
        yield endTag(token[1])
      when 'EmptyTag'
        yield emptyTag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        text(token[1]) { |textToken| yield textToken }
      when 'Doctype'
        yield doctype(token[1])
      else
        # The original called a nonexistent ValueError method with an
        # undefined local; report the offending type name instead.
        raise ArgumentError, "Unknown token type: #{token[0]}"
      end
    end
  end
end
|
||||
|
||||
# Generates one test method per entry of every serializer test file.
#
# Each file is parsed as JSON; its 'tests' array supplies, per test, the
# "input" token list, optional serializer "options", the list of acceptable
# "expected" serializations and a "description".
class Html5SerializeTestcase < Test::Unit::TestCase
  html5lib_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file once it has been read; the previous
    # open(filename).read left the handle open until garbage collection.
    tests = JSON::parse(File.read(filename))

    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]

        if expected.length == 1
          # A single acceptable result: compare directly for a readable diff.
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          # Several acceptable results: any one of them passes.
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end
      end
    end
  end
end
|
54
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
Executable file
54
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
Executable file
|
@ -0,0 +1,54 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/inputstream'
|
||||
|
||||
# Exercises HTMLInputStream: encoding detection, byte order mark handling,
# character replacement and newline/position bookkeeping.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib

  # Plain 7-bit input is reported as ascii and read back unchanged.
  def test_char_ascii
    stream = HTMLInputStream.new("'")
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is read back as U+FFFD (bytes EF BF BD in UTF-8).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  # A valid UTF-8 sequence is detected as utf-8 and returned as one character.
  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Byte 0x91 is detected as windows-1252 and read back as the UTF-8 bytes
  # E2 80 98.
  def test_char_win1252
    stream = HTMLInputStream.new("\x91")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # A UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # Input starting with the bytes FF FE, followed by 1025 spaces encoded as
  # two bytes each.
  def test_utf_16
    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
    # Was assert(value, message), which passes for any non-nil encoding;
    # compare against the expected label instead.
    assert_equal('utf-16-le', stream.char_encoding)
    assert_equal(1025, stream.chars_until(' ',true).length)
  end

  # "\r\n" and a lone "\r" are both read back as "\n"; @tell, position and
  # @new_lines are checked after each read.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
    assert_equal([3,1], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
    assert_equal([4,5], stream.position)
    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
  end
end
|
110
vendor/plugins/HTML5lib/tests/test_treewalkers.rb
vendored
Normal file
110
vendor/plugins/HTML5lib/tests/test_treewalkers.rb
vendored
Normal file
|
@ -0,0 +1,110 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5lib/treebuilders'
|
||||
|
||||
# Tree implementations exercised by the tree walker tests. Each name maps to
# the builder used to construct the tree and the walker used to traverse it.
# 'hpricot' is deliberately left out of the list (disabled).
$tree_types_to_test = {}
['simpletree', 'rexml'].each do |tree_name|
  $tree_types_to_test[tree_name] = {
    :builder => HTML5lib::TreeBuilders[tree_name],
    :walker => HTML5lib::TreeWalkers[tree_name]
  }
end

puts 'Testing tree walkers: ' + $tree_types_to_test.keys.join(', ')
|
||||
|
||||
# Runs every tree-construction test case through each configured tree
# builder, walks the resulting document with the matching tree walker and
# compares the walked token stream, rendered as an indented dump, with the
# expected tree from the test data.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5lib::TestSupport

  # Yields the given tokens with every run of adjacent :Characters and
  # :SpaceCharacters tokens merged into a single :Characters token.
  # All other tokens are yielded unchanged and in order.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken.nil?
          charactersToken = {:type => :Characters, :data => token[:data]}
        else
          charactersToken[:data] += token[:data]
        end
      else
        # A non-text token ends the current run of text: flush it first.
        unless charactersToken.nil?
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken unless charactersToken.nil?
  end

  # Renders a token stream as the indented text format used for expected
  # trees: two spaces of indentation per open element, attributes sorted and
  # listed below their element. 'xmlns' attributes are omitted. Token types
  # other than tags, comments, doctypes and text are ignored.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name == 'xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching end tag, so close its level here.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    return output.join("\n")
  end

  html5lib_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')

    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?

      innerHTML, input, expected_output, expected_errors =
        HTML5lib::TestSupport::parseTestcase(data)

      $tree_types_to_test.each do |tree_name, treeClass|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])

          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end

          document = parser.tree.getDocument

          begin
            output = sortattrs(convertTokens(treeClass[:walker].new(document)))
            expected = sortattrs(expected_output)
            errorMsg = "\n\nExpected:\n#{expected}\nReceived:\n#{output}\n"
            assert_equal(expected, output, errorMsg)
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end
|
Loading…
Add table
Add a link
Reference in a new issue