#!/usr/bin/env ruby
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib
# Parses +stream+ as a fragment with XHTMLParser, using HTMLSanitizer as the
# tokenizer and utf-8 as the encoding, and returns the parse result rendered
# through +to_s+.
def sanitize_xhtml(stream)
  parser_options = { :tokenizer => HTMLSanitizer, :encoding => 'utf-8' }
  fragment = XHTMLParser.parseFragment(stream, parser_options)
  fragment.to_s
end
# Parses +stream+ as a fragment with HTMLParser, using HTMLSanitizer as the
# tokenizer and utf-8 as the encoding, and returns the parse result rendered
# through +to_s+.
def sanitize_html(stream)
  parser_options = { :tokenizer => HTMLSanitizer, :encoding => 'utf-8' }
  fragment = HTMLParser.parseFragment(stream, parser_options)
  fragment.to_s
end
# REXML-based counterpart of the other sanitize helpers: it loads
# rexml/document, starts building a REXML::Document from a string literal,
# and returns the fixed message Ill-formed XHTML! when anything in the body
# raises a StandardError (bare rescue).
# NOTE(review): the body is incomplete in this copy. The string literal opened
# on the REXML::Document.new line is never closed, and the next line is only
# the tail of a regex substitution that keeps capture group 1. Everything in
# between (presumably a wrapper element around stream and the serialisation
# step) is missing -- TODO restore from upstream before relying on this.
def sanitize_rexml stream
require 'rexml/document'
doc = REXML::Document.new("
(.*)<\/div>$/, '\1')
rescue
return "Ill-formed XHTML!"
end
# Asserts that +input+ sanitizes to the expected string under each pipeline,
# in the order HTML, XHTML, REXML. A later sanitizer is only invoked once the
# earlier assertion has passed. Returns the result of the final assertion.
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  expectations = [
    [htmloutput,  :sanitize_html],
    [xhtmloutput, :sanitize_xhtml],
    [rexmloutput, :sanitize_rexml]
  ]
  outcomes = expectations.map do |expected, sanitizer|
    assert_equal expected, send(sanitizer, input)
  end
  outcomes.last
end
# Defines one test per entry of HTMLSanitizer::ALLOWED_ELEMENTS, named
# test_should_allow_<tag>_tag. Each test wraps sample text in the tag and
# checks all three pipelines through check_sanitization, with per-tag
# overrides of the expected strings.
# NOTE(review): the literals below look damaged in this copy. Several span a
# bare line break, the closing-tag prefix before the interpolated tag name is
# absent, and input has no bad element even though every expectation mentions
# one. Presumably markup was stripped during extraction -- TODO restore the
# literals from upstream before trusting these expectations.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_allow_#{tag_name}_tag" do
input = "<#{tag_name} title='1'>foo
bar baz#{tag_name}>"
# Default expectations: the HTML pipeline is expected to emit the downcased
# tag name; the XHTML and REXML pipelines share the tag name as given.
htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz#{tag_name.downcase}>"
xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz#{tag_name}>"
rexmloutput = xhtmloutput
# Per-tag overrides. In the first branch only the HTML and XHTML expectations
# change; rexmloutput keeps its default.
if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
htmloutput = "foo <bad>bar</bad> baz"
xhtmloutput = htmloutput
elsif tag_name == 'col'
htmloutput = "foo <bad>bar</bad> baz"
xhtmloutput = htmloutput
rexmloutput = "
"
elsif tag_name == 'table'
htmloutput = "foo <bad>bar</bad>baz
"
xhtmloutput = htmloutput
elsif tag_name == 'image'
htmloutput = "
foo <bad>bar</bad> baz"
xhtmloutput = htmloutput
rexmloutput = "
foo <bad>bar</bad> baz"
elsif VOID_ELEMENTS.include?(tag_name)
# Void elements: the HTML and XHTML expectation is a self-closed tag followed
# by the text; the REXML expectation is the self-closed tag alone.
htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
xhtmloutput = htmloutput
rexmloutput = "<#{tag_name} title='1' />"
end
check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
end
end
# Defines test_should_forbid_<TAG>_tag for every allowed element name after
# upcasing it. The same expected string is passed for all three pipelines.
# NOTE(review): input spans a bare line break and lacks both the bad element
# and the closing-tag prefix that the expected output contains; the angle
# brackets in output were presumably entity-escaped upstream. The literals
# look stripped in this copy -- TODO restore from upstream.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
input = "<#{tag_name.upcase} title='1'>foo
bar baz#{tag_name.upcase}>"
output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
check_sanitization(input, output, output, output)
end
end
# Defines test_should_allow_<attribute>_attribute for every entry of
# HTMLSanitizer::ALLOWED_ATTRIBUTES except style, which is skipped.
# NOTE(review): none of the literals below interpolate attribute_name, so as
# written every generated test checks the same strings, and htmloutput equals
# output. The element and attribute markup appears to be missing from this
# copy -- TODO restore from upstream.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
next if attribute_name == 'style'
define_method "test_should_allow_#{attribute_name}_attribute" do
input = "
foo bar baz
"
output = "
foo <bad>bar</bad> baz
"
htmloutput = "
foo <bad>bar</bad> baz
"
# htmloutput is used for the HTML pipeline; output for XHTML and REXML.
check_sanitization(input, htmloutput, output, output)
end
end
# Defines test_should_forbid_<ATTRIBUTE>_attribute for every allowed attribute
# name after upcasing it. Unlike the allow loop, style is not skipped here.
# The same expected string is passed for all three pipelines.
# NOTE(review): the literals never interpolate attribute_name, so every
# generated test is identical as written; the markup carrying the upcased
# attribute appears to be missing from this copy -- TODO restore from upstream.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
input = "
foo bar baz
"
output = "
foo <bad>bar</bad> baz
"
check_sanitization(input, output, output, output)
end
end
# Defines test_should_allow_<protocol>_uris for every entry of
# HTMLSanitizer::ALLOWED_PROTOCOLS. The same expected string is passed for
# all three pipelines.
# NOTE(review): protocol is never interpolated into input or output, so no
# URI is actually exercised as written; presumably a link element using the
# protocol was stripped from this copy -- TODO restore from upstream.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_#{protocol}_uris" do
input = %(
foo)
output = "
foo"
check_sanitization(input, output, output, output)
end
end
# Defines test_should_allow_uppercase_<protocol>_uris for every entry of
# HTMLSanitizer::ALLOWED_PROTOCOLS. The same expected string is passed for
# all three pipelines.
# NOTE(review): despite the name, nothing below upcases or even references
# protocol inside the literals, so the body matches the lowercase variant
# exactly; the URI markup appears to be missing from this copy -- TODO
# restore from upstream.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
input = %(
foo)
output = "
foo"
check_sanitization(input, output, output, output)
end
end
# Checks that characters outside the Basic Multilingual Plane pass through
# all three pipelines. The octal escapes in the expectations are the UTF-8
# byte sequences of U+1D4B5 (\360\235\222\265) and U+1D538 (\360\235\224\270),
# the two characters written literally in the first input.
# NOTE(review): every literal here starts and ends with a bare line break,
# which suggests surrounding markup was stripped from this copy (the first
# input was presumably written with character references) -- TODO restore
# from upstream.
def test_should_handle_astral_plane_characters
input = "
𝒵 𝔸
"
output = "
\360\235\222\265 \360\235\224\270
"
check_sanitization(input, output, output, output)
# Second case: input and expected output are the same byte-escaped string.
input = "
\360\235\224\270 a
"
output = "
\360\235\224\270 a
"
check_sanitization(input, output, output, output)
end
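# This affects only NS4. Is it worth fixing?
# NOTE: the markup inside the two literals below is missing from this copy;
# restore it from the upstream test before re-enabling this test.
# def test_javascript_includes
# input = %(foo)
# output = "foo"
# check_sanitization(input, output, output, output)
# end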
# Defines one test per case found in the sanitizer fixture files returned by
# html5lib_test_files('sanitizer'). Each file is parsed as JSON and iterated;
# every entry supplies name, input and output, plus optional xhtml and rexml
# expectations that fall back to output when absent.
html5lib_test_files('sanitizer').each do |filename|
  # File.read closes the file before returning. The previous
  # open(filename).read left the handle open until garbage collection and
  # went through Kernel#open, which treats a leading pipe as a command.
  JSON.parse(File.read(filename)).each do |test|
    define_method "test_#{test['name']}" do
      check_sanitization(
        test['input'],
        test['output'],
        test['xhtml'] || test['output'],
        test['rexml'] || test['output']
      )
    end
  end
end
end