2007-05-26 03:52:27 +02:00
|
|
|
#!/usr/bin/env ruby
|
|
|
|
|
|
|
|
require File.join(File.dirname(__FILE__), 'preamble')
|
|
|
|
|
2007-07-05 00:36:59 +02:00
|
|
|
require 'html5/html5parser'
|
|
|
|
require 'html5/liberalxmlparser'
|
|
|
|
require 'html5/treewalkers'
|
|
|
|
require 'html5/serializer'
|
|
|
|
require 'html5/sanitizer'
|
2007-05-26 03:52:27 +02:00
|
|
|
|
|
|
|
class SanitizeTest < Test::Unit::TestCase
|
2007-07-05 00:36:59 +02:00
|
|
|
include HTML5
|
2007-05-26 03:52:27 +02:00
|
|
|
|
|
|
|
# Parses +stream+ as an XHTML fragment, using HTMLSanitizer as the tokenizer,
# and returns the parsed fragment converted with +to_s+.
# Lowercasing of element and attribute names is switched off.
def sanitize_xhtml(stream)
  parser_options = {
    :tokenizer => HTMLSanitizer,
    :encoding => 'utf-8',
    :lowercase_element_name => false,
    :lowercase_attr_name => false
  }
  fragment = XHTMLParser.parse_fragment(stream, parser_options)
  fragment.to_s
end
|
|
|
|
|
|
|
|
# Parses +stream+ as an HTML fragment, using HTMLSanitizer as the tokenizer,
# and returns the parsed fragment converted with +to_s+.
# Lowercasing of element and attribute names is switched off.
def sanitize_html(stream)
  parser_options = {
    :tokenizer => HTMLSanitizer,
    :encoding => 'utf-8',
    :lowercase_element_name => false,
    :lowercase_attr_name => false
  }
  fragment = HTMLParser.parse_fragment(stream, parser_options)
  fragment.to_s
end
|
|
|
|
|
2007-06-06 07:56:43 +02:00
|
|
|
# Sanitizes +stream+ through the REXML code path: the markup is wrapped in an
# XHTML-namespaced <div>, parsed with REXML, walked with the 'rexml' tree
# walker and serialized by XHTMLSerializer with :sanitize enabled. The
# wrapper <div> is then stripped from the serialized text.
#
# Returns the literal string "Ill-formed XHTML!" when REXML raises a
# ParseException.
def sanitize_rexml(stream)
  # REXML is loaded on first use rather than at file load time.
  require 'rexml/document'

  document = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
  walker = TreeWalkers.get_tree_walker('rexml').new(document)
  serializer_options = {
    :encoding => 'utf-8',
    :quote_char => "'",
    :inject_meta_charset => false,
    :sanitize => true
  }
  serialized = XHTMLSerializer.serialize(walker, serializer_options)

  # Drop the wrapper element that was added before parsing.
  serialized.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
  "Ill-formed XHTML!"
end
|
|
|
|
|
|
|
|
# Runs +input+ through the HTML, XHTML and REXML sanitizing helpers, in that
# order, and asserts that each result equals its corresponding expected
# string (+htmloutput+, +xhtmloutput+, +rexmloutput+).
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  expectations = [
    [htmloutput, :sanitize_html],
    [xhtmloutput, :sanitize_xhtml],
    [rexmloutput, :sanitize_rexml]
  ]
  expectations.each do |expected, sanitizer|
    assert_equal expected, send(sanitizer, input)
  end
end
|
|
|
|
|
2007-05-26 03:52:27 +02:00
|
|
|
# Defines one test per entry in HTMLSanitizer::ALLOWED_ELEMENTS. Each test
# wraps "foo <bad>bar</bad> baz" in the element (with a title attribute) and
# checks the result of all three sanitizing paths. The expected strings start
# from a plain round trip and are overridden for the element groups handled
# in the case statement below.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
  define_method "test_should_allow_#{tag_name}_tag" do
    lowered = tag_name.downcase
    markup = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
    expected_html = "<#{lowered} title='1'>foo <bad>bar</bad> baz</#{lowered}>"
    expected_xhtml = expected_rexml = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"

    case tag_name
    when 'caption', 'colgroup', 'optgroup', 'option', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
      # Both parser paths are expected to drop the element itself.
      expected_html = expected_xhtml = "foo <bad>bar</bad> baz"
    when 'col'
      expected_html = expected_xhtml = "foo <bad>bar</bad> baz"
      expected_rexml = "<col title='1' />"
    when 'table'
      expected_html = expected_xhtml = "foo <bad>bar</bad>baz<table title='1'> </table>"
    when 'image'
      expected_html = expected_xhtml = "<img title='1'/>foo <bad>bar</bad> baz"
      expected_rexml = "<image title='1'>foo <bad>bar</bad> baz</image>"
    else
      if VOID_ELEMENTS.include?(tag_name)
        expected_xhtml = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        # Only the HTML expectation gains a trailing <br/> for the br element.
        expected_html = tag_name == 'br' ? expected_xhtml + '<br/>' : expected_xhtml
        expected_rexml = "<#{tag_name} title='1' />"
      end
    end

    check_sanitization(markup, expected_html, expected_xhtml, expected_rexml)
  end
end
|
|
|
|
|
|
|
|
# Defines one test per allowed element using the UPPERCASED element name.
# The same expected string is used for the HTML, XHTML and REXML paths.
#
# NOTE(review): the expected string keeps the uppercase tag and <bad> as
# literal markup, which is surprising for a "forbid" test — confirm these
# literals were not entity-decoded (e.g. from &lt;…&gt;) before relying on them.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
  define_method "test_should_forbid_#{tag_name.upcase}_tag" do
    shouted = tag_name.upcase
    markup = "<#{shouted} title='1'>foo <bad>bar</bad> baz</#{shouted}>"
    expected = "<#{shouted} title=\"1\">foo <bad>bar</bad> baz</#{shouted}>"
    check_sanitization(markup, *Array.new(3, expected))
  end
end
|
|
|
|
|
|
|
|
# Defines one test per entry in HTMLSanitizer::ALLOWED_ATTRIBUTES, skipping
# 'style'. Each test puts the attribute on a <p> and expects it to be kept;
# the HTML path expects the attribute name lowercased.
HTMLSanitizer::ALLOWED_ATTRIBUTES.reject { |name| name == 'style' }.each do |attribute_name|
  define_method "test_should_allow_#{attribute_name}_attribute" do
    markup = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
    expected = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
    expected_html = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
    check_sanitization(markup, expected_html, expected, expected)
  end
end
|
|
|
|
|
|
|
|
# Defines one test per allowed attribute using the UPPERCASED attribute name.
# Each test expects the attribute to be removed from the <p> on all three
# sanitizing paths.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
  define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
    shouted = attribute_name.upcase
    markup = "<p #{shouted}='display: none;'>foo <bad>bar</bad> baz</p>"
    expected = "<p>foo <bad>bar</bad> baz</p>"
    check_sanitization(markup, *Array.new(3, expected))
  end
end
|
|
|
|
|
|
|
|
# Defines one test per entry in HTMLSanitizer::ALLOWED_PROTOCOLS. Each test
# uses the protocol as the href of an <a> and expects the link to be kept,
# with the attribute re-quoted using single quotes.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_#{protocol}_uris" do
    anchor = %(<a href="#{protocol}">foo</a>)
    expected = "<a href='#{protocol}'>foo</a>"
    check_sanitization(anchor, *Array.new(3, expected))
  end
end
|
|
|
|
|
|
|
|
# Defines one test per allowed protocol using the UPPERCASED protocol as the
# href of an <a>. Each test expects the link to be kept with the uppercase
# value intact and the attribute re-quoted using single quotes.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_uppercase_#{protocol}_uris" do
    shouted = protocol.upcase
    anchor = %(<a href="#{shouted}">foo</a>)
    expected = "<a href='#{shouted}'>foo</a>"
    check_sanitization(anchor, *Array.new(3, expected))
  end
end
|
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
# Checks that text containing characters outside the Basic Multilingual Plane
# comes out as the expected UTF-8 byte sequences (written as octal escapes)
# on every sanitizing path, both directly inside a <p> and inside a nested
# <tspan>.
def test_should_handle_astral_plane_characters
  cases = [
    ["<p>𝒵 𝔸</p>", "<p>\360\235\222\265 \360\235\224\270</p>"],
    ["<p><tspan>\360\235\224\270</tspan> a</p>", "<p><tspan>\360\235\224\270</tspan> a</p>"]
  ]
  cases.each do |markup, expected|
    check_sanitization(markup, expected, expected, expected)
  end
end
|
2007-06-10 22:07:26 +02:00
|
|
|
|
|
|
|
# This affects only Netscape 4 (NS4). Is it worth fixing?
|
|
|
|
# def test_javascript_includes
|
|
|
|
# input = %(<div size="&{alert('XSS')}">foo</div>)
|
|
|
|
# output = "<div>foo</div>"
|
|
|
|
# check_sanitization(input, output, output, output)
|
|
|
|
# end
|
|
|
|
|
2007-07-05 00:36:59 +02:00
|
|
|
# Defines one test per case found in the 'sanitizer' fixture files. Each file
# is parsed as JSON and iterated; every case supplies 'name', 'input' and
# 'output', plus optional 'xhtml' and 'rexml' expectations that fall back to
# 'output' when absent.
# NOTE(review): html5_test_files is defined outside this file; it is presumed
# to return paths to the fixture files — confirm against its definition.
html5_test_files('sanitizer').each do |filename|
  # File.read closes the file once it has been read. The previous
  # open(filename).read left the handle open until garbage collection and
  # went through Kernel#open.
  JSON.parse(File.read(filename)).each do |test|
    define_method "test_#{test['name']}" do
      check_sanitization(
        test['input'],
        test['output'],
        test['xhtml'] || test['output'],
        test['rexml'] || test['output']
      )
    end
  end
end
|
2007-05-26 03:52:27 +02:00
|
|
|
end
|