a6429f8c22
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
179 lines
7.3 KiB
Ruby
179 lines
7.3 KiB
Ruby
#!/usr/bin/env ruby
|
|
|
|
require File.join(File.dirname(__FILE__), 'preamble')
|
|
|
|
require 'html5/html5parser'
|
|
require 'html5/liberalxmlparser'
|
|
require 'html5/treewalkers'
|
|
require 'html5/serializer'
|
|
require 'html5/sanitizer'
|
|
|
|
# Functional tests for HTMLSanitizer.
#
# Every test pushes one input through three pipelines -- the HTML fragment
# parser, the XHTML fragment parser, and REXML followed by the sanitizing
# XHTML serializer -- and compares each result with its own expected string
# (see #check_sanitization).  Most tests are generated at class-load time from
# the sanitizer's allow-lists and from the JSON fixtures returned by
# html5_test_files('sanitizer').
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Marker returned by #sanitize_rexml when REXML cannot parse the input.
  ILL_FORMED_XHTML = "Ill-formed XHTML!"

  # Options shared by #sanitize_html and #sanitize_xhtml.  HTMLSanitizer is
  # plugged in as the tokenizer and name lower-casing is switched off.  A fresh
  # hash is built on every call so that nothing is shared between parser runs.
  def sanitizing_parser_options
    {
      :tokenizer              => HTMLSanitizer,
      :encoding               => 'utf-8',
      :lowercase_element_name => false,
      :lowercase_attr_name    => false
    }
  end

  # Parses +stream+ as a fragment with XHTMLParser (HTMLSanitizer as the
  # tokenizer) and returns the joined result.
  def sanitize_xhtml(stream)
    XHTMLParser.parse_fragment(stream, sanitizing_parser_options).join
  end

  # Parses +stream+ as a fragment with HTMLParser (HTMLSanitizer as the
  # tokenizer) and returns the joined result.
  def sanitize_html(stream)
    HTMLParser.parse_fragment(stream, sanitizing_parser_options).join
  end

  # Wraps +stream+ in an XHTML-namespaced <div>, parses it with REXML, walks
  # the tree with the 'rexml' tree walker and serializes it through
  # XHTMLSerializer with :sanitize => true.  The wrapper <div> is stripped from
  # the serialized text before it is returned.
  #
  # Returns ILL_FORMED_XHTML ("Ill-formed XHTML!") when REXML raises
  # REXML::ParseException.
  def sanitize_rexml(stream)
    # Required here rather than at the top of the file, as in the original
    # code, so REXML is only loaded when this pipeline is exercised.
    require 'rexml/document'

    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    serialized = XHTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
                                                    :quote_char => "'",
                                                    :inject_meta_charset => false,
                                                    :sanitize => true})

    # The pattern is anchored with \A, so it can match at most once.  The dots
    # of the namespace URI are escaped so only the literal wrapper is removed.
    serialized.sub(%r{\A<div xmlns='http://www\.w3\.org/1999/xhtml'>(.*)</div>\Z}m, '\1')
  rescue REXML::ParseException
    ILL_FORMED_XHTML
  end

  # Asserts that +input+ produces +htmloutput+ via #sanitize_html,
  # +xhtmloutput+ via #sanitize_xhtml and +rexmloutput+ via #sanitize_rexml.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per allowed element.  The default expectation keeps the element
  # and its title attribute; the branches below override it for elements whose
  # expected output differs (table parts, col, table, image, void elements).
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput  = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        # The fragment parsers are expected to drop these tags; REXML keeps them.
        htmloutput  = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput  = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput  = "foo <bad>bar</bad>baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        # The fragment parsers are expected to turn <image> into a void <img>.
        htmloutput  = "<img title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput  = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        # Only the HTML expectation gets the extra <br/>; xhtmloutput was
        # assigned above and is left untouched.
        # NOTE(review): presumably the stray </br> end tag is parsed as another
        # <br> by the HTML parser only -- confirm.
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end

      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # One test per allowed element, spelled in upper case.  All three pipelines
  # are expected to return the same +output+ string.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input  = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed attribute ('style' is skipped here).
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'

    define_method "test_should_allow_#{attribute_name}_attribute" do
      input      = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output     = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"

      # Prefixed attributes other than xml:/xmlns: are expected to make REXML
      # reject the document.
      # NOTE(review): presumably because the prefix is not declared on the
      # wrapper <div> -- confirm.
      rexmloutput = attribute_name.include?(':') && !(attribute_name =~ /^xml(ns)?:/) ? ILL_FORMED_XHTML : output

      check_sanitization(input, htmloutput, output, rexmloutput)
    end
  end

  # One test per allowed attribute, spelled in upper case: the attribute is
  # expected to be removed.  Any name containing ':' is expected to be rejected
  # by REXML.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input       = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output      = "<p>foo <bad>bar</bad> baz</p>"
      rexmloutput = attribute_name.include?(':') ? ILL_FORMED_XHTML : output
      check_sanitization(input, output, output, rexmloutput)
    end
  end

  # One test per allowed protocol: an href consisting of the protocol name is
  # expected to be kept (re-quoted with single quotes).
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input  = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Same as above with the protocol name in upper case.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input  = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Four tests for each SVG element that may carry a local xlink:href and is
  # also an allowed element: local references (with and without a leading
  # newline) are expected to be kept, non-local ones removed.  The REXML
  # pipeline is expected to report ill-formed input in every case.
  HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
    next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)

    define_method "test_#{tag_name}_should_allow_local_href" do
      input       = %(<#{tag_name} xlink:href="#foo"/>)
      output      = "<#{tag_name.downcase} xlink:href='#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
      rexmloutput = ILL_FORMED_XHTML
      check_sanitization(input, output, xhtmloutput, rexmloutput)
    end

    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
      input       = %(<#{tag_name} xlink:href="\n#foo"/>)
      output      = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
      rexmloutput = ILL_FORMED_XHTML
      check_sanitization(input, output, xhtmloutput, rexmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
      input       = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
      output      = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      rexmloutput = ILL_FORMED_XHTML
      check_sanitization(input, output, xhtmloutput, rexmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
      input       = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
      output      = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      rexmloutput = ILL_FORMED_XHTML
      check_sanitization(input, output, xhtmloutput, rexmloutput)
    end
  end

  # Characters outside the Basic Multilingual Plane must pass through all three
  # pipelines; the expected strings spell them as UTF-8 octal byte escapes.
  def test_should_handle_astral_plane_characters
    # NOTE(review): this input holds literal non-ASCII characters while the
    # expectations use byte escapes -- confirm the file's source encoding (and
    # whether numeric character references were intended here).
    input  = "<p>𝒵 𝔸</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input  = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4 (Netscape 4). Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # One test per entry of each JSON fixture file.  An entry supplies 'name',
  # 'input' and 'output'; the optional 'xhtml' and 'rexml' keys override the
  # expectation for the respective pipeline and default to 'output'.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the handle once the content is read, and unlike
    # Kernel#open it never interprets the path as a command.
    JSON::parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
|