#!/usr/bin/env ruby
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib
# Runs +stream+ through XHTMLParser.parseFragment with HTMLSanitizer as the
# tokenizer, concatenates the resulting pieces and replaces every single quote
# with a double quote.
def sanitize_xhtml stream
  pieces = XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  markup = pieces.join('')
  markup.gsub(/'/, '"')
end
# Runs +stream+ through HTMLParser.parseFragment with HTMLSanitizer as the
# tokenizer, concatenates the resulting pieces and replaces every single quote
# with a double quote.
def sanitize_html stream
  pieces = HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  markup = pieces.join('')
  markup.gsub(/'/, '"')
end
# Loads rexml/document on demand and starts building a REXML::Document from a
# string literal.
# NOTE(review): this body looks truncated. The string opened on the
# REXML::Document.new line is never closed inside the method, and the next line
# reads like the tail of a gsub pattern that strips a wrapping div. Restore the
# missing lines from the upstream html5lib test before relying on this helper.
def sanitize_rexml stream
require 'rexml/document'
doc = REXML::Document.new("
(.*)<\/div>$/, '\1')
end
# Generates one test_should_allow_<tag>_tag method for each entry in
# HTMLSanitizer::ALLOWED_ELEMENTS.
# NOTE(review): the multi-line string literals below look like they lost markup
# (the inputs contain no "bad" element although the expected values do, and the
# closing tags are missing their leading "</"). Verify against the upstream
# test before trusting these expectations.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
# Table-structure and option-related tags are skipped for now (marked TODO).
next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
define_method "test_should_allow_#{tag_name}_tag" do
# 'image' and void elements are only run through sanitize_html.
if tag_name == 'image'
assert_equal "
foo <bad>bar</bad> baz",
sanitize_html("<#{tag_name} title='1'>foo
bar baz#{tag_name}>")
elsif VOID_ELEMENTS.include?(tag_name)
assert_equal "<#{tag_name} title=\"1\"/>foo <bad>bar</bad> baz",
sanitize_html("<#{tag_name} title='1'>foo
bar baz#{tag_name}>")
else
# Every other tag is checked through the HTML, XHTML and REXML helpers; only
# the HTML expectation uses the down-cased tag name.
assert_equal "<#{tag_name.downcase} title=\"1\">foo <bad>bar</bad> baz#{tag_name.downcase}>",
sanitize_html("<#{tag_name} title='1'>foo
bar baz#{tag_name}>")
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz#{tag_name}>",
sanitize_xhtml("<#{tag_name} title='1'>foo
bar baz#{tag_name}>")
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz#{tag_name}>",
sanitize_rexml("<#{tag_name} title='1'>foo
bar baz#{tag_name}>")
end
end
end
# Generates one test_should_forbid_<TAG>_tag method per allowed element, feeding
# the upper-cased tag name to sanitize_html and sanitize_rexml.
# NOTE(review): the literals look markup-stripped (the inputs have no "bad"
# element and no "</" on the closing tag; the expected values were presumably
# entity-escaped originally). Confirm against upstream.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
sanitize_html("<#{tag_name.upcase} title='1'>foo
bar baz#{tag_name.upcase}>")
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
sanitize_rexml("<#{tag_name.upcase} title='1'>foo
bar baz#{tag_name.upcase}>")
end
end
# Generates one test_should_allow_<attribute>_attribute method for each entry
# in HTMLSanitizer::ALLOWED_ATTRIBUTES except 'style', checking sanitize_html,
# sanitize_xhtml and sanitize_rexml in turn.
# NOTE(review): attribute_name is no longer used inside the literals below and
# the inputs do not contain the "bad" element the expectations mention, so the
# markup that carried the attribute was presumably stripped. Verify upstream.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
next if attribute_name == 'style'
define_method "test_should_allow_#{attribute_name}_attribute" do
assert_equal "
foo <bad>bar</bad> baz
",
sanitize_html("
foo bar baz
")
assert_equal "
foo <bad>bar</bad> baz
",
sanitize_xhtml("
foo bar baz
")
assert_equal "
foo <bad>bar</bad> baz
",
sanitize_rexml("
foo bar baz
")
end
end
# Generates one test_should_forbid_<ATTRIBUTE>_attribute method per allowed
# attribute (name upper-cased), checking sanitize_html and sanitize_rexml.
# NOTE(review): attribute_name only appears in the method name; the literals
# below look markup-stripped, so the upper-cased attribute is never actually
# passed to the sanitizer here. Verify against upstream.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
assert_equal "
foo <bad>bar</bad> baz
",
sanitize_html("
foo bar baz
")
assert_equal "
foo <bad>bar</bad> baz
",
sanitize_rexml("
foo bar baz
")
end
end
# Generates one test_should_allow_<protocol>_uris method for each entry in
# HTMLSanitizer::ALLOWED_PROTOCOLS, checking sanitize_html and sanitize_rexml.
# NOTE(review): protocol only appears in the method name; the link markup that
# presumably carried the URI is missing from the literals. Verify upstream.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_#{protocol}_uris" do
assert_equal "
foo",
sanitize_html(%(
foo))
assert_equal "
foo",
sanitize_rexml(%(
foo))
end
end
# Generates one test_should_allow_uppercase_<protocol>_uris method for each
# entry in HTMLSanitizer::ALLOWED_PROTOCOLS, checking sanitize_html and
# sanitize_rexml.
# NOTE(review): as written the body is identical to the lower-case variant and
# never upper-cases anything; the link markup looks stripped. Verify upstream.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
assert_equal "
foo",
sanitize_html(%(
foo))
assert_equal "
foo",
sanitize_rexml(%(
foo))
end
end
# Compares the output of sanitize_html and sanitize_rexml against the same
# expected string containing a script element.
# NOTE(review): the inputs are reduced to a bare newline and the expected value
# has no anchor element, so the markup was presumably stripped. Verify upstream.
def test_should_allow_anchors
assert_equal "
<script>baz</script>",
sanitize_html("
")
assert_equal "
<script>baz</script>",
sanitize_rexml("
")
end
# RFC 3986, sec 4.2
# Checks that sanitize_html and sanitize_rexml return the input unchanged.
# NOTE(review): no colon-bearing path is visible in the literals below; the
# link markup was presumably stripped. Verify against upstream.
def test_allow_colons_in_path_component
assert_equal "
foo",
sanitize_html("
foo")
assert_equal "
foo",
sanitize_rexml("
foo")
end
# Generates one test_should_allow_image_<attr>_attribute method for each of
# src, width, height and alt, checking sanitize_html and sanitize_rexml.
# NOTE(review): img_attr only appears in the method name; the img markup is
# missing from the literals, which now hold just a newline. Verify upstream.
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_equal "
",
sanitize_html("
")
assert_equal "
",
sanitize_rexml("
")
end
end
# Plain text without any markup must come back unchanged from both the HTML
# and the REXML sanitizing helpers.
def test_should_handle_non_html
  [:sanitize_html, :sanitize_rexml].each do |sanitizer|
    assert_equal 'abc', send(sanitizer, "abc")
  end
end
# An empty string must come back as an empty string from both the HTML and the
# REXML sanitizing helpers.
def test_should_handle_blank_text
  [:sanitize_html, :sanitize_rexml].each do |sanitizer|
    assert_equal '', send(sanitizer, '')
  end
end
# For the (img, src) and (a, href) pairs, generates two tests each: one with a
# javascript: URI in the attribute and one with the same URI preceded by a
# space. Both expect the attribute to be dropped while title="1" is kept.
# NOTE(review): the closing tags in these literals are missing their leading
# "</", which looks like markup stripping. Verify against upstream.
[%w(img src), %w(a href)].each do |(tag, attr)|
# Expected tail of the element: void elements self-close, others wrap "boo".
# close is used for the sanitize_html expectation, xclose for sanitize_rexml.
close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo#{tag}>"
xclose = VOID_ELEMENTS.include?(tag) ? " />" : ">boo#{tag}>"
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo#{tag}>))
assert_equal %(<#{tag} title="1"#{xclose}), sanitize_rexml(%(<#{tag} #{attr}="javascript:XSS" title="1">boo#{tag}>))
end
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo#{tag}>))
assert_equal %(<#{tag} title="1"#{xclose}), sanitize_rexml(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo#{tag}>))
end
end
# Generates test_should_not_fall_for_xss_image_hack_<i> for each of the fifteen
# payload strings below, asserting the result of sanitize_html on each one.
# NOTE(review): every payload is now just a newline and so is the expected
# value; the img-based XSS vectors were presumably stripped from this copy.
# Restore them from upstream, otherwise all fifteen tests check the same thing.
[%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
)].each_with_index do |img_hack, i|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
assert_equal "
", sanitize_html(img_hack)
end
end
# Asserts the sanitize_html output for a script tag whose name is split by a
# NUL byte. In the expected value, \357\277\275 is the UTF-8 byte sequence for
# U+FFFD (the replacement character).
# NOTE(review): the input literal no longer contains the script tag or the NUL
# byte, and the expected value was presumably entity-escaped originally.
# Verify against upstream.
def test_should_sanitize_tag_broken_up_by_null
assert_equal "<scr\357\277\275ipt>alert(\"XSS\")</scr\357\277\275ipt>", sanitize_html(%(
alert(\"XSS\")))
end
# Asserts the sanitize_html output for a malformed script tag that carries XSS
# and SRC attributes.
# NOTE(review): the input literal is empty here, so the malformed tag was
# presumably stripped from this copy, and the expected value was presumably
# entity-escaped originally. Verify against upstream.
def test_should_sanitize_invalid_script_tag
assert_equal "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>", sanitize_html(%())
end
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<))
assert_equal %(<iframe src=\"http://ha.ckers.org/scriptlet.html\"><), sanitize_html(%(