#!/usr/bin/env ruby require File.join(File.dirname(__FILE__), 'preamble') require 'html5lib/html5parser' require 'html5lib/liberalxmlparser' require 'html5lib/treewalkers' require 'html5lib/serializer' require 'html5lib/sanitizer' class SanitizeTest < Test::Unit::TestCase include HTML5lib def sanitize_xhtml stream XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"') end def sanitize_html stream HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"') end def sanitize_rexml stream require 'rexml/document' doc = REXML::Document.new("
#{stream}
") tokens = TreeWalkers.getTreeWalker('rexml').new(doc) HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :quote_attr_values => true, :minimize_boolean_attributes => false, :use_trailing_solidus => true, :omit_optional_tags => false, :inject_meta_charset => false, :sanitize => true}).gsub(/^
(.*)<\/div>$/, '\1') end HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name| next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO define_method "test_should_allow_#{tag_name}_tag" do if tag_name == 'image' assert_equal "foo <bad>bar</bad> baz", sanitize_html("<#{tag_name} title='1'>foo bar baz") elsif VOID_ELEMENTS.include?(tag_name) assert_equal "<#{tag_name} title=\"1\"/>foo <bad>bar</bad> baz", sanitize_html("<#{tag_name} title='1'>foo bar baz") else assert_equal "<#{tag_name.downcase} title=\"1\">foo <bad>bar</bad> baz", sanitize_html("<#{tag_name} title='1'>foo bar baz") assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz", sanitize_xhtml("<#{tag_name} title='1'>foo bar baz") assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz", sanitize_rexml("<#{tag_name} title='1'>foo bar baz") end end end HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_forbid_#{tag_name.upcase}_tag" do assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>", sanitize_html("<#{tag_name.upcase} title='1'>foo bar baz") assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>", sanitize_rexml("<#{tag_name.upcase} title='1'>foo bar baz") end end HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| next if attribute_name == 'style' define_method "test_should_allow_#{attribute_name}_attribute" do assert_equal "

foo <bad>bar</bad> baz

", sanitize_html("

foo bar baz

") assert_equal "

foo <bad>bar</bad> baz

", sanitize_xhtml("

foo bar baz

") assert_equal "

foo <bad>bar</bad> baz

", sanitize_rexml("

foo bar baz

") end end HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do assert_equal "

foo <bad>bar</bad> baz

", sanitize_html("

foo bar baz

") assert_equal "

foo <bad>bar</bad> baz

", sanitize_rexml("

foo bar baz

") end end HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol| define_method "test_should_allow_#{protocol}_uris" do assert_equal "foo", sanitize_html(%(foo)) assert_equal "foo", sanitize_rexml(%(foo)) end end HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol| define_method "test_should_allow_uppercase_#{protocol}_uris" do assert_equal "foo", sanitize_html(%(foo)) assert_equal "foo", sanitize_rexml(%(foo)) end end def test_should_allow_anchors assert_equal "<script>baz</script>", sanitize_html("") assert_equal "<script>baz</script>", sanitize_rexml("") end # RFC 3986, sec 4.2 def test_allow_colons_in_path_component assert_equal "foo", sanitize_html("foo") assert_equal "foo", sanitize_rexml("foo") end %w(src width height alt).each do |img_attr| define_method "test_should_allow_image_#{img_attr}_attribute" do assert_equal "", sanitize_html("") assert_equal "", sanitize_rexml("") end end def test_should_handle_non_html assert_equal 'abc', sanitize_html("abc") assert_equal 'abc', sanitize_rexml("abc") end def test_should_handle_blank_text assert_equal '', sanitize_html('') assert_equal '', sanitize_rexml('') end [%w(img src), %w(a href)].each do |(tag, attr)| close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo" xclose = VOID_ELEMENTS.include?(tag) ? " />" : ">boo" define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo)) assert_equal %(<#{tag} title="1"#{xclose}), sanitize_rexml(%(<#{tag} #{attr}="javascript:XSS" title="1">boo)) end define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo)) assert_equal %(<#{tag} title="1"#{xclose}), sanitize_rexml(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo)) end end [%(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %()].each_with_index do |img_hack, i| define_method "test_should_not_fall_for_xss_image_hack_#{i}" do assert_equal "", sanitize_html(img_hack) end end def test_should_sanitize_tag_broken_up_by_null assert_equal "<scr\357\277\275ipt>alert(\"XSS\")</scr\357\277\275ipt>", sanitize_html(%(alert(\"XSS\"))) end def test_should_sanitize_invalid_script_tag assert_equal "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>", sanitize_html(%()) end def test_should_sanitize_script_tag_with_multiple_open_brackets assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<)) assert_equal %(<iframe src=\"http://ha.ckers.org/scriptlet.html\"><), sanitize_html(%(