#!/usr/bin/env ruby require File.join(File.dirname(__FILE__), 'preamble') require 'html5lib/html5parser' require 'html5lib/liberalxmlparser' require 'html5lib/treewalkers' require 'html5lib/serializer' require 'html5lib/sanitizer' class SanitizeTest < Test::Unit::TestCase include HTML5lib def sanitize_xhtml stream XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s end def sanitize_html stream HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s end def sanitize_rexml stream require 'rexml/document' doc = REXML::Document.new("
#{stream}
") tokens = TreeWalkers.getTreeWalker('rexml').new(doc) HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :quote_attr_values => true, :quote_char => "'", :minimize_boolean_attributes => false, :use_trailing_solidus => true, :omit_optional_tags => false, :inject_meta_charset => false, :sanitize => true}).gsub(/^
(.*)<\/div>$/, '\1') rescue return "Ill-formed XHTML!" end def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput) assert_equal htmloutput, sanitize_html(input) assert_equal xhtmloutput, sanitize_xhtml(input) assert_equal rexmloutput, sanitize_rexml(input) end HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_allow_#{tag_name}_tag" do input = "<#{tag_name} title='1'>foo bar baz" htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz" xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz" rexmloutput = xhtmloutput if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name) htmloutput = "foo <bad>bar</bad> baz" xhtmloutput = htmloutput elsif tag_name == 'col' htmloutput = "foo <bad>bar</bad> baz" xhtmloutput = htmloutput rexmloutput = "" elsif tag_name == 'table' htmloutput = "foo <bad>bar</bad>baz
" xhtmloutput = htmloutput elsif tag_name == 'image' htmloutput = "foo <bad>bar</bad> baz" xhtmloutput = htmloutput rexmloutput = "foo <bad>bar</bad> baz" elsif VOID_ELEMENTS.include?(tag_name) htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz" xhtmloutput = htmloutput rexmloutput = "<#{tag_name} title='1' />" end check_sanitization(input, htmloutput, xhtmloutput, rexmloutput) end end HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_forbid_#{tag_name.upcase}_tag" do input = "<#{tag_name.upcase} title='1'>foo bar baz" output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>" check_sanitization(input, output, output, output) end end HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| next if attribute_name == 'style' define_method "test_should_allow_#{attribute_name}_attribute" do input = "

foo bar baz

" output = "

foo <bad>bar</bad> baz

" htmloutput = "

foo <bad>bar</bad> baz

" check_sanitization(input, htmloutput, output, output) end end HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do input = "

foo bar baz

" output = "

foo <bad>bar</bad> baz

" check_sanitization(input, output, output, output) end end HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol| define_method "test_should_allow_#{protocol}_uris" do input = %(foo) output = "foo" check_sanitization(input, output, output, output) end end HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol| define_method "test_should_allow_uppercase_#{protocol}_uris" do input = %(foo) output = "foo" check_sanitization(input, output, output, output) end end def test_should_allow_anchors input = "" output = "<script>baz</script>" check_sanitization(input, output, output, output) end # RFC 3986, sec 4.2 def test_allow_colons_in_path_component input = "foo" output = "foo" check_sanitization(input, output, output, output) end %w(src width height alt).each do |img_attr| define_method "test_should_allow_image_#{img_attr}_attribute" do input = "" output = "" rexmloutput = "" check_sanitization(input, output, output, rexmloutput) end end def test_should_handle_non_html input = 'abc' output = 'abc' check_sanitization(input, output, output, output) end def test_should_handle_blank_text input = '' output = '' check_sanitization(input, output, output, output) end [%w(img src), %w(a href)].each do |(tag, attr)| close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo" xclose = VOID_ELEMENTS.include?(tag) ? " />" : ">boo" input = %(<#{tag} #{attr}="javascript:XSS" title="1">boo) output = %(<#{tag} title='1'#{close}) rexmloutput = %(<#{tag} title='1'#{xclose}) define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do check_sanitization(input, output, output, rexmloutput) end define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do input = %(<#{tag} #{attr}=" javascript:XSS" title="1">boo) output = %(<#{tag} title='1'#{close}) rexmloutput = %(<#{tag} title='1'#{xclose}) check_sanitization(input, output, output, rexmloutput) end end [%(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %(), %()].each_with_index do |img_hack, i| define_method "test_should_not_fall_for_xss_image_hack_#{i}" do output = "" rexmloutput = "" rexmloutput = "Ill-formed XHTML!" if i == 1 check_sanitization(img_hack, output, output, rexmloutput) end end def test_should_sanitize_tag_broken_up_by_null input = %(alert(\"XSS\")) output = "<scr\357\277\275ipt>alert(\"XSS\")</scr\357\277\275ipt>" rexmloutput = "Ill-formed XHTML!" check_sanitization(input, output, output, rexmloutput) end def test_should_sanitize_invalid_script_tag input = %() output = "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>" rexmloutput = "Ill-formed XHTML!" check_sanitization(input, output, output, rexmloutput) end def test_should_sanitize_script_tag_with_multiple_open_brackets input = %(<) output = "<<script>alert(\"XSS\");//<</script>" rexmloutput = "Ill-formed XHTML!" check_sanitization(input, output, output, rexmloutput) input = %(