diff --git a/attic/lib/sanitize.rb b/attic/lib/sanitize.rb index 11f65b5f..77b71863 100644 --- a/attic/lib/sanitize.rb +++ b/attic/lib/sanitize.rb @@ -1,207 +1,262 @@ +# == Introduction +# +# This module provides sanitization of XHTML+MathML+SVG +# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html]. +# +# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should +# resemble that of browsers. +# +# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML +# sanitize_html() is a case-insensitive sanitizer suitable for HTML +# sanitize_rexml() sanitizes a REXML tree, returning a string +# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML +# by running the output of sanitize_xhtml() through REXML +# +# == Files +# +# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb], +# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/] +# +# == Author +# +# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/] +# +# == License +# +# Ruby License + module Sanitize -# This module provides sanitization of XHTML+MathML+SVG -# and of inline style attributes. + require 'html5/html5parser' + require 'html5/liberalxmlparser' + require 'html5/treewalkers' + require 'html5/treebuilders' + require 'html5/serializer' + require 'html5/sanitizer' + require 'stringsupport.rb' + + include HTML5 + +# Sanitize a string, parsed using XHTML parsing rules. # -# Based heavily on Sam Ruby's code in the Universal FeedParser. - - require 'html/tokenizer' - require 'node' - - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', - 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', - 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', - 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', - 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', - 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', - 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', - 'ul', 'var'] - - mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', - 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', - 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', - 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', - 'munderover', 'none'] - - svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', - 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', - 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', - 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', - 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', - 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', - 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', - 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', - 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', - 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', - 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', - 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', - 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title', - 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] - - - mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', - 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', - 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', - 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', - 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', - 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', - 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', - 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', - 'xlink:type', 'xmlns', 'xmlns:xlink'] - - - svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', - 'arabic-form', 'ascent', 'attributeName', 'attributeType', - 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', - 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', - 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', - 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant', - 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', - 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', - 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes', - 'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight', - 'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name', - 'offset', 'opacity', 'orient', 'origin', 'overline-position', - 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', - 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur', - 'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx', - 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity', - 'strikethrough-position', 'strikethrough-thickness', 'stroke', - 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', - 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', - 'stroke-width', 'systemLanguage', 'target', - 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', - 'underline-position', 'underline-thickness', 'unicode', - 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox', - 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2', - 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role', - 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang', - 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] - - attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href'] +# :call-seq: +# sanitize_xhtml(string) -> string +# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# By default, the output is a string. But, optionally, you can return a REXML tree. +# +# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. +# (REXML trees are always utf-8 encoded.) + def sanitize_xhtml(html, options = {}) + @encoding = 'utf-8' + @treebuilder = TreeBuilders::REXML::TreeBuilder + @to_tree = false + options.each do |name, value| + next unless %w(encoding treebuilder to_tree).include? name.to_s + if name.to_s == 'treebuilder' + @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value) + else + instance_variable_set("@#{name}", value) + end + end + if @encoding == 'utf-8' + parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer, + :lowercase_element_name => false, :lowercase_attr_name => false, + :encoding => @encoding, :tree => @treebuilder }) + else + parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :lowercase_element_name => false, :lowercase_attr_name => false, + :encoding => @encoding, :tree => @treebuilder }) + end + return parsed if @to_tree + return parsed.to_s + end - acceptable_css_properties = ['azimuth', 'background-color', - 'border-bottom-color', 'border-collapse', 'border-color', - 'border-left-color', 'border-right-color', 'border-top-color', 'clear', - 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', - 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', - 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', - 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', - 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', - 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', - 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', - 'white-space', 'width'] +# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to +# ensure well-formedness. +# +# :call-seq: +# safe_sanitize_xhtml(string) -> string +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# +# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. +# (REXML trees are always utf-8 encoded.) + def safe_sanitize_xhtml(html, options = {}) + options[:to_tree] = false + sanitized = sanitize_xhtml(html, options) + doc = REXML::Document.new("
#{sanitized}
") + sanitized = doc.to_s.gsub(/\A
(.*)<\/div>\Z/m, '\1') + rescue REXML::ParseException + sanitized = sanitized.escapeHTML + end - acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', - 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', - 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', - 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', - 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', - 'transparent', 'underline', 'white', 'yellow'] +# Sanitize a string, parsed using HTML parsing rules. +# +# :call-seq: +# sanitize_html( string ) -> string +# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# By default, the output is a string. But, optionally, you can return a REXML tree. +# +# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. +# (REXML trees are always utf-8 encoded.) + def sanitize_html(html, options = {}) + @encoding = 'utf-8' + @treebuilder = TreeBuilders::REXML::TreeBuilder + @to_tree = false + options.each do |name, value| + next unless %w(encoding treebuilder to_tree).include? name.to_s + if name.to_s == 'treebuilder' + @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value) + else + instance_variable_set("@#{name}", value) + end + end + if @encoding == 'utf-8' + parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + else + parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + end + return parsed if @to_tree + return parsed.to_s + end - acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', - 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', - 'stroke-opacity'] +# Sanitize a REXML tree. The output is a string. +# +# :call-seq: +# sanitize_rexml(tree) -> string +# + def sanitize_rexml(tree) + tokens = TreeWalkers.get_tree_walker('rexml2').new(tree) + XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8', + :space_before_trailing_solidus => true, + :inject_meta_charset => false, + :sanitize => true}) + end +end - acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc', - 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', - 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs' ] +require 'rexml/element' +module REXML #:nodoc: + class Element - ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS) - ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES) - ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES) - ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS) - ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES) - ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS) - ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI) +# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References +# +# :call-seq: +# tree.to_ncr -> REXML::Element +# +# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you +# access the resulting REXML document. +# +# Note that this method needs to traverse the entire tree, converting text nodes and attributes +# for each element. This can be SLOW. It will often be faster to serialize to a string and then +# use String.to_ncr instead. +# + def to_ncr + self.each_element { |el| + el.texts.each_index {|i| + el.texts[i].value = el.texts[i].to_s.to_ncr + } + el.attributes.each { |name,val| + el.attributes[name] = val.to_ncr + } + el.to_ncr if el.has_elements? + } + return self + end + +# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8 +# +# :call-seq: +# tree.to_utf8 -> REXML::Element +# +# Note that this method needs to traverse the entire tree, converting text nodes and attributes +# for each element. This can be SLOW. It will often be faster to serialize to a string and then +# use String.to_utf8 instead. +# + def to_utf8 + self.each_element { |el| + el.texts.each_index {|i| + el.texts[i].value = el.texts[i].to_s.to_utf8 + } + el.attributes.each { |name,val| + el.attributes[name] = val.to_utf8 + } + el.to_utf8 if el.has_elements? + } + return self + end - # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all - # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set, - # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. - # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in - # ALLOWED_PROTOCOLS are allowed. - # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. - # - # sanitize_html('') - # => <script> do_nasty_stuff() </script> - # sanitize_html('Click here for $100') - # => Click here for $100 - def sanitize_html(html) - if html.index("<") - tokenizer = HTML::Tokenizer.new(html) - new_text = "" + end +end - while token = tokenizer.next - node = XHTML::Node.parse(nil, 0, 0, token, false) - new_text << case node.tag? - when true - if ALLOWED_ELEMENTS.include?(node.name) - if node.closing != :close - node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) } - ATTR_VAL_IS_URI.each do |attr| - val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase - if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) - node.attributes.delete attr - end - end - if node.attributes['style'] - node.attributes['style'] = sanitize_css(node.attributes['style']) - end - end - node.to_s - else - node.to_s.gsub(/#{string}
") + end + + def my_rex(string) + sanitize_rexml(rexml_doc(string.to_utf8)).gsub(/\A
(.*)<\/div>\Z/m, '\1') + end + + def test_sanitize_named_entities + input = '

Greek &phis; φ, double-struck 𝔸, numeric 𝔸 ⁗, uppercase ™ <

' + output = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227, uppercase \342\204\242 <

" + output2 = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric 𝔸 ⁗, uppercase \342\204\242 <

" + assert_equal(output, sanitize_xhtml(input)) + assert_equal(output, sanitize_html(input)) + assert_equal(output, my_rex(input)) + assert_equal(output2, input.to_utf8) + end + + def test_sanitize_malformed_utf8 + input = "

\357elephant & \302ivory

" + output = "

\357\277\275elephant & \357\277\275ivory

" + check_sanitization(input, output, output, output) + end + + Sanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_allow_#{tag_name}_tag" do - assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz", - sanitize_html("<#{tag_name} title='1'>foo bar baz") + input = "<#{tag_name} title='1'>foo bar baz" + htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz" + xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz" + rexmloutput = xhtmloutput + + if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name) + htmloutput = "foo <bad>bar</bad> baz" + xhtmloutput = htmloutput + elsif tag_name == 'col' + htmloutput = "foo <bad>bar</bad> baz" + xhtmloutput = htmloutput + rexmloutput = "" + elsif tag_name == 'table' + htmloutput = "foo <bad>bar</bad>baz
" + xhtmloutput = htmloutput + elsif tag_name == 'image' + htmloutput = "foo <bad>bar</bad> baz" + xhtmloutput = htmloutput + rexmloutput = "foo <bad>bar</bad> baz" + elsif VOID_ELEMENTS.include?(tag_name) + htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz" + xhtmloutput = htmloutput + htmloutput += '
' if tag_name == 'br' + rexmloutput = "<#{tag_name} title='1' />" + end + check_sanitization(input, xhtmloutput, xhtmloutput, rexmloutput) end end - Sanitize::ALLOWED_ELEMENTS.each do |tag_name| + Sanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_forbid_#{tag_name.upcase}_tag" do - assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>", - sanitize_html("<#{tag_name.upcase} title='1'>foo bar baz") + input = "<#{tag_name.upcase} title='1'>foo bar baz" + output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>" + xhtmloutput = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>" + check_sanitization(input, output, xhtmloutput, output) end end - Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name| - if attribute_name != 'style' - define_method "test_should_allow_#{attribute_name}_attribute" do - assert_equal "

foo <bad>bar</bad> baz

", - sanitize_html("

foo bar baz

") + Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| + next if attribute_name == 'style' || attribute_name.include?(':') + define_method "test_should_allow_#{attribute_name}_attribute" do + input = "

foo bar baz

" + output = "

foo <bad>bar</bad> baz

" + htmloutput = "

foo <bad>bar</bad> baz

" + check_sanitization(input, output, output, output) + end + end + + Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name| + define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do + input = "

foo bar baz

" + output = "

foo <bad>bar</bad> baz

" + check_sanitization(input, output, output, output) + end + end + + Sanitizer::ALLOWED_PROTOCOLS.each do |protocol| + define_method "test_should_allow_#{protocol}_uris" do + input = %(foo) + output = "foo" + check_sanitization(input, output, output, output) + end + end + + Sanitizer::ALLOWED_PROTOCOLS.each do |protocol| + define_method "test_should_allow_uppercase_#{protocol}_uris" do + input = %(foo) + output = "foo" + check_sanitization(input, output, output, output) + end + end + + Sanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name| + next unless Sanitizer::ALLOWED_ELEMENTS.include?(tag_name) + define_method "test_#{tag_name}_should_allow_local_href_with_ns_decl" do + input = %(<#{tag_name} xlink:href="#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>) + output = "<#{tag_name.downcase} xlink:href='#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>" + xhtmloutput = "<#{tag_name} xlink:href='#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + + define_method "test_#{tag_name}_should_allow_local_href_with_newline_and_ns_decl" do + input = %(<#{tag_name} xlink:href="\n#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>) + output = "<#{tag_name.downcase} xlink:href='\n#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>" + xhtmloutput = "<#{tag_name} xlink:href='\n#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + + define_method "test_#{tag_name}_should_forbid_local_href_without_ns_decl" do + input = %(<#{tag_name} xlink:href="#foo"/>) + output = "<#{tag_name.downcase} xlink:href='#foo'/>" + xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + + define_method "test_#{tag_name}_should_forbid_local_href_with_newline_without_ns_decl" do + input = %(<#{tag_name} xlink:href="\n#foo"/>) + output = "<#{tag_name.downcase} xlink:href='\n#foo'/>" + xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + + define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_ns_decl" do + input = %(<#{tag_name} xlink:href="http://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>) + output = "<#{tag_name.downcase} xmlns:xlink='http://www.w3.org/1999/xlink'/>" + xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + + define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline_and_ns_decl" do + input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>) + output = "<#{tag_name.downcase} xmlns:xlink='http://www.w3.org/1999/xlink'/>" + xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>" + check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput) + end + end + + def test_should_handle_astral_plane_characters + input = "

𝒵 𝔸

" + output = "

\360\235\222\265 \360\235\224\270

" + check_sanitization(input, output, output, output) + + input = "

\360\235\224\270 a

" + output = "

\360\235\224\270 a

" + check_sanitization(input, output, output, output) + end + + JSON::parse(open(File.expand_path(File.join(File.dirname(__FILE__), '/../sanitizer.dat'))).read).each do |test| + define_method "test_#{test['name']}" do + check_sanitization( + test['input'], + test['output'], + test['xhtml'] || test['output'], + test['rexml'] || test['output'] + ) end end - end - - Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name| - define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do - assert_equal "

foo <bad>bar</bad> baz

", - sanitize_html("

foo bar baz

") - end - end - - Sanitize::ALLOWED_PROTOCOLS.each do |protocol| - define_method "test_should_allow_#{protocol}_uris" do - assert_equal "foo", - sanitize_html(%(foo)) - end - end - - Sanitize::ALLOWED_PROTOCOLS.each do |protocol| - define_method "test_should_allow_uppercase_#{protocol}_uris" do - assert_equal "foo", - sanitize_html(%(foo)) - end - end - - def test_should_allow_anchors - assert_equal "<script>baz</script>", - sanitize_html("") - end - - # RFC 3986, sec 4.2 - def test_allow_colons_in_path_component - assert_equal "foo", - sanitize_html("foo") - end - - %w(src width height alt).each do |img_attr| - define_method "test_should_allow_image_#{img_attr}_attribute" do - assert_equal "", - sanitize_html("") - end - end - - def test_should_handle_non_html - assert_equal 'abc', sanitize_html("abc") - end - - def test_should_handle_blank_text - assert_equal '', sanitize_html('') - end - - [%w(img src), %w(a href)].each do |(tag, attr)| - define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do - assert_equal %(<#{tag} title="1">boo), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo)) - end - end - - [%w(img src), %w(a href)].each do |(tag, attr)| - define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do - assert_equal %(<#{tag} title="1">boo), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo)) - end - end - - [%(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %(), - %()].each_with_index do |img_hack, i| - define_method "test_should_not_fall_for_xss_image_hack_#{i}" do - assert_equal "", sanitize_html(img_hack) - end - end - - def test_should_sanitize_tag_broken_up_by_null - assert_equal "<scr>alert(\"XSS\")</scr>", sanitize_html(%(alert(\"XSS\"))) - end - - def test_should_sanitize_invalid_script_tag - assert_equal "<script /></script>", sanitize_html(%()) - end - - def test_should_sanitize_script_tag_with_multiple_open_brackets - assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<)) - assert_equal %(<iframe src="http:" /><), sanitize_html(%(