Merge branch 'bzr/golem' of /Users/distler/Sites/code/instiki

This commit is contained in:
Jacques Distler 2009-11-30 16:35:46 -06:00
commit f23d892bf9
142 changed files with 519 additions and 843 deletions

View file

@ -1,207 +1,262 @@
# == Introduction
#
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers.
#
# * sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML.
# * sanitize_html() is a case-insensitive sanitizer, suitable for HTML.
# * sanitize_rexml() sanitizes a REXML tree, returning a string.
# * safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
#   by running the output of sanitize_xhtml() through REXML.
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# == License
#
# Ruby License
module Sanitize module Sanitize
# This module provides sanitization of XHTML+MathML+SVG require 'html5/html5parser'
# and of inline style attributes. require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/treebuilders'
require 'html5/serializer'
require 'html5/sanitizer'
require 'stringsupport.rb'
include HTML5
# Sanitize a string, parsed using XHTML parsing rules.
# #
# Based heavily on Sam Ruby's code in the Universal FeedParser. # :call-seq:
# sanitize_xhtml(string) -> string
require 'html/tokenizer' # sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
require 'node' #
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', # By default, the output is a string. But, optionally, you can return a REXML tree.
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', #
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', # (REXML trees are always utf-8 encoded.)
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', def sanitize_xhtml(html, options = {})
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', @encoding = 'utf-8'
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', @treebuilder = TreeBuilders::REXML::TreeBuilder
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', @to_tree = false
'ul', 'var'] options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', if name.to_s == 'treebuilder'
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', else
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', instance_variable_set("@#{name}", value)
'munderover', 'none'] end
end
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', if @encoding == 'utf-8'
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', :lowercase_element_name => false, :lowercase_attr_name => false,
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', :encoding => @encoding, :tree => @treebuilder })
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', else
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:lowercase_element_name => false, :lowercase_attr_name => false,
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', :encoding => @encoding, :tree => @treebuilder })
'action', 'align', 'alt', 'axis', 'border', 'cellpadding', end
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', return parsed if @to_tree
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', return parsed.to_s
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', end
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
'offset', 'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
'strikethrough-position', 'strikethrough-thickness', 'stroke',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target',
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
'underline-position', 'underline-thickness', 'unicode',
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
acceptable_css_properties = ['azimuth', 'background-color', # Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
'border-bottom-color', 'border-collapse', 'border-color', # ensure well-formedness.
'border-left-color', 'border-right-color', 'border-top-color', 'clear', #
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', # :call-seq:
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', # safe_sanitize_xhtml(string) -> string
'height', 'letter-spacing', 'line-height', 'overflow', 'pause', #
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', # Unless otherwise specified, the string is assumed to be utf-8 encoded.
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', #
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
'unicode-bidi', 'vertical-align', 'voice-family', 'volume', # (REXML trees are always utf-8 encoded.)
'white-space', 'width'] def safe_sanitize_xhtml(html, options = {})
options[:to_tree] = false
sanitized = sanitize_xhtml(html, options)
doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
sanitized = sanitized.escapeHTML
end
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', # Sanitize a string, parsed using HTML parsing rules.
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', #
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', # :call-seq:
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', # sanitize_html( string ) -> string
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', # sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
'transparent', 'underline', 'white', 'yellow'] #
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def sanitize_html(html, options = {})
  # Defaults. Each one may be overridden by the matching key in +options+;
  # String and Symbol keys are both accepted, any other key is ignored.
  @encoding = 'utf-8'
  @treebuilder = TreeBuilders::REXML::TreeBuilder
  @to_tree = false
  options.each do |name, value|
    case name.to_s
    when 'treebuilder'
      # Resolve the builder through the same TreeBuilders namespace that
      # supplies the default above, rather than through a separate HTML5lib
      # constant that nothing in this file defines.
      @treebuilder = TreeBuilders.get_tree_builder(value)
    when 'encoding'
      @encoding = value
    when 'to_tree'
      @to_tree = value
    end
  end

  # The two code paths differ only in how the input string is prepared:
  # String#to_utf8 for utf-8 input, String#to_ncr for every other encoding.
  source = @encoding == 'utf-8' ? html.to_utf8 : html.to_ncr
  parsed = HTMLParser.parse_fragment(source, {:tokenizer => HTMLSanitizer,
    :encoding => @encoding, :tree => @treebuilder })

  # Hand back the parsed tree itself when asked to, otherwise its string form.
  @to_tree ? parsed : parsed.to_s
end
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', # Sanitize a REXML tree. The output is a string.
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', #
'stroke-opacity'] # :call-seq:
# sanitize_rexml(tree) -> string
#
def sanitize_rexml(tree)
  # Walk the tree with the 'rexml2' tree walker and pass the resulting token
  # stream to the XHTML serializer with its :sanitize option switched on.
  walker_class = TreeWalkers.get_tree_walker('rexml2')
  serializer_options = {
    :encoding => 'utf-8',
    :space_before_trailing_solidus => true,
    :inject_meta_charset => false,
    :sanitize => true
  }
  XHTMLSerializer.serialize(walker_class.new(tree), serializer_options)
end
end
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc', require 'rexml/element'
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', module REXML #:nodoc:
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', class Element
'ssh', 'sftp', 'rtsp', 'afs' ]
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS) # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES) #
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES) # :call-seq:
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS) # tree.to_ncr -> REXML::Element
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES) #
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS) # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI) # access the resulting REXML document.
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_ncr instead.
#
def to_ncr
  each_element do |child|
    # Rewrite every text node of the child in place.
    child.texts.each { |text| text.value = text.to_s.to_ncr }
    # Rewrite every attribute value of the child.
    child.attributes.each do |attr_name, attr_value|
      child.attributes[attr_name] = attr_value.to_ncr
    end
    # Descend only when there is something below this child.
    child.to_ncr if child.has_elements?
  end
  self
end
# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
#
# :call-seq:
# tree.to_utf8 -> REXML::Element
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_utf8 instead.
#
def to_utf8
  each_element do |child|
    # Rewrite every text node of the child in place.
    child.texts.each { |text| text.value = text.to_s.to_utf8 }
    # Rewrite every attribute value of the child.
    child.attributes.each do |attr_name, attr_value|
      child.attributes[attr_name] = attr_value.to_utf8
    end
    # Descend only when there is something below this child.
    child.to_utf8 if child.has_elements?
  end
  self
end
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all end
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set, end
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_html(html)
if html.index("<")
tokenizer = HTML::Tokenizer.new(html)
new_text = ""
while token = tokenizer.next module HTML5 #:nodoc: all
node = XHTML::Node.parse(nil, 0, 0, token, false) module TreeWalkers
new_text << case node.tag?
when true
if ALLOWED_ELEMENTS.include?(node.name)
if node.closing != :close
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
node.attributes.delete attr
end
end
if node.attributes['style']
node.attributes['style'] = sanitize_css(node.attributes['style'])
end
end
node.to_s
else
node.to_s.gsub(/</, "&lt;")
end
else
node.to_s.gsub(/</, "&lt;")
end
end
html = new_text private
class << self
def [](name)
case name.to_s.downcase
when 'rexml'
require 'html5/treewalkers/rexml'
REXML::TreeWalker
when 'rexml2'
REXML2::TreeWalker
else
raise "Unknown TreeWalker #{name}"
end end
html
end end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet alias :get_tree_walker :[]
if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ end
style = ''
return style
end
if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
style = ''
return style
end
clean = [] module REXML2
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val| class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
clean << prop + ': ' + val + ';' private
elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
goodval = true def node_details(node)
val.split().each do |keyword| case node
if !ALLOWED_CSS_KEYWORDS.include?(keyword) and when ::REXML::Document
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/ [:DOCUMENT]
goodval = false when ::REXML::Element
end if !node.name
end [:DOCUMENT_FRAGMENT]
if goodval else
clean << prop + ': ' + val + ';' [:ELEMENT, node.name,
end node.attributes.map {|name,value| [name,value.to_utf8]},
elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase) node.has_elements? || node.has_text?]
clean << prop + ': ' + val + ';'
end end
when ::REXML::Text
[:TEXT, node.value.to_utf8]
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name, node.public, node.system]
when ::REXML::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end end
end
style = clean.join(' ') def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_sibling
end
def parent(node)
node.parent
end
end end
end end
end
end

View file

@ -1,187 +1,189 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper')) require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
require 'sanitize' require 'sanitize'
require 'json'
class SanitizeTest < Test::Unit::TestCase class SanitizeTest < Test::Unit::TestCase
include Sanitize include Sanitize
def setup def setup
end end
Sanitize::ALLOWED_ELEMENTS.each do |tag_name| def do_sanitize_xhtml stream
safe_sanitize_xhtml(stream)
end
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  # Only the first expected value is compared, against do_sanitize_xhtml;
  # xhtmloutput and rexmloutput are accepted but not checked here.
  actual = do_sanitize_xhtml(input)
  assert_equal(htmloutput, actual)
end
def rexml_doc(string)
  # Wrap the fragment in one XHTML-namespaced <div> so it parses as a single document.
  wrapped = "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>"
  REXML::Document.new(wrapped)
end
def my_rex(string)
  # Sanitize the wrapped fragment as a REXML tree, then peel the wrapper <div> back off.
  document = rexml_doc(string.to_utf8)
  serialized = sanitize_rexml(document)
  serialized.gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
end
def test_sanitize_named_entities
  source = '<p>Greek &phis; &phi;, double-struck &Aopf;, numeric &#x1D538; &#8279;, uppercase &TRADE; &LT;</p>'
  # Expected from all three sanitizers: named and numeric references both resolved.
  fully_resolved = "<p>Greek \317\225 \317\206, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227, uppercase \342\204\242 &lt;</p>"
  # Expected from String#to_utf8 alone: the numeric references are left as written.
  named_only = "<p>Greek \317\225 \317\206, double-struck \360\235\224\270, numeric &#x1D538; &#8279;, uppercase \342\204\242 &lt;</p>"

  assert_equal(fully_resolved, sanitize_xhtml(source))
  assert_equal(fully_resolved, sanitize_html(source))
  assert_equal(fully_resolved, my_rex(source))
  assert_equal(named_only, source.to_utf8)
end
def test_sanitize_malformed_utf8
  # The input carries two stray high bytes; each is expected to come back
  # as the three-byte sequence for U+FFFD (the replacement character).
  broken = "<p>\357elephant &AMP; \302ivory</p>"
  repaired = "<p>\357\277\275elephant &amp; \357\277\275ivory</p>"
  check_sanitization(broken, repaired, repaired, repaired)
end
Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_allow_#{tag_name}_tag" do define_method "test_should_allow_#{tag_name}_tag" do
assert_equal "<#{tag_name} title=\"1\">foo &lt;bad>bar&lt;/bad> baz</#{tag_name}>", input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>") htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
rexmloutput = xhtmloutput
if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
elsif tag_name == 'col'
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
rexmloutput = "<col title='1' />"
elsif tag_name == 'table'
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
xhtmloutput = htmloutput
elsif tag_name == 'image'
htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
elsif VOID_ELEMENTS.include?(tag_name)
htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
htmloutput += '<br/>' if tag_name == 'br'
rexmloutput = "<#{tag_name} title='1' />"
end
check_sanitization(input, xhtmloutput, xhtmloutput, rexmloutput)
end end
end end
Sanitize::ALLOWED_ELEMENTS.each do |tag_name| Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do define_method "test_should_forbid_#{tag_name.upcase}_tag" do
assert_equal "&lt;#{tag_name.upcase} title=\"1\">foo &lt;bad>bar&lt;/bad> baz&lt;/#{tag_name.upcase}>", input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>") output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
xhtmloutput = "&lt;#{tag_name.upcase} title='1'&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
check_sanitization(input, output, xhtmloutput, output)
end end
end end
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name| Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
if attribute_name != 'style' next if attribute_name == 'style' || attribute_name.include?(':')
define_method "test_should_allow_#{attribute_name}_attribute" do define_method "test_should_allow_#{attribute_name}_attribute" do
assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad>bar&lt;/bad> baz</p>", input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>") output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
check_sanitization(input, output, output, output)
end
end
# One test per allowed attribute: the upper-cased spelling of the attribute
# is expected to be stripped from the element.
Sanitizer::ALLOWED_ATTRIBUTES.each do |attr|
  define_method "test_should_forbid_#{attr.upcase}_attribute" do
    markup = "<p #{attr.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
    expected = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
    check_sanitization(markup, expected, expected, expected)
  end
end
# One test per allowed protocol: an href consisting of the protocol name
# is expected to survive sanitization.
Sanitizer::ALLOWED_PROTOCOLS.each do |scheme|
  define_method "test_should_allow_#{scheme}_uris" do
    markup = %(<a href="#{scheme}">foo</a>)
    expected = "<a href='#{scheme}'>foo</a>"
    check_sanitization(markup, expected, expected, expected)
  end
end
# One test per allowed protocol: the upper-cased protocol name in an href
# is expected to survive sanitization as well.
Sanitizer::ALLOWED_PROTOCOLS.each do |scheme|
  define_method "test_should_allow_uppercase_#{scheme}_uris" do
    markup = %(<a href="#{scheme.upcase}">foo</a>)
    expected = "<a href='#{scheme.upcase}'>foo</a>"
    check_sanitization(markup, expected, expected, expected)
  end
end
# For every element in SVG_ALLOW_LOCAL_HREF that is also an allowed element,
# generate tests around its xlink:href attribute: local ("#foo") versus
# non-local targets, with and without the xlink namespace declaration, and
# with a newline placed in front of the target.
Sanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
  next unless Sanitizer::ALLOWED_ELEMENTS.include?(tag_name)

  # Local href with the xlink namespace declared: expected to be kept.
  define_method "test_#{tag_name}_should_allow_local_href_with_ns_decl" do
    input = %(<#{tag_name} xlink:href="#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
    xhtmloutput = "<#{tag_name} xlink:href='#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end

  # Same, with a newline in front of the local target.
  define_method "test_#{tag_name}_should_allow_local_href_with_newline_and_ns_decl" do
    input = %(<#{tag_name} xlink:href="\n#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
    xhtmloutput = "<#{tag_name} xlink:href='\n#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end

  # Without the namespace declaration the whole element is expected to come back escaped.
  define_method "test_#{tag_name}_should_forbid_local_href_without_ns_decl" do
    input = %(<#{tag_name} xlink:href="#foo"/>)
    xhtmloutput = "&lt;#{tag_name} xlink:href=&#39;#foo&#39;&gt;&lt;/#{tag_name}&gt;"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end

  # Same, with a newline in front of the local target.
  define_method "test_#{tag_name}_should_forbid_local_href_with_newline_without_ns_decl" do
    input = %(<#{tag_name} xlink:href="\n#foo"/>)
    xhtmloutput = "&lt;#{tag_name} xlink:href=&#39;\n#foo&#39;&gt;&lt;/#{tag_name}&gt;"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end

  # Non-local href: the attribute is expected to be dropped, the element kept.
  define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_ns_decl" do
    input = %(<#{tag_name} xlink:href="http://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
    xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end

  # Same, with a newline in front of the non-local target.
  define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline_and_ns_decl" do
    input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
    xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
    check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
  end
end
def test_should_handle_astral_plane_characters
  # Numeric references to code points above U+FFFF are expected to come back
  # as their four-byte utf-8 sequences.
  ncr_markup = "<p>&#x1d4b5; &#x1d538;</p>"
  ncr_expected = "<p>\360\235\222\265 \360\235\224\270</p>"
  check_sanitization(ncr_markup, ncr_expected, ncr_expected, ncr_expected)

  # The same kind of character given as raw utf-8 inside an element is
  # expected to pass through unchanged.
  raw_markup = "<p><tspan>\360\235\224\270</tspan> a</p>"
  raw_expected = "<p><tspan>\360\235\224\270</tspan> a</p>"
  check_sanitization(raw_markup, raw_expected, raw_expected, raw_expected)
end
JSON::parse(open(File.expand_path(File.join(File.dirname(__FILE__), '/../sanitizer.dat'))).read).each do |test|
define_method "test_#{test['name']}" do
check_sanitization(
test['input'],
test['output'],
test['xhtml'] || test['output'],
test['rexml'] || test['output']
)
end end
end end
end
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
assert_equal "<p>foo &lt;bad>bar&lt;/bad> baz</p>",
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
end
end
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol}\">foo</a>",
sanitize_html(%(<a href="#{protocol}">foo</a>))
end
end
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
end
end
def test_should_allow_anchors
assert_equal "<a href=\"foo\">&lt;script>baz&lt;/script></a>",
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
end
# RFC 3986, sec 4.2
def test_allow_colons_in_path_component
assert_equal "<a href=\"./this:that\">foo</a>",
sanitize_html("<a href=\"./this:that\">foo</a>")
end
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_equal "<img #{img_attr}=\"foo\" />",
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
end
end
def test_should_handle_non_html
assert_equal 'abc', sanitize_html("abc")
end
def test_should_handle_blank_text
assert_equal '', sanitize_html('')
end
[%w(img src), %w(a href)].each do |(tag, attr)|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
end
end
[%w(img src), %w(a href)].each do |(tag, attr)|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
end
end
[%(<img src="javascript:alert('XSS');" />),
%(<img src=javascript:alert('XSS') />),
%(<img src="JaVaScRiPt:alert('XSS')" />),
%(<img src='javascript:alert(&quot;XSS&quot;)' />),
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
%(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
%(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
%(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
%(<img src="jav\tascript:alert('XSS');" />),
%(<img src="jav&#x09;ascript:alert('XSS');" />),
%(<img src="jav&#x0A;ascript:alert('XSS');" />),
%(<img src="jav&#x0D;ascript:alert('XSS');" />),
%(<img src=" &#14; javascript:alert('XSS');" />),
%(<img src="&#x20;javascript:alert('XSS');" />),
%(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
assert_equal "<img />", sanitize_html(img_hack)
end
end
def test_should_sanitize_tag_broken_up_by_null
assert_equal "&lt;scr>alert(\"XSS\")&lt;/scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
end
def test_should_sanitize_invalid_script_tag
assert_equal "&lt;script />&lt;/script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
end
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_equal "&lt;&lt;script>alert(\"XSS\");//&lt;&lt;/script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
assert_equal %(&lt;iframe src="http:" />&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
end
def test_should_sanitize_unclosed_script
assert_equal "&lt;script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
end
def test_should_sanitize_half_open_scripts
assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
end
def test_should_not_fall_for_ridiculous_hack
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
assert_equal "<img />", sanitize_html(img_hack)
end
def test_platypus
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
end
def test_xul
assert_equal %(<p style="">fubar</p>),
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
end
def test_input_image
assert_equal %(<input type="image" />),
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
end
def test_non_alpha_non_digit
assert_equal "&lt;script />&lt;/script>",
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
assert_equal "<a>foo</a>",
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
assert_equal "<img />",
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
end
def test_img_dynsrc_lowsrc
assert_equal "<img />",
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
assert_equal "<img />",
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
end
def test_div_background_image_unicode_encoded
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
end
def test_div_expression
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
end
def test_img_vbscript
assert_equal '<img />',
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
end end

Some files were not shown because too many files have changed in this diff Show more