Ruby 1.9 Compatibility
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
This commit is contained in:
parent
79c8572053
commit
a6429f8c22
|
@ -1,207 +1,262 @@
|
|||
# == Introduction
|
||||
#
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
|
||||
#
|
||||
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
|
||||
# resemble that of browsers.
|
||||
#
|
||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
||||
# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
|
||||
# by running the output of sanitize_xhtml() through REXML
|
||||
#
|
||||
# == Files
|
||||
#
|
||||
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
|
||||
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
|
||||
#
|
||||
# == Author
|
||||
#
|
||||
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
|
||||
#
|
||||
# == License
|
||||
#
|
||||
# Ruby License
|
||||
|
||||
module Sanitize
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
require 'html5/html5parser'
|
||||
require 'html5/liberalxmlparser'
|
||||
require 'html5/treewalkers'
|
||||
require 'html5/treebuilders'
|
||||
require 'html5/serializer'
|
||||
require 'html5/sanitizer'
|
||||
require 'stringsupport.rb'
|
||||
|
||||
include HTML5
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules.
|
||||
#
|
||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||
|
||||
require 'html/tokenizer'
|
||||
require 'node'
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
||||
'ul', 'var']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
|
||||
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
|
||||
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
|
||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
||||
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
||||
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
||||
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
|
||||
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
|
||||
'offset', 'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
|
||||
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
|
||||
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target',
|
||||
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
||||
'underline-position', 'underline-thickness', 'unicode',
|
||||
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
||||
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
||||
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
||||
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
||||
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
|
||||
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
|
||||
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
|
||||
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
|
||||
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
|
||||
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
|
||||
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
|
||||
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
|
||||
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# Attributes listed in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
|
||||
# ALLOWED_PROTOCOLS are allowed.
|
||||
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => &lt;script> do_nasty_stuff() &lt;/script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_html(html)
|
||||
if html.index("<")
|
||||
tokenizer = HTML::Tokenizer.new(html)
|
||||
new_text = ""
|
||||
|
||||
while token = tokenizer.next
|
||||
node = XHTML::Node.parse(nil, 0, 0, token, false)
|
||||
new_text << case node.tag?
|
||||
when true
|
||||
if ALLOWED_ELEMENTS.include?(node.name)
|
||||
if node.closing != :close
|
||||
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
||||
ATTR_VAL_IS_URI.each do |attr|
|
||||
val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
||||
node.attributes.delete attr
|
||||
end
|
||||
end
|
||||
if node.attributes['style']
|
||||
node.attributes['style'] = sanitize_css(node.attributes['style'])
|
||||
end
|
||||
end
|
||||
node.to_s
|
||||
# :call-seq:
|
||||
# sanitize_xhtml(string) -> string
|
||||
# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_xhtml(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
@to_tree = false
|
||||
options.each do |name, value|
|
||||
next unless %w(encoding treebuilder to_tree).include? name.to_s
|
||||
if name.to_s == 'treebuilder'
|
||||
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
|
||||
else
|
||||
node.to_s.gsub(/</, "<")
|
||||
instance_variable_set("@#{name}", value)
|
||||
end
|
||||
end
|
||||
if @encoding == 'utf-8'
|
||||
parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
else
|
||||
node.to_s.gsub(/</, "<")
|
||||
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
end
|
||||
return parsed if @to_tree
|
||||
return parsed.to_s
|
||||
end
|
||||
|
||||
html = new_text
|
||||
end
|
||||
html
|
||||
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
|
||||
# ensure well-formedness.
|
||||
#
|
||||
# :call-seq:
|
||||
# safe_sanitize_xhtml(string) -> string
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
# Sanitize +html+ with sanitize_xhtml, then re-parse the result with REXML to
# make sure it is well-formed.
#
# html::    the markup to sanitize
# options:: forwarded to sanitize_xhtml; +:to_tree+ is always forced to
#           +false+ because a string is needed for the REXML round-trip
#
# Returns the sanitized string. If REXML cannot parse the sanitized output,
# the output is returned HTML-escaped instead (via String#escapeHTML, which is
# not defined in this file).
def safe_sanitize_xhtml(html, options = {})
  # Pass a merged copy so the caller's options hash is left untouched.
  sanitized = sanitize_xhtml(html, options.merge(:to_tree => false))
  # Wrap in a namespaced <div> so a fragment with several top-level nodes parses as one document.
  doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
  # Strip the wrapper again after serialization.
  sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
  sanitized = sanitized.escapeHTML
end
|
||||
|
||||
def sanitize_css(style)
|
||||
# disallow urls
|
||||
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
|
||||
|
||||
# gauntlet
|
||||
if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
|
||||
style = ''
|
||||
return style
|
||||
# Sanitize a string, parsed using HTML parsing rules.
|
||||
#
|
||||
# :call-seq:
|
||||
# sanitize_html( string ) -> string
|
||||
# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_html(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
@to_tree = false
|
||||
options.each do |name, value|
|
||||
next unless %w(encoding treebuilder to_tree).include? name.to_s
|
||||
if name.to_s == 'treebuilder'
|
||||
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
|
||||
else
|
||||
instance_variable_set("@#{name}", value)
|
||||
end
|
||||
if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
|
||||
style = ''
|
||||
return style
|
||||
end
|
||||
if @encoding == 'utf-8'
|
||||
parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
else
|
||||
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
end
|
||||
return parsed if @to_tree
|
||||
return parsed.to_s
|
||||
end
|
||||
|
||||
clean = []
|
||||
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
|
||||
if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
|
||||
clean << prop + ': ' + val + ';'
|
||||
elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
|
||||
goodval = true
|
||||
val.split().each do |keyword|
|
||||
if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
|
||||
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
|
||||
goodval = false
|
||||
end
|
||||
end
|
||||
if goodval
|
||||
clean << prop + ': ' + val + ';'
|
||||
end
|
||||
elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
|
||||
clean << prop + ': ' + val + ';'
|
||||
end
|
||||
end
|
||||
|
||||
style = clean.join(' ')
|
||||
# Sanitize a REXML tree. The output is a string.
|
||||
#
|
||||
# :call-seq:
|
||||
# sanitize_rexml(tree) -> string
|
||||
#
|
||||
# Walk +tree+ with the 'rexml2' tree walker and serialize the resulting token
# stream with XHTMLSerializer, with the serializer's +:sanitize+ option on.
#
# tree:: the REXML tree to serialize
#
# Returns whatever XHTMLSerializer.serialize returns for these options.
def sanitize_rexml(tree)
  walker_class = TreeWalkers.get_tree_walker('rexml2')
  serializer_options = {
    :encoding                      => 'utf-8',
    :space_before_trailing_solidus => true,
    :inject_meta_charset           => false,
    :sanitize                      => true
  }
  XHTMLSerializer.serialize(walker_class.new(tree), serializer_options)
end
|
||||
end
|
||||
|
||||
require 'rexml/element'
|
||||
module REXML #:nodoc:
  class Element

    # Rewrite the text nodes and attribute values of every descendant element
    # by passing them through String#to_ncr (not defined in this file;
    # per the original comments it converts XHTML+MathML named entities to
    # numeric character references).
    #
    # :call-seq:
    #    tree.to_ncr  -> REXML::Element
    #
    # REXML, typically, converts NCRs to utf-8 characters, which is what
    # you'll see when you access the resulting REXML document.
    #
    # The whole subtree is traversed, which can be SLOW; serializing to a
    # string and calling String#to_ncr on that is often faster.
    #
    # NOTE(review): only child elements (recursively) are visited; the
    # receiver's own text nodes and attributes are left as they are.
    #
    # Returns +self+.
    def to_ncr
      each_element do |el|
        # Fetch the text-node list once per element instead of once per index.
        el.texts.each { |text| text.value = text.to_s.to_ncr }
        el.attributes.each { |name, val| el.attributes[name] = val.to_ncr }
        el.to_ncr if el.has_elements?
      end
      self
    end

    # Rewrite the text nodes and attribute values of every descendant element
    # by passing them through String#to_utf8 (not defined in this file;
    # per the original comments it converts XHTML+MathML named entities to
    # UTF-8).
    #
    # :call-seq:
    #    tree.to_utf8  -> REXML::Element
    #
    # The whole subtree is traversed, which can be SLOW; serializing to a
    # string and calling String#to_utf8 on that is often faster.
    #
    # NOTE(review): only child elements (recursively) are visited; the
    # receiver's own text nodes and attributes are left as they are.
    #
    # Returns +self+.
    def to_utf8
      each_element do |el|
        # Fetch the text-node list once per element instead of once per index.
        el.texts.each { |text| text.value = text.to_s.to_utf8 }
        el.attributes.each { |name, val| el.attributes[name] = val.to_utf8 }
        el.to_utf8 if el.has_elements?
      end
      self
    end

  end
end
|
||||
|
||||
module HTML5 #:nodoc: all
  module TreeWalkers

    # The original had a bare `private` here. It had no effect on the
    # singleton methods defined inside `class << self` below (those stay
    # public and are called from outside), so it has been dropped.

    class << self
      # Look up a tree-walker class by name (case-insensitive).
      #
      # 'rexml'  -> REXML::TreeWalker, loaded on demand from html5/treewalkers/rexml
      # 'rexml2' -> REXML2::TreeWalker, defined below
      #
      # Raises a RuntimeError for any other name.
      def [](name)
        case name.to_s.downcase
        when 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        when 'rexml2'
          REXML2::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      alias :get_tree_walker :[]
    end

    module REXML2
      # Tree walker over REXML nodes that passes attribute values and text
      # through String#to_utf8 (not defined in this file).
      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

        private

        # Describe +node+ as an array whose first entry is a type tag,
        # chosen by the node's REXML class. An XML declaration yields [nil];
        # any unrecognised class yields [:UNKNOWN, <class name>].
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            if !node.name
              # An element without a name is reported as a fragment.
              [:DOCUMENT_FRAGMENT]
            else
              [:ELEMENT, node.name,
                node.attributes.map {|name,value| [name,value.to_utf8]},
                node.has_elements? || node.has_text?]
            end
          when ::REXML::Text
            [:TEXT, node.value.to_utf8]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name, node.public, node.system]
          when ::REXML::XMLDecl
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+ (nil when it has none).
        def first_child(node)
          node.children.first
        end

        # The node following +node+ under the same parent.
        def next_sibling(node)
          node.next_sibling
        end

        # The parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end
|
||||
|
|
|
@ -1,187 +1,189 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
|
||||
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
|
||||
require 'sanitize'
|
||||
require 'json'
|
||||
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
|
||||
include Sanitize
|
||||
|
||||
def setup
|
||||
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
def do_sanitize_xhtml stream
|
||||
safe_sanitize_xhtml(stream)
|
||||
end
|
||||
|
||||
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
|
||||
assert_equal htmloutput, do_sanitize_xhtml(input)
|
||||
end
|
||||
|
||||
def rexml_doc(string)
|
||||
REXML::Document.new(
|
||||
"<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>")
|
||||
end
|
||||
|
||||
def my_rex(string)
|
||||
sanitize_rexml(rexml_doc(string.to_utf8)).gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
|
||||
end
|
||||
|
||||
def test_sanitize_named_entities
|
||||
input = '<p>Greek &phis; φ, double-struck 𝔸, numeric 𝔸 ⁗, uppercase ™ <</p>'
|
||||
output = "<p>Greek \317\225 \317\206, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227, uppercase \342\204\242 <</p>"
|
||||
output2 = "<p>Greek \317\225 \317\206, double-struck \360\235\224\270, numeric 𝔸 ⁗, uppercase \342\204\242 <</p>"
|
||||
assert_equal(output, sanitize_xhtml(input))
|
||||
assert_equal(output, sanitize_html(input))
|
||||
assert_equal(output, my_rex(input))
|
||||
assert_equal(output2, input.to_utf8)
|
||||
end
|
||||
|
||||
def test_sanitize_malformed_utf8
|
||||
input = "<p>\357elephant & \302ivory</p>"
|
||||
output = "<p>\357\277\275elephant & \357\277\275ivory</p>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
|
||||
Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
define_method "test_should_allow_#{tag_name}_tag" do
|
||||
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz</#{tag_name}>",
|
||||
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
||||
htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
|
||||
xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
|
||||
rexmloutput = xhtmloutput
|
||||
|
||||
if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
|
||||
htmloutput = "foo <bad>bar</bad> baz"
|
||||
xhtmloutput = htmloutput
|
||||
elsif tag_name == 'col'
|
||||
htmloutput = "foo <bad>bar</bad> baz"
|
||||
xhtmloutput = htmloutput
|
||||
rexmloutput = "<col title='1' />"
|
||||
elsif tag_name == 'table'
|
||||
htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
|
||||
xhtmloutput = htmloutput
|
||||
elsif tag_name == 'image'
|
||||
htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
|
||||
xhtmloutput = htmloutput
|
||||
rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
|
||||
elsif VOID_ELEMENTS.include?(tag_name)
|
||||
htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
|
||||
xhtmloutput = htmloutput
|
||||
htmloutput += '<br/>' if tag_name == 'br'
|
||||
rexmloutput = "<#{tag_name} title='1' />"
|
||||
end
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, rexmloutput)
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
||||
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
|
||||
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
|
||||
input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
||||
output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
||||
xhtmloutput = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
|
||||
check_sanitization(input, output, xhtmloutput, output)
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
if attribute_name != 'style'
|
||||
Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
next if attribute_name == 'style' || attribute_name.include?(':')
|
||||
define_method "test_should_allow_#{attribute_name}_attribute" do
|
||||
assert_equal "<p #{attribute_name}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||
end
|
||||
input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
||||
output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
|
||||
htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
||||
assert_equal "<p>foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||
input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
|
||||
output = "<p>foo <bad>bar</bad> baz</p>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol}">foo</a>))
|
||||
input = %(<a href="#{protocol}">foo</a>)
|
||||
output = "<a href='#{protocol}'>foo</a>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
|
||||
input = %(<a href="#{protocol.upcase}">foo</a>)
|
||||
output = "<a href='#{protocol.upcase}'>foo</a>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_allow_anchors
|
||||
assert_equal "<a href=\"foo\"><script>baz</script></a>",
|
||||
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
|
||||
Sanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
|
||||
next unless Sanitizer::ALLOWED_ELEMENTS.include?(tag_name)
|
||||
define_method "test_#{tag_name}_should_allow_local_href_with_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
|
||||
output = "<#{tag_name.downcase} xlink:href='#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
xhtmloutput = "<#{tag_name} xlink:href='#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
|
||||
# RFC 3986, sec 4.2
|
||||
def test_allow_colons_in_path_component
|
||||
assert_equal "<a href=\"./this:that\">foo</a>",
|
||||
sanitize_html("<a href=\"./this:that\">foo</a>")
|
||||
define_method "test_#{tag_name}_should_allow_local_href_with_newline_and_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="\n#foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
|
||||
output = "<#{tag_name.downcase} xlink:href='\n#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
xhtmloutput = "<#{tag_name} xlink:href='\n#foo' xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
|
||||
%w(src width height alt).each do |img_attr|
|
||||
define_method "test_should_allow_image_#{img_attr}_attribute" do
|
||||
assert_equal "<img #{img_attr}=\"foo\" />",
|
||||
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
|
||||
define_method "test_#{tag_name}_should_forbid_local_href_without_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="#foo"/>)
|
||||
output = "<#{tag_name.downcase} xlink:href='#foo'/>"
|
||||
xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
|
||||
define_method "test_#{tag_name}_should_forbid_local_href_with_newline_without_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="\n#foo"/>)
|
||||
output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
|
||||
xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
|
||||
define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="http://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
|
||||
output = "<#{tag_name.downcase} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
|
||||
define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline_and_ns_decl" do
|
||||
input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo" xmlns:xlink='http://www.w3.org/1999/xlink'/>)
|
||||
output = "<#{tag_name.downcase} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
xhtmloutput = "<#{tag_name} xmlns:xlink='http://www.w3.org/1999/xlink'/>"
|
||||
check_sanitization(input, xhtmloutput, xhtmloutput, xhtmloutput)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_handle_non_html
|
||||
assert_equal 'abc', sanitize_html("abc")
|
||||
def test_should_handle_astral_plane_characters
|
||||
input = "<p>𝒵 𝔸</p>"
|
||||
output = "<p>\360\235\222\265 \360\235\224\270</p>"
|
||||
check_sanitization(input, output, output, output)
|
||||
|
||||
input = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
||||
output = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
||||
check_sanitization(input, output, output, output)
|
||||
end
|
||||
|
||||
def test_should_handle_blank_text
|
||||
assert_equal '', sanitize_html('')
|
||||
JSON::parse(open(File.expand_path(File.join(File.dirname(__FILE__), '/../sanitizer.dat'))).read).each do |test|
|
||||
define_method "test_#{test['name']}" do
|
||||
check_sanitization(
|
||||
test['input'],
|
||||
test['output'],
|
||||
test['xhtml'] || test['output'],
|
||||
test['rexml'] || test['output']
|
||||
)
|
||||
end
|
||||
|
||||
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
|
||||
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
end
|
||||
|
||||
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
|
||||
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
end
|
||||
|
||||
[%(<img src="javascript:alert('XSS');" />),
|
||||
%(<img src=javascript:alert('XSS') />),
|
||||
%(<img src="JaVaScRiPt:alert('XSS')" />),
|
||||
%(<img src='javascript:alert("XSS")' />),
|
||||
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src="jav\tascript:alert('XSS');" />),
|
||||
%(<img src="jav	ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="  javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
|
||||
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
|
||||
assert_equal "<img />", sanitize_html(img_hack)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_sanitize_tag_broken_up_by_null
|
||||
assert_equal "<scr>alert(\"XSS\")</scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_invalid_script_tag
|
||||
assert_equal "<script /></script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_script_tag_with_multiple_open_brackets
|
||||
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
|
||||
assert_equal %(<iframe src="http:" /><), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
|
||||
end
|
||||
|
||||
def test_should_sanitize_unclosed_script
|
||||
assert_equal "<script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_half_open_scripts
|
||||
assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
|
||||
end
|
||||
|
||||
def test_should_not_fall_for_ridiculous_hack
|
||||
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
|
||||
assert_equal "<img />", sanitize_html(img_hack)
|
||||
end
|
||||
|
||||
def test_platypus
|
||||
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
|
||||
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
|
||||
end
|
||||
|
||||
def test_xul
|
||||
assert_equal %(<p style="">fubar</p>),
|
||||
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
|
||||
end
|
||||
|
||||
def test_input_image
|
||||
assert_equal %(<input type="image" />),
|
||||
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
|
||||
end
|
||||
|
||||
def test_non_alpha_non_digit
|
||||
assert_equal "<script /></script>",
|
||||
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
|
||||
assert_equal "<a>foo</a>",
|
||||
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
|
||||
assert_equal "<img />",
|
||||
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
|
||||
end
|
||||
|
||||
def test_img_dynsrc_lowsrc
|
||||
assert_equal "<img />",
|
||||
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
|
||||
assert_equal "<img />",
|
||||
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
|
||||
end
|
||||
|
||||
def test_div_background_image_unicode_encoded
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
|
||||
end
|
||||
|
||||
def test_div_expression
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
|
||||
end
|
||||
|
||||
def test_img_vbscript
|
||||
assert_equal '<img />',
|
||||
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
|
||||
end
|
||||
|
||||
end
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue