2008-05-21 00:02:10 +02:00
|
|
|
module Sanitizer
|
|
|
|
|
|
|
|
# This module provides sanitization of XHTML+MathML+SVG
|
|
|
|
# and of inline style attributes.
|
|
|
|
#
|
|
|
|
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
|
|
|
|
2009-02-04 21:26:08 +01:00
|
|
|
require 'action_controller/vendor/html-scanner/html/tokenizer'
|
2008-05-21 00:02:10 +02:00
|
|
|
require 'node'
|
|
|
|
require 'stringsupport'
|
2009-10-08 23:22:50 +02:00
|
|
|
require 'set'
|
2008-05-21 00:02:10 +02:00
|
|
|
|
2010-05-22 21:34:08 +02:00
|
|
|
# --- Element whitelists -------------------------------------------------
# Tag names that are let through; anything else is escaped by xhtml_sanitize.
acceptable_elements = Set.new %w[a abbr acronym address area article aside
  audio b big blockquote br button canvas caption center cite code
  col colgroup command datalist dd del details dfn dialog dir div dl dt
  em fieldset figcaption figure font footer form h1 h2 h3 h4 h5 h6 header
  hgroup hr i img input ins kbd label legend li map mark menu meter nav
  ol optgroup option p pre progress q rp rt ruby s samp section select small
  source span strike strong sub summary sup table tbody td textarea tfoot
  th thead time tr tt u ul var video wbr]

mathml_elements = Set.new %w[annotation annotation-xml maction math merror mfrac
  mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot
  mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
  munderover none semantics]

svg_elements = Set.new %w[a animate animateColor animateMotion animateTransform
  circle clipPath defs desc ellipse feGaussianBlur filter font-face
  font-face-name font-face-src foreignObject g glyph hkern linearGradient
  line marker mask metadata missing-glyph mpath path pattern polygon
  polyline radialGradient rect set stop svg switch text textPath title tspan use]

# --- Attribute whitelists -----------------------------------------------
# NOTE: "optimum" and "pattern" are two separate attributes; they were
# previously fused into the single bogus token "optimumpattern", so neither
# attribute was actually allowed.
acceptable_attributes = Set.new %w[abbr accept accept-charset accesskey action
  align alt autocomplete axis bgcolor border cellpadding cellspacing char charoff
  checked cite class clear cols colspan color compact contenteditable contextmenu
  controls coords datetime dir disabled draggable enctype face for formaction frame
  headers height high href hreflang hspace icon id ismap label list lang longdesc
  loop low max maxlength media method min multiple name nohref noshade nowrap open
  optimum pattern placeholder poster preload pubdate radiogroup readonly rel
  required rev reversed rows rowspan rules spellcheck scope
  selected shape size span src start step style summary tabindex target title
  type usemap valign value vspace width wrap xml:lang]

mathml_attributes = Set.new %w[actiontype align close
  columnalign columnlines columnspacing columnspan depth display
  displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
  frame height linethickness lspace mathbackground mathcolor mathvariant
  maxsize minsize open other rowalign
  rowlines rowspacing rowspan rspace scriptlevel selection separator
  separators stretchy width xlink:href xlink:show xlink:type xmlns
  xmlns:xlink]

svg_attributes = Set.new %w[accent-height accumulate additive alphabetic
  arabic-form ascent attributeName attributeType baseProfile bbox begin
  by calcMode cap-height class clip-path clip-rule color
  color-interpolation-filters color-rendering
  content cx cy d dx dy descent display dur end fill fill-opacity fill-rule
  filterRes filterUnits font-family font-size font-stretch font-style
  font-variant font-weight from fx fy g1 g2 glyph-name gradientUnits
  hanging height horiz-adv-x horiz-origin-x id ideographic k keyPoints
  keySplines keyTimes lang marker-end marker-mid marker-start
  markerHeight markerUnits markerWidth maskContentUnits maskUnits
  mathematical max method min name offset opacity orient origin
  overline-position overline-thickness panose-1 path pathLength
  patternContentUnits patternTransform patternUnits points
  preserveAspectRatio primitiveUnits r refX refY repeatCount repeatDur
  requiredExtensions requiredFeatures restart rotate rx ry se:connector
  se:nonce slope spacing
  startOffset stdDeviation stemh stemv stop-color stop-opacity
  strikethrough-position strikethrough-thickness stroke stroke-dasharray
  stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit
  stroke-opacity stroke-width systemLanguage target text-anchor
  to transform type u1 u2 underline-position underline-thickness
  unicode unicode-range units-per-em values version viewBox
  visibility width widths x x-height x1 x2 xlink:actuate
  xlink:arcrole xlink:href xlink:role xlink:show xlink:title xlink:type
  xml:base xml:lang xml:space xmlns xmlns:xlink xmlns:se y y1 y2 zoomAndPan]

# Attributes whose value is a URI and therefore subject to the protocol check.
attr_val_is_uri = Set.new %w[href src cite action formaction longdesc xlink:href xml:base]

# SVG attributes whose value may contain url(...); only fragment references survive.
svg_attr_val_allows_ref = Set.new %w[clip-path color-profile cursor fill
  filter marker marker-start marker-mid marker-end mask stroke]

# SVG elements whose xlink:href must be a fragment identifier.
# "textPath" is the spelling used in svg_elements; the historical lower-case
# "textpath" entry is kept so existing behaviour for that name is unchanged.
svg_allow_local_href = Set.new %w[altGlyph animate animateColor animateMotion
  animateTransform cursor feImage filter linearGradient pattern
  radialGradient textpath textPath tref set use]

# --- CSS whitelists -----------------------------------------------------
acceptable_css_properties = Set.new %w[azimuth background-color
  border-bottom-color border-collapse border-color border-left-color
  border-right-color border-top-color clear color cursor direction
  display elevation float font font-family font-size font-style
  font-variant font-weight height letter-spacing line-height overflow
  pause pause-after pause-before pitch pitch-range richness speak
  speak-header speak-numeral speak-punctuation speech-rate stress
  text-align text-decoration text-indent unicode-bidi vertical-align
  voice-family volume white-space width]

acceptable_css_keywords = Set.new %w[auto aqua black block blue bold both bottom
  brown center collapse dashed dotted fuchsia gray green !important
  italic left lime maroon medium none navy normal nowrap olive pointer
  purple red right solid silver teal top transparent underline white
  yellow]

acceptable_svg_properties = Set.new %w[fill fill-opacity fill-rule stroke
  stroke-width stroke-linecap stroke-linejoin stroke-opacity]

# URI schemes permitted in attributes listed in attr_val_is_uri.
acceptable_protocols = Set.new %w[ed2k ftp http https irc mailto news gopher nntp
  telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

# --- Public constants ---------------------------------------------------
SHORTHAND_CSS_PROPERTIES = Set.new %w[background border margin padding]
VOID_ELEMENTS = Set.new %w[img br hr link meta area base basefont
  col frame input isindex param]

# Each of the following is only assigned when not already defined, so a
# caller can pre-define any of them before this code is loaded.
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
SVG_ATTR_VAL_ALLOWS_REF = svg_attr_val_allows_ref unless defined?(SVG_ATTR_VAL_ALLOWS_REF)
SVG_ALLOW_LOCAL_HREF = svg_allow_local_href unless defined?(SVG_ALLOW_LOCAL_HREF)
|
2008-05-21 00:02:10 +02:00
|
|
|
|
|
|
|
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only uri schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# Certain SVG attributes (SVG_ATTR_VAL_ALLOWS_REF) may take a url as a value. These are restricted to
# fragment-id's (in-document references). Certain SVG elements (SVG_ALLOW_LOCAL_HREF) allow href attributes
# which, again, are restricted to be fragment-id's.
#
# You can adjust what gets sanitized by defining these constants before this Module is loaded.
#
# Input for which sanitizeable? is false (nil, empty, or containing no "<") is returned unchanged.
#
#   xhtml_sanitize('<script> do_nasty_stuff() </script>')
#    => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
#   xhtml_sanitize('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
def xhtml_sanitize(html)
  return html unless sanitizeable?(html)

  tokenizer = HTML::Tokenizer.new(html.to_utf8)
  results = []

  while token = tokenizer.next
    node = XHTML::Node.parse(nil, 0, 0, token, false)
    results << case node.tag?
               when true
                 if ALLOWED_ELEMENTS.include?(node.name)
                   # Whitelisted element: clean its attributes in place, then emit it.
                   process_attributes_for node
                   node.to_s
                 else
                   # Disallowed element: neutralise the markup so it is shown as text.
                   # (The replacements must be the entities; replacing "<" with "<"
                   # would let the tag through untouched.)
                   node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
                 end
               else
                 # Non-tag content: normalise escaping by decoding, then re-encoding.
                 node.to_s.unescapeHTML.escapeHTML
               end
  end

  results.join
end
|
|
|
|
|
|
|
|
# Cheap pre-check used before tokenizing: true only when +text+ is
# non-nil, non-empty and contains at least one "<" character.
def sanitizeable?(text)
  return false if text.nil?
  return false if text.empty?

  text.index("<") ? true : false
end
|
2009-10-09 20:02:02 +02:00
|
|
|
|
|
|
|
protected
|
|
|
|
|
|
|
|
# Clean the attributes of +node+ in place.
#
# For every attribute:
# * it is removed unless its value is a String and its name is in ALLOWED_ATTRIBUTES;
# * an xlink:href on an element in SVG_ALLOW_LOCAL_HREF is removed unless it is a
#   fragment reference (starts with "#");
# * an attribute in ATTR_VAL_IS_URI is removed when it carries a scheme that is not
#   in ALLOWED_PROTOCOLS;
# * url(...) references that are not fragment references are blanked out of
#   attributes in SVG_ATTR_VAL_ALLOWS_REF;
# * a style attribute is passed through sanitize_css.
#
# Surviving values are stored back with their HTML escaping normalised.
# Returns nil when the node has no attributes, otherwise the attribute collection.
def process_attributes_for(node)
  return unless node.attributes

  # Walk a snapshot of the pairs: entries are deleted from node.attributes
  # below, and the collection should not be modified while it is being iterated.
  node.attributes.to_a.each do |attr, val|
    permitted = String === val && ALLOWED_ATTRIBUTES.include?(attr)
    unless permitted
      node.attributes.delete attr
      next
    end
    val = val.unescapeHTML.escapeHTML

    if attr == 'xlink:href' && SVG_ALLOW_LOCAL_HREF.include?(node.name) && val =~ /^\s*[^#\s]/m
      node.attributes.delete attr
      next
    end

    if ATTR_VAL_IS_URI.include?(attr)
      # Strip backticks, control characters, whitespace and the byte pairs
      # \302\200-\302\240 before looking for a scheme, so that the scheme cannot
      # be disguised. The pattern works on raw bytes, hence the /n (binary) flag:
      # without it the lone \302 escape is not a valid character when the source
      # encoding is UTF-8.
      val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/n, '').downcase
      if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
        node.attributes.delete attr
        next
      end
    end

    val = val.to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/mi, ' ') if SVG_ATTR_VAL_ALLOWS_REF.include?(attr)
    val = sanitize_css(val) if attr == 'style'
    node.attributes[attr] = val
  end

  node.attributes
end
|
|
|
|
|
2008-05-21 00:02:10 +02:00
|
|
|
# Reduce an inline style declaration to its whitelisted parts.
#
# +style+ is converted with to_s, url(...) values are blanked, and the whole
# string must then pass two "gauntlet" patterns or '' is returned. Each
# surviving "property: value" pair is kept when
# * the property is in ALLOWED_CSS_PROPERTIES, or
# * the property (or its part before the first "-") is in SHORTHAND_CSS_PROPERTIES
#   and every whitespace-separated keyword is in ALLOWED_CSS_KEYWORDS or looks
#   like a colour / short length, or
# * the property is in ALLOWED_SVG_PROPERTIES.
#
# The constants are looked up through self.class. Returns the kept
# declarations joined by single spaces (possibly '').
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # The patterns are anchored with \A and \z so they must match the WHOLE
  # value. With ^ and $ a single matching line was enough, which let
  # arbitrary content through after a newline.
  return '' unless style =~ /\A([-:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  owner = self.class
  css_properties = owner.const_get("ALLOWED_CSS_PROPERTIES")
  shorthand_properties = owner.const_get("SHORTHAND_CSS_PROPERTIES")
  css_keywords = owner.const_get("ALLOWED_CSS_KEYWORDS")
  svg_properties = owner.const_get("ALLOWED_SVG_PROPERTIES")

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop.downcase!
    if css_properties.include?(prop)
      clean << "#{prop}: #{val};"
    elsif shorthand_properties.include?(prop.split('-')[0])
      # Shorthand values are only kept when every keyword is individually acceptable.
      acceptable = val.split.all? do |keyword|
        css_keywords.include?(keyword) ||
          keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" if acceptable
    elsif svg_properties.include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
|
2009-11-30 23:28:18 +01:00
|
|
|
|
|
|
|
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
# ensure well-formedness.
#
# :call-seq:
#    safe_xhtml_sanitize(string) -> string
#
# The input is purified, run through xhtml_sanitize, wrapped in an XHTML
# <div> and handed to REXML. When REXML accepts it, the re-serialized
# content (with the wrapper removed again) is returned; when REXML raises
# a ParseException, the sanitized text is returned HTML-escaped instead.
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
#
# The +options+ argument is accepted but not used here.
def safe_xhtml_sanitize(html, options = {})
  begin
    sanitized = xhtml_sanitize(html.purify)
    wrapped = "<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>"
    reparsed = REXML::Document.new(wrapped)
    serialized = reparsed.to_s
    # Peel the wrapper <div> back off the serialized document.
    sanitized = serialized.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    # Not well-formed: fall back to showing the sanitized text as literal text.
    sanitized = sanitized.escapeHTML
  end
end
|
|
|
|
|
2008-05-21 00:02:10 +02:00
|
|
|
end
|