2007-05-25 20:52:27 -05:00
|
|
|
require 'cgi'
|
2007-07-04 17:36:59 -05:00
|
|
|
require 'html5/tokenizer'
|
2007-05-25 20:52:27 -05:00
|
|
|
|
2007-07-04 17:36:59 -05:00
|
|
|
module HTML5
|
2007-05-25 20:52:27 -05:00
|
|
|
|
|
|
|
# This module provides sanitization of XHTML+MathML+SVG
|
|
|
|
# and of inline style attributes.
|
2007-06-06 00:56:43 -05:00
|
|
|
#
|
|
|
|
# It can be applied either at the Tokenizer stage:
|
|
|
|
#
|
|
|
|
# HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
|
|
|
|
#
|
|
|
|
# or, if you already have a parse tree (in this example, a REXML tree),
|
|
|
|
# at the Serializer stage:
|
|
|
|
#
|
2007-08-30 12:19:10 -05:00
|
|
|
# tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
|
2007-06-06 00:56:43 -05:00
|
|
|
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
|
|
|
# :sanitize => true})
|
2007-05-25 20:52:27 -05:00
|
|
|
|
2007-08-30 12:19:10 -05:00
|
|
|
module HTMLSanitizeModule

  # Whitelist-based sanitizer for token hashes. Including classes get
  # #sanitize_token and #sanitize_css. The ALLOWED_* lists are looked up
  # through self.class.const_get at call time, so an including class (or a
  # subclass of it) can override them by defining a constant of the same name.

  ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
    button caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
    legend li map menu ol optgroup option p pre q s samp select small span
    strike strong sub sup table tbody td textarea tfoot th thead tr tt u
    ul var]

  MATHML_ELEMENTS = %w[annotation annotation-xml maction math merror mfrac
    mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow
    mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
    munderover none semantics]

  SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
    circle defs desc ellipse font-face font-face-name font-face-src foreignObject
    g glyph hkern linearGradient line marker metadata missing-glyph
    mpath path polygon polyline radialGradient rect set stop svg switch
    text title tspan use]

  ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
    align alt axis border cellpadding cellspacing char charoff charset
    checked cite class clear cols colspan color compact coords datetime
    dir disabled enctype for frame headers height href hreflang hspace id
    ismap label lang longdesc maxlength media method multiple name nohref
    noshade nowrap prompt readonly rel rev rows rowspan rules scope
    selected shape size span src start style summary tabindex target title
    type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = %w[actiontype align close columnalign columnalign
    columnalign columnlines columnspacing columnspan depth display
    displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
    frame height linethickness lspace mathbackground mathcolor mathvariant
    mathvariant maxsize minsize open other rowalign rowalign rowalign rowlines
    rowspacing rowspan rspace scriptlevel selection separator separators
    stretchy width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
    arabic-form ascent attributeName attributeType baseProfile bbox begin
    by calcMode cap-height class color color-rendering content cx cy d dx
    dy descent display dur end fill fill-opacity fill-rule font-family
    font-size font-stretch font-style font-variant font-weight from fx fy g1
    g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
    ideographic k keyPoints keySplines keyTimes lang marker-end
    marker-mid marker-start markerHeight markerUnits markerWidth
    mathematical max min name offset opacity orient origin
    overline-position overline-thickness panose-1 path pathLength points
    preserveAspectRatio r refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures restart rotate rx ry slope stemh
    stemv stop-color stop-opacity strikethrough-position
    strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
    stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
    stroke-width systemLanguage target text-anchor to transform type u1
    u2 underline-position underline-thickness unicode unicode-range
    units-per-em values version viewBox visibility width widths x
    x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
    xmlns:xlink y y1 y2 zoomAndPan]

  # Attributes whose value is checked against the protocol whitelist.
  ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]

  # Attributes in which url(...) references are stripped unless they
  # point at a local fragment (start with '#').
  SVG_ATTR_VAL_ALLOWS_REF = %w[clip-path color-profile cursor fill
    filter marker marker-start marker-mid marker-end mask stroke]

  # Elements whose xlink:href is removed unless it is a local fragment.
  SVG_ALLOW_LOCAL_HREF = %w[altGlyph animate animateColor animateMotion
    animateTransform cursor feImage filter linearGradient pattern
    radialGradient textpath tref set use]

  ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
    border-bottom-color border-collapse border-color border-left-color
    border-right-color border-top-color clear color cursor direction
    display elevation float font font-family font-size font-style
    font-variant font-weight height letter-spacing line-height overflow
    pause pause-after pause-before pitch pitch-range richness speak
    speak-header speak-numeral speak-punctuation speech-rate stress
    text-align text-decoration text-indent unicode-bidi vertical-align
    voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
    brown center collapse dashed dotted fuchsia gray green !important
    italic left lime maroon medium none navy normal nowrap olive pointer
    purple red right solid silver teal top transparent underline white
    yellow]

  ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
    stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
    telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Sanitizes a single token hash in place and returns it.
  #
  # * Tag tokens (:StartTag, :EndTag, :EmptyTag) whose :name is in
  #   ALLOWED_ELEMENTS keep their type; when the token has a :data key, its
  #   attribute pairs are filtered (whitelist, URI protocol check, SVG
  #   reference stripping, style sanitization) and written back as an array
  #   of [name, value] pairs.
  # * Tag tokens with any other :name are converted to a :Characters token
  #   whose :data is the textual form of the tag; :name is removed.
  # * :Comment tokens get their :data replaced with an empty string.
  # * Every other token type is returned untouched.
  def sanitize_token(token)
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
        if token.has_key? :data
          allowed_attributes = self.class.const_get("ALLOWED_ATTRIBUTES")
          allowed_protocols = self.class.const_get("ALLOWED_PROTOCOLS")

          # Keep only whitelisted attributes; a repeated name keeps its
          # last value, as a hash built from the pairs would.
          attrs = {}
          Array(token[:data]).each do |attr, value|
            attrs[attr] = value if allowed_attributes.include?(attr)
          end

          ATTR_VAL_IS_URI.each do |attr|
            # Strip backticks, ASCII control characters/whitespace, DEL and
            # U+0080..U+00A0 before looking for a scheme, so that values such
            # as "java\tscript:" cannot slip past the protocol check.
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).
              gsub(/[`\u0000-\u0020\u007f-\u00a0]+/, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ &&
               !allowed_protocols.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end

          SVG_ATTR_VAL_ALLOWS_REF.each do |attr|
            # Blank out url(...) references that are not local ('#...').
            attrs[attr] = attrs[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attrs[attr]
          end

          # '^' is deliberately kept here: a non-local target on any line of
          # the value causes the attribute to be dropped.
          if SVG_ALLOW_LOCAL_HREF.include?(token[:name]) && attrs['xlink:href'] &&
             attrs['xlink:href'] =~ /^\s*[^#\s].*/m
            attrs.delete 'xlink:href'
          end

          attrs['style'] = sanitize_css(attrs['style']) if attrs['style']

          token[:data] = attrs.to_a
        end
        return token
      else
        # Disallowed element: turn the tag into its textual form.
        if token[:type] == :EndTag
          text = "</#{token[:name]}>"
        else
          attr_text = ''
          if token[:data]
            # to_s guards against nil attribute values.
            attr_text = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v.to_s)}\"" }.join('')
          end
          closing = token[:type] == :EmptyTag ? '/>' : '>'
          text = "<#{token[:name]}#{attr_text}#{closing}"
        end
        token[:data] = text
        token[:type] = :Characters
        token.delete(:name)
        return token
      end
    when :Comment
      token[:data] = ""
      return token
    else
      return token
    end
  end

  # Sanitizes the value of an inline style attribute.
  #
  # url(...) values are removed first. The remaining text must consist solely
  # of "property: value" declarations built from a restricted character set;
  # otherwise the empty string is returned. Surviving declarations are kept
  # when the property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES,
  # or when it is a background/border/margin/padding property whose value is
  # made only of ALLOWED_CSS_KEYWORDS, colours and short lengths.
  #
  # Returns the cleaned declarations joined by single spaces.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet -- anchored with \A/\z so the WHOLE string must match. With
    # ^/$ a single matching (or empty) line was enough to get arbitrary
    # content on other lines past these checks.
    return '' unless style =~ /\A([-:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\s*\z/

    allowed_properties = self.class.const_get("ALLOWED_CSS_PROPERTIES")
    allowed_keywords = self.class.const_get("ALLOWED_CSS_KEYWORDS")
    allowed_svg_properties = self.class.const_get("ALLOWED_SVG_PROPERTIES")

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop = prop.downcase
      if allowed_properties.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        # Shorthand properties: every whitespace-separated part must be a
        # known keyword, a hex/rgb colour or a short numeric length.
        acceptable = val.split.all? do |keyword|
          allowed_keywords.include?(keyword) ||
            keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << "#{prop}: #{val};" if acceptable
      elsif allowed_svg_properties.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    clean.join(' ')
  end
end
|
2007-06-05 16:34:49 -05:00
|
|
|
|
|
|
|
# Tokenizer variant that passes every token produced by HTMLTokenizer#each
# through HTMLSanitizeModule#sanitize_token before handing it to the caller.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields each token from the parent tokenizer after sanitizing it.
  def each
    super { |raw_token| yield sanitize_token(raw_token) }
  end
end
|
|
|
|
|
2007-05-25 20:52:27 -05:00
|
|
|
end
|