Rough In New Sanitizer
Start work (which may not pan out) on a new sanitizer. Right now, it passes all but 1 of the HTML5lib Sanitizer's unit tests. But it doesn't do much of anything to ensure well-formedness. This is not an issue for Maruku-processed content, but it is a concern for <nowiki> blocks. (One solution would be to use the HTML5lib parser on <nowiki> blocks.) In any case, this baby is 3 times as fast as the HTML5lib sanitizer.
This commit is contained in:
parent
f8e74e53bd
commit
800880f382
15 changed files with 3657 additions and 12 deletions
|
@ -1,7 +1,7 @@
|
|||
# Controller responsible for serving files and pictures.
|
||||
|
||||
require 'zip/zip'
|
||||
require 'sanitize'
|
||||
require 'stringsupport'
|
||||
|
||||
class FileController < ApplicationController
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ require 'fileutils'
|
|||
require 'maruku'
|
||||
require 'parsedate'
|
||||
require 'zip/zip'
|
||||
require 'sanitize'
|
||||
require 'stringsupport'
|
||||
require 'resolv'
|
||||
|
||||
class WikiController < ApplicationController
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
require 'chunks/chunk'
|
||||
require 'sanitize'
|
||||
require 'stringsupport'
|
||||
|
||||
# The category chunk looks for "category: news" on a line by
|
||||
# itself and parses the terms after the ':' as categories.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
$: << File.dirname(__FILE__) + "../../lib"
|
||||
|
||||
require_dependency 'chunks/chunk'
|
||||
require 'sanitize'
|
||||
require 'stringsupport'
|
||||
|
||||
|
||||
# The markup engines are Chunks that call the one of RedCloth
|
||||
|
|
532
lib/node.rb
Normal file
532
lib/node.rb
Normal file
|
@ -0,0 +1,532 @@
|
|||
require 'strscan'
|
||||
|
||||
module XHTML #:nodoc:
|
||||
|
||||
# A Hash of normalized search criteria, as consumed by Node#find and the
# various #match methods.
#
# Top-level keys are converted to symbols, attribute names to strings, and
# every nested criteria hash (:parent, :child, ... and :children => :only)
# is itself wrapped in a Conditions instance. Anything that is not a Hash
# is treated as a bare :content condition.
#
# Raises RuntimeError for a key that is not recognised.
class Conditions < Hash #:nodoc:
  def initialize(hash)
    super()
    hash = { :content => hash } unless Hash === hash
    hash = keys_to_symbols(hash)
    hash.each do |key, value|
      case key
      when :tag, :content
        # keys are valid, and require no further processing
      when :attributes
        hash[key] = keys_to_strings(value)
      when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
        hash[key] = Conditions.new(value)
      when :children
        hash[key] = normalize_children(value)
      else
        raise "illegal key #{key.inspect} => #{value.inspect}"
      end
    end
    update hash
  end

  private

  # Validates the options of a :children condition and returns them with
  # symbol keys and the :only criteria wrapped in a Conditions.
  def normalize_children(options)
    options = keys_to_symbols(options)
    options.each do |key, value|
      case key
      when :count, :greater_than, :less_than
        # keys are valid, and require no further processing
      when :only
        options[key] = Conditions.new(value)
      else
        raise "illegal key #{key.inspect} => #{value.inspect}"
      end
    end
    options
  end

  # Returns a copy of +hash+ whose keys have been converted with #to_s.
  def keys_to_strings(hash)
    hash.keys.inject({}) do |converted, key|
      converted[key.to_s] = hash[key]
      converted
    end
  end

  # Returns a copy of +hash+ whose keys have been converted with #to_sym.
  # Raises RuntimeError for a key that cannot be converted.
  def keys_to_symbols(hash)
    hash.keys.inject({}) do |converted, key|
      raise "illegal key #{key.inspect}" unless key.respond_to?(:to_sym)
      converted[key.to_sym] = hash[key]
      converted
    end
  end
end
|
||||
|
||||
# The base class of all nodes, textual and otherwise, in an HTML document.
class Node #:nodoc:
  # The array of children of this node. Not all nodes have children.
  attr_reader :children

  # The parent node of this node. All nodes have a parent, except for the
  # root node.
  attr_reader :parent

  # The line number of the input where this node was begun
  attr_reader :line

  # The byte position in the input where this node was begun
  attr_reader :position

  # Create a new node as a child of the given parent.
  def initialize(parent, line=0, pos=0)
    @parent = parent
    @children = []
    @line, @position = line, pos
  end

  # Return a textual representation of the node: the concatenation of the
  # textual representations of its children.
  def to_s
    @children.inject("") { |text, child| text << child.to_s }
  end

  # Return false (subclasses must override this to provide specific matching
  # behavior.) +conditions+ may be of any type.
  def match(conditions)
    false
  end

  # Search the children of this node for the first node for which #find
  # returns non +nil+. Returns the result of the #find call that succeeded,
  # or +nil+ if none did.
  def find(conditions)
    conditions = validate_conditions(conditions)
    @children.each do |child|
      node = child.find(conditions)
      return node if node
    end
    nil
  end

  # Search for all nodes that match the given conditions, and return them
  # as an array. The node itself is included if it matches.
  def find_all(conditions)
    conditions = validate_conditions(conditions)

    matches = []
    matches << self if match(conditions)
    @children.each do |child|
      matches.concat child.find_all(conditions)
    end
    matches
  end

  # Returns +false+. Subclasses may override this if they define a kind of
  # tag.
  def tag?
    false
  end

  # Returns +conditions+ unchanged if it already is a Conditions instance,
  # otherwise wraps it in one.
  def validate_conditions(conditions)
    Conditions === conditions ? conditions : Conditions.new(conditions)
  end

  # Two nodes are equal when they are of the same class and their children
  # are pairwise equal.
  def ==(node)
    return false unless self.class == node.class && children.size == node.children.size

    children.zip(node.children).all? { |mine, theirs| mine == theirs }
  end

  class <<self
    # Builds the node described by +content+, a single token of markup or
    # text: a Text for anything that does not look like a tag, a CDATA for a
    # CDATA section, and a Tag otherwise.
    #
    # With +strict+ set, malformed markup raises a RuntimeError; otherwise the
    # parser recovers as best it can (falling back to a Text node, or
    # discarding input up to the next ">").
    def parse(parent, line, pos, content, strict=true)
      return Text.new(parent, line, pos, content) if content !~ /^<\S/

      scanner = StringScanner.new(content)

      unless scanner.skip(/</)
        raise "expected <" if strict
        return Text.new(parent, line, pos, content)
      end

      if scanner.skip(/!\[CDATA\[/)
        unless scanner.skip_until(/\]\]>/)
          # Unterminated section: without this, pre_match below would be nil.
          raise "expected ]]> (got #{scanner.rest.inspect} for #{content})" if strict
          scanner.skip_until(/\z/)
        end
        # pre_match is everything before the terminator, counted from the
        # start of the string, so the opening marker has to be stripped.
        return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
      end

      closing = ( scanner.scan(/\//) ? :close : nil )
      return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:-]+/)

      attributes = nil
      unless closing
        scanner.skip(/\s*/)
        attributes = {}
        while attr = scanner.scan(/[-\w:]+/)
          # An attribute without "=value" is recorded as +true+.
          value = true
          if scanner.scan(/\s*=\s*/)
            if delim = scanner.scan(/['"]/)
              value = ""
              while text = scanner.scan(/[^#{delim}\\]+|./)
                case text
                when "\\" then
                  value << text
                  # getch is nil when the backslash is the last character
                  value << scanner.getch.to_s
                when delim
                  break
                else value << text
                end
              end
            else
              value = scanner.scan(/[^\s>\/]+/)
            end
          end
          attributes[attr] = value
          scanner.skip(/\s*/)
        end

        closing = ( scanner.scan(/\//) ? :self : nil )
      end

      unless scanner.scan(/\s*>/)
        if strict
          raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
        else
          # throw away all text until we find what we're looking for
          scanner.skip_until(/>/) or scanner.terminate
        end
      end

      Tag.new(parent, line, pos, name, attributes, closing)
    end
  end
end
|
||||
|
||||
# A node that represents text, rather than markup.
class Text < Node #:nodoc:

  attr_reader :content

  # Creates a new text node as a child of the given parent, with the given
  # content.
  def initialize(parent, line, pos, content)
    super(parent, line, pos)
    @content = content
  end

  # Returns the content of this node.
  def to_s
    @content
  end

  # Returns +self+ if this node meets the given conditions, and a false
  # value otherwise. Text nodes support conditions of the following kinds:
  #
  # * if +conditions+ is a string, it must be equal to the node's content
  # * if +conditions+ is a regular expression, it must match the node's
  #   content
  # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
  #   is either a string or a regexp, and which is interpreted as described
  #   above.
  def find(conditions)
    match(conditions) && self
  end

  # Returns a true value if this node meets the given conditions, or a false
  # value otherwise. See the discussion of #find for the valid conditions.
  def match(conditions)
    case conditions
    when String then @content == conditions
    when Regexp then @content =~ conditions
    when Hash
      conditions = validate_conditions(conditions)

      # Text nodes only have :content, :parent, :ancestor; any other key
      # can never be satisfied. Only :content is actually evaluated here.
      unsupported = conditions.keys - [:content, :parent, :ancestor]
      return false unless unsupported.empty?

      match(conditions[:content])
    else
      nil
    end
  end

  # Text nodes are equal when Node#== holds and their content is equal.
  def ==(node)
    return false unless super
    content == node.content
  end
end
|
||||
|
||||
# A CDATA node is simply a text node with a specialized way of displaying
# itself.
class CDATA < Text #:nodoc:
  # Returns the content wrapped in a CDATA section. The section is closed
  # with "]]>", the same terminator Node.parse scans for.
  def to_s
    "<![CDATA[#{super}]]>"
  end
end
|
||||
|
||||
# A Tag is any node that represents markup. It may be an opening tag, a
|
||||
# closing tag, or a self-closing tag. It has a name, and may have a hash of
|
||||
# attributes.
|
||||
class Tag < Node #:nodoc:
|
||||
|
||||
# Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
|
||||
attr_reader :closing
|
||||
|
||||
# Either +nil+, or a hash of attributes for this node.
|
||||
attr_reader :attributes
|
||||
|
||||
# The name of this tag.
|
||||
attr_reader :name
|
||||
|
||||
# Create a new tag node as a child of the given parent. The +name+,
# +attributes+ (a hash, or +nil+) and +closing+ status (+nil+,
# <tt>:close</tt> or <tt>:self</tt>) are stored as given; no parsing
# happens here.
def initialize(parent, line, pos, name, attributes, closing)
  super(parent, line, pos)
  @name, @attributes, @closing = name, attributes, closing
end
|
||||
|
||||
# Looks up the attribute named +attr+. Returns +nil+ when the node has no
# attributes at all.
def [](attr)
  return nil unless @attributes
  @attributes[attr]
end
|
||||
|
||||
# Returns a true value if this tag can NOT contain child nodes, i.e. its
# name is one of the empty HTML elements (img, br, hr, ...).
#
# When +xml+ is true, a tag that is neither closing nor self-closing
# (<tt>@closing</tt> is +nil+) is never considered childless, whatever its
# name.
def childless?(xml = false)
  return false if xml && @closing.nil?
  @name =~ /^(img|br|hr|link|meta|area|base|basefont|col|frame|input|isindex|param)$/
end
|
||||
|
||||
# Returns a textual representation of the node.
#
# * A closing tag renders as "</name>", or as the empty string when the
#   element is childless (its opening tag was already written as "<name/>").
# * Any other tag renders as "<name", its attributes in sorted order
#   (single-quoted; attributes whose value is not a String are written
#   without a value), a "/" for self-closing or childless elements, ">",
#   and then its children.
#
# A +nil+ attribute hash is treated as "no attributes".
def to_s
  if @closing == :close
    return self.childless? ? '' : "</#{@name}>"
  end

  s = "<#{@name}"
  (@attributes || {}).sort.each do |attr_name, attr_value|
    s << " #{attr_name}"
    s << "='#{attr_value}'" if String === attr_value
  end
  s << "/" if (@children.empty? && @closing == :self) || self.childless?
  s << ">"
  @children.each { |child| s << child.to_s }
  # Only reachable for a +closing+ value other than nil, :self or :close.
  s << "</#{@name}>" if @closing != :self && !@closing.nil? && !@children.empty?
  s
end
|
||||
|
||||
# Returns this node if it meets the given conditions itself; otherwise
# defers to the inherited search through the children, which yields the
# first matching node or +nil+. (See the description of the valid
# conditions in the +match+ method.)
def find(conditions)
  return self if match(conditions)
  super
end
|
||||
|
||||
# Returns +true+, indicating that this node represents a tag (as opposed
# to the +false+ returned by Node#tag?).
def tag?
  true
end
|
||||
|
||||
# Returns +true+ if the node meets all of the given conditions. The
# +conditions+ parameter must be a hash of any of the following keys
# (all are optional):
#
# * <tt>:tag</tt>: the node name must match the corresponding value
# * <tt>:attributes</tt>: a hash. The node's values must match the
#   corresponding values in the hash.
# * <tt>:parent</tt>: a hash. The node's parent must match the
#   corresponding hash. A node without a parent never matches.
# * <tt>:child</tt>: a hash. At least one of the node's immediate children
#   must meet the criteria described by the hash.
# * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
#   meet the criteria described by the hash.
# * <tt>:descendant</tt>: a hash. At least one of the node's descendants
#   must meet the criteria described by the hash.
# * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
#   meet the criteria described by the hash.
# * <tt>:after</tt>: a hash. The node must be after any sibling meeting
#   the criteria described by the hash, and at least one sibling must match.
# * <tt>:before</tt>: a hash. The node must be before any sibling meeting
#   the criteria described by the hash, and at least one sibling must match.
# * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
#   keys:
# ** <tt>:count</tt>: either a number or a range which must equal (or
#    include) the number of children that match.
# ** <tt>:less_than</tt>: the number of matching children must be less than
#    this number.
# ** <tt>:greater_than</tt>: the number of matching children must be
#    greater than this number.
# ** <tt>:only</tt>: another hash consisting of the keys to use
#    to match on the children, and only matching children will be
#    counted.
#
# Conditions are matched using the following algorithm:
#
# * if the condition is a string, it must be equal to the value.
# * if the condition is a regexp, it must match the value.
# * if the condition is a number, the value must match number.to_s.
# * if the condition is +true+, the value must not be +nil+.
# * if the condition is +false+ or +nil+, the value must be +nil+.
#
# Usage:
#
#   # test if the node is a "span" tag
#   node.match :tag => "span"
#
#   # test if the node's parent is a "div"
#   node.match :parent => { :tag => "div" }
#
#   # test if any of the node's ancestors are "table" tags
#   node.match :ancestor => { :tag => "table" }
#
#   # test if any of the node's immediate children are "em" tags
#   node.match :child => { :tag => "em" }
#
#   # test if any of the node's descendants are "strong" tags
#   node.match :descendant => { :tag => "strong" }
#
#   # test if the node has between 2 and 4 span tags as immediate children
#   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
#
#   # get funky: test to see if the node is a "div", has a "ul" ancestor
#   # and an "li" parent (with "class" = "enum"), and whether or not it has
#   # a "span" descendant that contains text matching /hello world/:
#   node.match :tag => "div",
#              :ancestor => { :tag => "ul" },
#              :parent => { :tag => "li",
#                           :attributes => { :class => "enum" } },
#              :descendant => { :tag => "span",
#                               :child => /hello world/ }
def match(conditions)
  conditions = validate_conditions(conditions)

  # check content of child nodes
  if conditions[:content]
    if children.empty?
      return false unless match_condition("", conditions[:content])
    else
      return false unless children.find { |child| child.match(conditions[:content]) }
    end
  end

  # test the name
  return false if conditions[:tag] && !match_condition(@name, conditions[:tag])

  # test attributes
  (conditions[:attributes] || {}).each do |key, value|
    return false unless match_condition(self[key], value)
  end

  # test parent (a parentless node cannot satisfy a :parent condition)
  if conditions[:parent]
    return false unless parent && parent.match(conditions[:parent])
  end

  # test children
  if conditions[:child]
    return false unless children.find { |child| child.match(conditions[:child]) }
  end

  # test ancestors: walk up until one matches or the root is passed
  if conditions[:ancestor]
    node = parent
    node = node.parent until node.nil? || node.match(conditions[:ancestor])
    return false if node.nil?
  end

  # test descendants
  if conditions[:descendant]
    found = children.find do |child|
      # test the child
      child.match(conditions[:descendant]) ||
      # test the child's descendants
      child.match(:descendant => conditions[:descendant])
    end
    return false unless found
  end

  # count children (only tags of this module count, not text nodes)
  if opts = conditions[:children]
    matches = children.select do |c|
      c.kind_of?(Tag) && (c.closing == :self || !c.childless?)
    end

    matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
    opts.each do |key, value|
      next if key == :only
      case key
      when :count
        if Integer === value
          return false if matches.length != value
        else
          return false unless value.include?(matches.length)
        end
      when :less_than
        return false unless matches.length < value
      when :greater_than
        return false unless matches.length > value
      else raise "unknown count condition #{key}"
      end
    end
  end

  # test siblings
  if conditions[:sibling] || conditions[:before] || conditions[:after]
    siblings = parent ? parent.children : []
    # nil when this node has no parent or is not among its parent's children
    self_index = siblings.index(self)

    if conditions[:sibling]
      return false unless siblings.detect { |s| s != self && s.match(conditions[:sibling]) }
    end

    if conditions[:before]
      return false if self_index.nil?
      following = siblings[self_index+1..-1]
      return false unless following.detect { |s| s != self && s.match(conditions[:before]) }
    end

    if conditions[:after]
      return false if self_index.nil?
      preceding = siblings[0,self_index]
      return false unless preceding.detect { |s| s != self && s.match(conditions[:after]) }
    end
  end

  true
end
|
||||
|
||||
# Tags are equal when Node#== holds and their closing status, name and
# attributes are all equal.
def ==(node)
  return false unless super && closing == node.closing && self.name == node.name
  attributes == node.attributes
end
|
||||
|
||||
private
|
||||
# Tests +value+ against a single +condition+:
#
# * String  - +value+ must be present and equal to it
# * Regexp  - +value+ must be present and match it
# * Numeric - +value+ must equal the number's string form
# * +true+  - +value+ must not be +nil+
# * +false+ or +nil+ - +value+ must be +nil+
#
# Any other kind of condition never matches.
def match_condition(value, condition)
  if condition.is_a?(String)
    value && value == condition
  elsif condition.is_a?(Regexp)
    value && value.match(condition)
  elsif condition.is_a?(Numeric)
    value == condition.to_s
  elsif true.equal?(condition)
    !value.nil?
  elsif false.equal?(condition) || nil.equal?(condition)
    value.nil?
  else
    false
  end
end
|
||||
end
|
||||
end
|
|
@ -158,7 +158,7 @@ class String
|
|||
#++
|
||||
|
||||
#:stopdoc:
|
||||
MATHML_ENTITIES = {
|
||||
MATHML_ENTITIES = {
|
||||
'Alpha' => 'Α',
|
||||
'Beta' => 'Β',
|
||||
'Epsilon' => 'Ε',
|
||||
|
@ -2279,7 +2279,7 @@ class String
|
|||
'wp' => '℘',
|
||||
'wr' => '≀',
|
||||
'zeetrf' => 'ℨ'
|
||||
}
|
||||
} unless const_defined? "MATHML_ENTITIES"
|
||||
#:startdoc:
|
||||
|
||||
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||
|
|
198
lib/sanitizer.rb
Normal file
198
lib/sanitizer.rb
Normal file
|
@ -0,0 +1,198 @@
|
|||
module Sanitizer
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
#
|
||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||
|
||||
require 'html/tokenizer'
|
||||
require 'node'
|
||||
require 'stringsupport'
|
||||
|
||||
acceptable_elements = %w[a abbr acronym address area b big blockquote br
|
||||
button caption center cite code col colgroup dd del dfn dir div dl dt
|
||||
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
|
||||
legend li map menu ol optgroup option p pre q s samp select small span
|
||||
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
|
||||
ul var]
|
||||
|
||||
mathml_elements = %w[annotation annotation-xml maction math merror mfrac
|
||||
mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot
|
||||
mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
|
||||
munderover none semantics]
|
||||
|
||||
svg_elements = %w[a animate animateColor animateMotion animateTransform
|
||||
circle defs desc ellipse font-face font-face-name font-face-src
|
||||
foreignObject g glyph hkern linearGradient line marker metadata
|
||||
missing-glyph mpath path polygon polyline radialGradient rect set
|
||||
stop svg switch text title tspan use]
|
||||
|
||||
acceptable_attributes = %w[abbr accept accept-charset accesskey action
|
||||
align alt axis border cellpadding cellspacing char charoff charset
|
||||
checked cite class clear cols colspan color compact coords datetime
|
||||
dir disabled enctype for frame headers height href hreflang hspace id
|
||||
ismap label lang longdesc maxlength media method multiple name nohref
|
||||
noshade nowrap prompt readonly rel rev rows rowspan rules scope
|
||||
selected shape size span src start style summary tabindex target title
|
||||
type usemap valign value vspace width xml:lang]
|
||||
|
||||
# MathML attributes that are allowed through (each name listed once).
mathml_attributes = %w[actiontype align close columnalign columnlines
  columnspacing columnspan depth display displaystyle encoding
  equalcolumns equalrows fence fontstyle fontweight frame height
  linethickness lspace mathbackground mathcolor mathvariant maxsize
  minsize open other rowalign rowlines rowspacing rowspan rspace
  scriptlevel selection separator separators stretchy width xlink:href
  xlink:show xlink:type xmlns xmlns:xlink]
|
||||
|
||||
svg_attributes = %w[accent-height accumulate additive alphabetic
|
||||
arabic-form ascent attributeName attributeType baseProfile bbox begin
|
||||
by calcMode cap-height class color color-rendering content cx cy d dx
|
||||
dy descent display dur end fill fill-rule font-family font-size
|
||||
font-stretch font-style font-variant font-weight from fx fy g1 g2
|
||||
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
|
||||
ideographic k keyPoints keySplines keyTimes lang marker-end
|
||||
marker-mid marker-start markerHeight markerUnits markerWidth
|
||||
mathematical max min name offset opacity orient origin
|
||||
overline-position overline-thickness panose-1 path pathLength points
|
||||
preserveAspectRatio r refX refY repeatCount repeatDur
|
||||
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
|
||||
stemv stop-color stop-opacity strikethrough-position
|
||||
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
|
||||
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
|
||||
stroke-width systemLanguage target text-anchor to transform type u1
|
||||
u2 underline-position underline-thickness unicode unicode-range
|
||||
units-per-em values version viewBox visibility width widths x
|
||||
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
|
||||
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
|
||||
xmlns:xlink y y1 y2 zoomAndPan]
|
||||
|
||||
attr_val_is_uri = %w[href src cite action longdesc xlink:href xml:base]
|
||||
|
||||
SVG_ATTR_VAL_ALLOWS_REF = %w[clip-path color-profile cursor fill
|
||||
filter marker marker-start marker-mid marker-end mask stroke]
|
||||
|
||||
SVG_ALLOW_LOCAL_HREF = %w[altGlyph animate animateColor animateMotion
|
||||
animateTransform cursor feImage filter linearGradient pattern
|
||||
radialGradient textpath tref set use]
|
||||
|
||||
acceptable_css_properties = %w[azimuth background-color
|
||||
border-bottom-color border-collapse border-color border-left-color
|
||||
border-right-color border-top-color clear color cursor direction
|
||||
display elevation float font font-family font-size font-style
|
||||
font-variant font-weight height letter-spacing line-height overflow
|
||||
pause pause-after pause-before pitch pitch-range richness speak
|
||||
speak-header speak-numeral speak-punctuation speech-rate stress
|
||||
text-align text-decoration text-indent unicode-bidi vertical-align
|
||||
voice-family volume white-space width]
|
||||
|
||||
acceptable_css_keywords = %w[auto aqua black block blue bold both bottom
|
||||
brown center collapse dashed dotted fuchsia gray green !important
|
||||
italic left lime maroon medium none navy normal nowrap olive pointer
|
||||
purple red right solid silver teal top transparent underline white
|
||||
yellow]
|
||||
|
||||
acceptable_svg_properties = %w[fill fill-opacity fill-rule stroke
|
||||
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
|
||||
|
||||
acceptable_protocols = %w[ed2k ftp http https irc mailto news gopher nntp
|
||||
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
|
||||
|
||||
# Elements that never have content. Guarded like the other constants so
# that the list can be predefined before this module is loaded, and so
# that loading the file twice does not warn about redefinition.
VOID_ELEMENTS = %w[img br hr link meta area base basefont
  col frame input isindex param] unless defined?(VOID_ELEMENTS)

# Each allow-list is only assigned when it has not been defined already,
# which lets an application customize it before this module is loaded.
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
#
#   sanitize_xhtml('<script> do_nasty_stuff() </script>')
#    => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
#   sanitize_xhtml('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
#
# Input without any "<" is returned untouched. Otherwise every token is
# re-serialized: text is entity-escaped, allowed tags are written back with
# cleaned attributes, and any other tag has its angle brackets escaped.
def sanitize_xhtml(html)
  return html unless html.index("<")

  tokenizer = HTML::Tokenizer.new(html.to_utf8)
  new_text = ""

  while token = tokenizer.next
    node = XHTML::Node.parse(nil, 0, 0, token, false)
    new_text << case node.tag?
      when true
        if ALLOWED_ELEMENTS.include?(node.name)
          if node.attributes
            attrs = node.attributes
            attrs.delete_if { |attr, v| !ALLOWED_ATTRIBUTES.include?(attr) }

            ATTR_VAL_IS_URI.each do |attr|
              # Strip backticks, whitespace and control characters
              # (U+0000-U+0020, U+007F-U+00A0) before looking for a scheme,
              # so that "java\nscript:" and friends are still recognised.
              codepoints = CGI.unescapeHTML(attrs[attr].to_s).unpack('U*')
              val_unescaped = codepoints.reject { |c|
                c <= 0x20 || c == 0x60 || (0x7f..0xa0).include?(c)
              }.pack('U*').downcase
              if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                attrs.delete attr
              end
            end

            # Only local ("#fragment") references may appear in url(...)
            SVG_ATTR_VAL_ALLOWS_REF.each do |attr|
              attrs[attr] = attrs[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attrs[attr]
            end

            href = attrs['xlink:href']
            if SVG_ALLOW_LOCAL_HREF.include?(node.name) && String === href && href =~ /^\s*[^#\s].*/m
              attrs.delete 'xlink:href'
            end

            attrs['style'] = sanitize_css(attrs['style']) if attrs['style']

            # Drop valueless attributes, then normalize the escaping of the
            # rest. Apostrophes are escaped as well, because Tag#to_s writes
            # values inside single quotes.
            attrs.delete_if { |attr, val| !(String === val) }
            attrs.keys.each do |attr|
              attrs[attr] = CGI.escapeHTML(attrs[attr].unescapeHTML).gsub("'", "&#39;")
            end
          end
          node.to_s
        else
          node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
        end
      else
        CGI.escapeHTML(node.to_s.unescapeHTML)
      end
  end

  new_text
end
|
||||
|
||||
# Returns a cleaned-up copy of the inline +style+ declaration list.
#
# url(...) values are removed first. The remaining text must then pass two
# whole-string checks (only harmless characters, and a
# "property: value;" shape); if either fails the result is the empty
# string. Of the declarations that remain, only those whose property is in
# ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES are kept, plus the
# background/border/margin/padding families when every keyword of the
# value is in ALLOWED_CSS_KEYWORDS or looks like a colour or a length.
#
# The checks are anchored with \A and \z rather than ^ and $: with the
# line anchors a multi-line value was accepted as soon as any one of its
# lines was clean, which let declarations such as
# "width: expression(...)" through.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop = prop.downcase
    if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      suspicious = val.split.any? do |keyword|
        !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless suspicious
    elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
|
||||
end
|
2271
lib/stringsupport.rb
Normal file
2271
lib/stringsupport.rb
Normal file
File diff suppressed because it is too large
Load diff
|
@ -7,8 +7,6 @@ require_dependency 'chunks/literal'
|
|||
require 'chunks/nowiki'
|
||||
require 'sanitize'
|
||||
|
||||
include Sanitize
|
||||
|
||||
# Wiki content is just a string that can process itself with a chain of
|
||||
# actions. The actions can modify wiki content so that certain parts of
|
||||
# it are protected from being rendered by later actions.
|
||||
|
@ -116,6 +114,7 @@ end
|
|||
class WikiContent < String
|
||||
|
||||
include ChunkManager
|
||||
include Sanitize
|
||||
|
||||
DEFAULT_OPTS = {
|
||||
:active_chunks => ACTIVE_CHUNKS,
|
||||
|
|
475
test/sanitizer.dat
Normal file
475
test/sanitizer.dat
Normal file
|
@ -0,0 +1,475 @@
|
|||
[
|
||||
{
|
||||
"name": "IE_Comments",
|
||||
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
|
||||
"output": "",
|
||||
"xhtml": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "IE_Comments_2",
|
||||
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
|
||||
"output": "<script>alert('XSS');</script>",
|
||||
"xhtml": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "allow_colons_in_path_component",
|
||||
"input": "<a href=\"./this:that\">foo</a>",
|
||||
"output": "<a href='./this:that'>foo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "background_attribute",
|
||||
"input": "<div background=\"javascript:alert('XSS')\"></div>",
|
||||
"output": "<div/>",
|
||||
"xhtml": "<div></div>",
|
||||
"rexml": "<div></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "bgsound",
|
||||
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<bgsound src=\"javascript:alert('XSS');\"/>",
|
||||
"xhtml": "<bgsound src='javascript:alert('XSS');'/>",
|
||||
"rexml": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "div_background_image_unicode_encoded",
|
||||
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
|
||||
"output": "<div style=''>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "div_expression",
|
||||
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
|
||||
"output": "<div style=''>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "double_open_angle_brackets",
|
||||
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
|
||||
"output": "<img src='http://ha.ckers.org/scriptlet.html'/>",
|
||||
"xhtml": "<img src='http:'/><",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "double_open_angle_brackets_2",
|
||||
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
|
||||
"output": "<script src=\"http://ha.ckers.org/scriptlet.html\" <=\"\">",
|
||||
"xhtml": "<script src='http:'/><",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "grave_accents",
|
||||
"input": "<img src=`javascript:alert('XSS')` />",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "img_dynsrc_lowsrc",
|
||||
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "img_vbscript",
|
||||
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "input_image",
|
||||
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<input type='image'/>",
|
||||
"rexml": "<input type='image' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "link_stylesheets",
|
||||
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
|
||||
"output": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"/>",
|
||||
"xhtml": "<link href='javascript:alert('XSS');' rel='stylesheet'/>",
|
||||
"rexml": "<link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "link_stylesheets_2",
|
||||
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
|
||||
"output": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"/>",
|
||||
"xhtml": "<link href='http://ha.ckers.org/xss.css' rel='stylesheet'/>",
|
||||
"rexml": "<link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "list_style_image",
|
||||
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
|
||||
"output": "<li style=''>foo</li>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "no_closing_script_tags",
|
||||
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
|
||||
"output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\">",
|
||||
"xhtml": "<script src='http:'/><b>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit",
|
||||
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"output": "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"xhtml": "<script/></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_2",
|
||||
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_3",
|
||||
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
|
||||
"output": "<img src='http://ha.ckers.org/xss.js'/>",
|
||||
"xhtml": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_II",
|
||||
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_III",
|
||||
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "platypus",
|
||||
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
|
||||
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "protocol_resolution_in_script_tag",
|
||||
"input": "<script src=//ha.ckers.org/.j></script>",
|
||||
"output": "<script src=\"//ha.ckers.org/.j\"></script>",
|
||||
"xhtml": "<script src/></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_anchors",
|
||||
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
|
||||
"output": "<a href='foo'><script>baz</script></a>",
|
||||
"xhtml": "<a href='foo'><script>baz</script></a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_alt_attribute",
|
||||
"input": "<img alt='foo' onclick='bar' />",
|
||||
"output": "<img alt='foo'/>",
|
||||
"rexml": "<img alt='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_height_attribute",
|
||||
"input": "<img height='foo' onclick='bar' />",
|
||||
"output": "<img height='foo'/>",
|
||||
"rexml": "<img height='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_src_attribute",
|
||||
"input": "<img src='foo' onclick='bar' />",
|
||||
"output": "<img src='foo'/>",
|
||||
"rexml": "<img src='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_width_attribute",
|
||||
"input": "<img width='foo' onclick='bar' />",
|
||||
"output": "<img width='foo'/>",
|
||||
"rexml": "<img width='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_blank_text",
|
||||
"input": "",
|
||||
"output": ""
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_malformed_image_tags",
|
||||
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
|
||||
"output": "<img/><script>alert(\"XSS\")</script>\">",
|
||||
"xhtml": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_non_html",
|
||||
"input": "abc",
|
||||
"output": "abc"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_ridiculous_hack",
|
||||
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_0",
|
||||
"input": "<img src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_1",
|
||||
"input": "<img src=javascript:alert('XSS') />",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_10",
|
||||
"input": "<img src=\"jav
ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_11",
|
||||
"input": "<img src=\"jav
ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_12",
|
||||
"input": "<img src=\"  javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_13",
|
||||
"input": "<img src=\" javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_14",
|
||||
"input": "<img src=\" javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_2",
|
||||
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_3",
|
||||
"input": "<img src='javascript:alert("XSS")' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_4",
|
||||
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_5",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_6",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_7",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_8",
|
||||
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_9",
|
||||
"input": "<img src=\"jav	ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_half_open_scripts",
|
||||
"input": "<img src=\"javascript:alert('XSS')\"",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_invalid_script_tag",
|
||||
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"output": "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"xhtml": "<script/></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
|
||||
"input": "<<script>alert(\"XSS\");//<</script>",
|
||||
"output": "<<script>alert(\"XSS\");//<</script>",
|
||||
"xhtml": "<<script>alert("XSS");//<</script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
|
||||
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
|
||||
"output": "<iframe src=\"http://ha.ckers.org/scriptlet.html\" <=\"\">",
|
||||
"xhtml": "<iframe src='http:'/><",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_tag_broken_up_by_null",
|
||||
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
|
||||
"output": "<scr\ufffdipt>alert(\"XSS\")</scr\ufffdipt>",
|
||||
"xhtml": "<scr>alert("XSS")</scr>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_unclosed_script",
|
||||
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
|
||||
"output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\">",
|
||||
"xhtml": "<script src='http:'/><b>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
|
||||
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
|
||||
"output": "<a title='1'>boo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
|
||||
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
|
||||
"output": "<a title='1'>boo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
|
||||
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
|
||||
"output": "<img title='1'/>boo",
|
||||
"rexml": "<img title='1' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
|
||||
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
|
||||
"output": "<img title='1'/>boo",
|
||||
"rexml": "<img title='1' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "xml_base",
|
||||
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
|
||||
"output": "<div>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "xul",
|
||||
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
|
||||
"output": "<p style=''>fubar</p>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "quotes_in_attributes",
|
||||
"input": "<img src='foo' title='\"foo\" bar' />",
|
||||
"rexml": "<img src='foo' title='\"foo\" bar' />",
|
||||
"output": "<img src='foo' title='"foo" bar'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "uri_refs_in_svg_attributes",
|
||||
"input": "<rect fill='url(#foo)' />",
|
||||
"rexml": "<rect fill='url(#foo)'></rect>",
|
||||
"xhtml": "<rect fill='url(#foo)'/>",
|
||||
"output": "<rect fill='url(#foo)'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "absolute_uri_refs_in_svg_attributes",
|
||||
"input": "<rect fill='url(http://bad.com/) #fff' />",
|
||||
"rexml": "<rect fill=' #fff'></rect>",
|
||||
"xhtml": "<rect fill=' #fff'/>",
|
||||
"output": "<rect fill=' #fff'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "uri_ref_with_space_in svg_attribute",
|
||||
"input": "<rect fill='url(\n#foo)' />",
|
||||
"rexml": "<rect fill='url(\n#foo)'></rect>",
|
||||
"xhtml": "<rect fill='url(\n#foo)'/>",
|
||||
"output": "<rect fill='url(\n#foo)'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "absolute_uri_ref_with_space_in svg_attribute",
|
||||
"input": "<rect fill=\"url(\nhttp://bad.com/)\" />",
|
||||
"rexml": "<rect fill=' '></rect>",
|
||||
"xhtml": "<rect fill=' '/>",
|
||||
"output": "<rect fill=' '/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "allow_html5_image_tag",
|
||||
"input": "<image src='foo' />",
|
||||
"rexml": "<image src=\"foo\"></image>",
|
||||
"xhtml": "<image src='foo'/>",
|
||||
"output": "<image src=\"foo\"/>"
|
||||
}
|
||||
|
||||
]
|
|
@ -12,4 +12,16 @@ class NoWikiTest < Test::Unit::TestCase
|
|||
)
|
||||
end
|
||||
|
||||
# Markdown-style emphasis markers inside <nowiki> are expected to be reported
# verbatim as the chunk's :plain_text.
def test_markdown_nowiki
  markup = 'This sentence contains <nowiki>*raw text*</nowiki>. Do not touch!'
  plain  = '*raw text*'
  match(NoWiki, markup, :plain_text => plain)
end
|
||||
|
||||
# The NoWiki chunk itself is expected to leave markup (links, script tags,
# wiki syntax) untouched in :plain_text.
def test_no_sanitize_nowiki
  markup = 'This sentence contains <nowiki>[[test]]&<a href="a&b">shebang</a> <script>alert("xss!");</script> *foo*</nowiki>. Do not touch!'
  plain  = '[[test]]&<a href="a&b">shebang</a> <script>alert("xss!");</script> *foo*'
  match(NoWiki, markup, :plain_text => plain)
end
|
||||
|
||||
end
|
||||
|
|
|
@ -49,7 +49,7 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
"<annotation-xml encoding='SVG1.1'><svg/></annotation-xml></semantics></math> in kinda " +
|
||||
"<a class='existingWikiWord' href='../show/ThatWay'>That Way</a> in " +
|
||||
"<span class='newWikiWord'>His Way<a href='../show/HisWay'>?</a></span> " +
|
||||
%{though <a class='existingWikiWord' href='../show/MyWay'>My Way</a> OverThere \xE2\x80\x93 see } +
|
||||
%{though <a class='existingWikiWord' href='../show/MyWay'>My Way</a> OverThere \342\200\223 see } +
|
||||
"<a class='existingWikiWord' href='../show/SmartEngine'>Smart Engine</a> in that " +
|
||||
"<span class='newWikiWord'>Smart Engine GUI" +
|
||||
"<a href='../show/SmartEngineGUI'>?</a></span></p>",
|
||||
|
@ -86,7 +86,7 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
%{(</mo><mi>x</mi><mo stretchy='false'>)</mo><semantics><annotation-xml encoding='SVG1.1'>} +
|
||||
%{<svg/></annotation-xml></semantics></math><div class='maruku-eq-tex'><code style='display: none;'>} +
|
||||
%{\\sin(x) \\begin{svg}<svg></svg>\\end{svg}</code></div></div>},
|
||||
"$$\\sin(x) \\begin{svg}<svg/>\\end{svg}$$")
|
||||
"$$\\sin(x) \\begin{svg}<svg></svg>\\end{svg}$$")
|
||||
|
||||
code_block = [
|
||||
'This is a code block:',
|
||||
|
@ -275,6 +275,13 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
'or <nowiki>http://www.thislink.com</nowiki>.')
|
||||
end
|
||||
|
||||
# Mis-nested tags inside <nowiki> are expected to render as well-formed markup.
def test_malformed_nowiki
  expected = '<p><i><b/></i></p>'
  markup   = '<nowiki><i><b></i></b></nowiki> '
  assert_markup_parsed_as(expected, markup)
end
|
||||
|
||||
|
||||
def test_multiline_nowiki_tag
|
||||
assert_markup_parsed_as(
|
||||
"<p>Do not mark \n up [[this text]] \nand http://this.url.com but markup " +
|
||||
|
@ -283,6 +290,13 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
"and http://this.url.com </nowiki> but markup [[this]]")
|
||||
end
|
||||
|
||||
# Emphasis markers and bare URLs wrapped in <nowiki> are expected to come
# through rendering as literal text.
def test_markdown_nowiki_tag
  expected = '<p>Do not mark up *this text* or http://www.thislink.com.</p>'
  markup   = 'Do not mark up <nowiki>*this text*</nowiki> ' +
             'or <nowiki>http://www.thislink.com</nowiki>.'
  assert_markup_parsed_as(expected, markup)
end
|
||||
|
||||
def test_sanitize_nowiki_tag
|
||||
assert_markup_parsed_as(
|
||||
'<p>[[test]]&<a href=\'a&b\'>shebang</a> <script>alert("xss!");</script> *foo*</p>',
|
||||
|
@ -311,7 +325,7 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
"<annotation-xml encoding='SVG1.1'><svg/></annotation-xml></semantics></math> in kinda " +
|
||||
"<a class='existingWikiWord' href='ThatWay.html'>That Way</a> in " +
|
||||
"<span class='newWikiWord'>His Way</span> though " +
|
||||
%{<a class='existingWikiWord' href='MyWay.html'>My Way</a> OverThere \xE2\x80\x93 see } +
|
||||
%{<a class='existingWikiWord' href='MyWay.html'>My Way</a> OverThere \342\200\223 see } +
|
||||
"<a class='existingWikiWord' href='SmartEngine.html'>Smart Engine</a> in that " +
|
||||
"<span class='newWikiWord'>Smart Engine GUI</span></p>",
|
||||
test_renderer(@revision).display_content_for_export
|
||||
|
|
|
@ -5,6 +5,8 @@ require 'sanitize'
|
|||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
|
||||
include Sanitize
|
||||
|
||||
def setup
|
||||
|
||||
end
|
||||
|
|
142
test/unit/sanitizer_test.rb
Normal file
142
test/unit/sanitizer_test.rb
Normal file
|
@ -0,0 +1,142 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
|
||||
require 'sanitizer'
|
||||
require 'json'
|
||||
require 'stringsupport'
|
||||
|
||||
# Unit tests for the Sanitizer module.
#
# Most test methods are generated at class-load time: one set per entry in
# the Sanitizer whitelists (elements, attributes, protocols, SVG elements
# allowing local hrefs) and one per record in test/sanitizer.dat.
class SanitizerTest < Test::Unit::TestCase

  include Sanitizer

  def setup

  end

  # Runs the sanitizer on +stream+ after converting it with +to_utf8+.
  def do_sanitize_xhtml(stream)
    sanitize_xhtml(stream.to_utf8)
  end

  # Asserts that sanitizing +input+ yields +xhtmloutput+.
  #
  # +htmloutput+ and +rexmloutput+ are accepted so the generated tests can
  # pass all their expectations, but they are not checked here.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal xhtmloutput, do_sanitize_xhtml(input)
  end

  Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput  = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      xhtmloutput = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      check_sanitization(input, output, xhtmloutput, output)
    end
  end

  Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    # 'style' values are rewritten by the CSS sanitizer, so skip it here.
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  Sanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
    next unless Sanitizer::ALLOWED_ELEMENTS.include?(tag_name)
    define_method "test_#{tag_name}_should_allow_local_href" do
      input = %(<#{tag_name} xlink:href="#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='#foo'/>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\n#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'/>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}/>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}/>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end
  end

  def test_should_handle_astral_plane_characters
    input = "<p>𝒵 𝔸</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # One test per fixture record. File.read closes the file once it has been
  # read; the previous open(...).read left the handle open.
  # 'xhtml' and 'rexml' expectations fall back to 'output' when absent.
  fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '/../sanitizer.dat'))
  JSON::parse(File.read(fixture_path)).each do |test|
    define_method "test_#{test['name']}" do
      check_sanitization(
        test['input'],
        test['output'],
        test['xhtml'] || test['output'],
        test['rexml'] || test['output']
      )
    end
  end
end
|
|
@ -4,7 +4,7 @@ module MaRuKu; module Out; module HTML
|
|||
def convert_to_mathml_itex2mml(kind, tex)
|
||||
begin
|
||||
if not $itex2mml_parser
|
||||
require 'sanitize'
|
||||
require 'stringsupport'
|
||||
require 'itextomml'
|
||||
$itex2mml_parser = Itex2MML::Parser.new
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue