Sanitization now preserves case-sensitive element and attribute names (necessary to support SVG).
Unit tests, galore.
This commit is contained in:
parent
2fa1e08c96
commit
e179508377
530
lib/node.rb
Normal file
530
lib/node.rb
Normal file
|
@ -0,0 +1,530 @@
|
||||||
|
require 'strscan'
|
||||||
|
|
||||||
|
module XHTML #:nodoc:
|
||||||
|
|
||||||
|
class Conditions < Hash #:nodoc:
|
||||||
|
def initialize(hash)
|
||||||
|
super()
|
||||||
|
hash = { :content => hash } unless Hash === hash
|
||||||
|
hash = keys_to_symbols(hash)
|
||||||
|
hash.each do |k,v|
|
||||||
|
case k
|
||||||
|
when :tag, :content then
|
||||||
|
# keys are valid, and require no further processing
|
||||||
|
when :attributes then
|
||||||
|
hash[k] = keys_to_strings(v)
|
||||||
|
when :parent, :child, :ancestor, :descendant, :sibling, :before,
|
||||||
|
:after
|
||||||
|
hash[k] = Conditions.new(v)
|
||||||
|
when :children
|
||||||
|
hash[k] = v = keys_to_symbols(v)
|
||||||
|
v.each do |k,v2|
|
||||||
|
case k
|
||||||
|
when :count, :greater_than, :less_than
|
||||||
|
# keys are valid, and require no further processing
|
||||||
|
when :only
|
||||||
|
v[k] = Conditions.new(v2)
|
||||||
|
else
|
||||||
|
raise "illegal key #{k.inspect} => #{v2.inspect}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
else
|
||||||
|
raise "illegal key #{k.inspect} => #{v.inspect}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
update hash
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def keys_to_strings(hash)
|
||||||
|
hash.keys.inject({}) do |h,k|
|
||||||
|
h[k.to_s] = hash[k]
|
||||||
|
h
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def keys_to_symbols(hash)
|
||||||
|
hash.keys.inject({}) do |h,k|
|
||||||
|
raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym)
|
||||||
|
h[k.to_sym] = hash[k]
|
||||||
|
h
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# The base class of all nodes, textual and otherwise, in an HTML document.
|
||||||
|
class Node #:nodoc:
|
||||||
|
# The array of children of this node. Not all nodes have children.
|
||||||
|
attr_reader :children
|
||||||
|
|
||||||
|
# The parent node of this node. All nodes have a parent, except for the
|
||||||
|
# root node.
|
||||||
|
attr_reader :parent
|
||||||
|
|
||||||
|
# The line number of the input where this node was begun
|
||||||
|
attr_reader :line
|
||||||
|
|
||||||
|
# The byte position in the input where this node was begun
|
||||||
|
attr_reader :position
|
||||||
|
|
||||||
|
# Create a new node as a child of the given parent.
|
||||||
|
def initialize(parent, line=0, pos=0)
|
||||||
|
@parent = parent
|
||||||
|
@children = []
|
||||||
|
@line, @position = line, pos
|
||||||
|
end
|
||||||
|
|
||||||
|
# Return a textual representation of the node.
|
||||||
|
def to_s
|
||||||
|
s = ""
|
||||||
|
@children.each { |child| s << child.to_s }
|
||||||
|
s
|
||||||
|
end
|
||||||
|
|
||||||
|
# Return false (subclasses must override this to provide specific matching
|
||||||
|
# behavior.) +conditions+ may be of any type.
|
||||||
|
def match(conditions)
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
|
# Search the children of this node for the first node for which #find
|
||||||
|
# returns non +nil+. Returns the result of the #find call that succeeded.
|
||||||
|
def find(conditions)
|
||||||
|
conditions = validate_conditions(conditions)
|
||||||
|
@children.each do |child|
|
||||||
|
node = child.find(conditions)
|
||||||
|
return node if node
|
||||||
|
end
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
|
# Search for all nodes that match the given conditions, and return them
|
||||||
|
# as an array.
|
||||||
|
def find_all(conditions)
|
||||||
|
conditions = validate_conditions(conditions)
|
||||||
|
|
||||||
|
matches = []
|
||||||
|
matches << self if match(conditions)
|
||||||
|
@children.each do |child|
|
||||||
|
matches.concat child.find_all(conditions)
|
||||||
|
end
|
||||||
|
matches
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns +false+. Subclasses may override this if they define a kind of
|
||||||
|
# tag.
|
||||||
|
def tag?
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
|
def validate_conditions(conditions)
|
||||||
|
Conditions === conditions ? conditions : Conditions.new(conditions)
|
||||||
|
end
|
||||||
|
|
||||||
|
def ==(node)
|
||||||
|
return false unless self.class == node.class && children.size == node.children.size
|
||||||
|
|
||||||
|
equivalent = true
|
||||||
|
|
||||||
|
children.size.times do |i|
|
||||||
|
equivalent &&= children[i] == node.children[i]
|
||||||
|
end
|
||||||
|
|
||||||
|
equivalent
|
||||||
|
end
|
||||||
|
|
||||||
|
class <<self
|
||||||
|
def parse(parent, line, pos, content, strict=true)
|
||||||
|
if content !~ /^<\S/
|
||||||
|
Text.new(parent, line, pos, content)
|
||||||
|
else
|
||||||
|
scanner = StringScanner.new(content)
|
||||||
|
|
||||||
|
unless scanner.skip(/</)
|
||||||
|
if strict
|
||||||
|
raise "expected <"
|
||||||
|
else
|
||||||
|
return Text.new(parent, line, pos, content)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if scanner.skip(/!\[CDATA\[/)
|
||||||
|
scanner.scan_until(/\]\]>/)
|
||||||
|
return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
|
||||||
|
end
|
||||||
|
|
||||||
|
closing = ( scanner.scan(/\//) ? :close : nil )
|
||||||
|
return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:-]+/)
|
||||||
|
name
|
||||||
|
|
||||||
|
unless closing
|
||||||
|
scanner.skip(/\s*/)
|
||||||
|
attributes = {}
|
||||||
|
while attr = scanner.scan(/[-\w:]+/)
|
||||||
|
value = true
|
||||||
|
if scanner.scan(/\s*=\s*/)
|
||||||
|
if delim = scanner.scan(/['"]/)
|
||||||
|
value = ""
|
||||||
|
while text = scanner.scan(/[^#{delim}\\]+|./)
|
||||||
|
case text
|
||||||
|
when "\\" then
|
||||||
|
value << text
|
||||||
|
value << scanner.getch
|
||||||
|
when delim
|
||||||
|
break
|
||||||
|
else value << text
|
||||||
|
end
|
||||||
|
end
|
||||||
|
else
|
||||||
|
value = scanner.scan(/[^\s>\/]+/)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
attributes[attr] = value
|
||||||
|
scanner.skip(/\s*/)
|
||||||
|
end
|
||||||
|
|
||||||
|
closing = ( scanner.scan(/\//) ? :self : nil )
|
||||||
|
end
|
||||||
|
|
||||||
|
unless scanner.scan(/\s*>/)
|
||||||
|
if strict
|
||||||
|
raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
|
||||||
|
else
|
||||||
|
# throw away all text until we find what we're looking for
|
||||||
|
scanner.skip_until(/>/) or scanner.terminate
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
Tag.new(parent, line, pos, name, attributes, closing)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# A node that represents text, rather than markup.
|
||||||
|
class Text < Node #:nodoc:
|
||||||
|
|
||||||
|
attr_reader :content
|
||||||
|
|
||||||
|
# Creates a new text node as a child of the given parent, with the given
|
||||||
|
# content.
|
||||||
|
def initialize(parent, line, pos, content)
|
||||||
|
super(parent, line, pos)
|
||||||
|
@content = content
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns the content of this node.
|
||||||
|
def to_s
|
||||||
|
@content
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns +self+ if this node meets the given conditions. Text nodes support
|
||||||
|
# conditions of the following kinds:
|
||||||
|
#
|
||||||
|
# * if +conditions+ is a string, it must be a substring of the node's
|
||||||
|
# content
|
||||||
|
# * if +conditions+ is a regular expression, it must match the node's
|
||||||
|
# content
|
||||||
|
# * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
|
||||||
|
# is either a string or a regexp, and which is interpreted as described
|
||||||
|
# above.
|
||||||
|
def find(conditions)
|
||||||
|
match(conditions) && self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns non-+nil+ if this node meets the given conditions, or +nil+
|
||||||
|
# otherwise. See the discussion of #find for the valid conditions.
|
||||||
|
def match(conditions)
|
||||||
|
case conditions
|
||||||
|
when String
|
||||||
|
@content == conditions
|
||||||
|
when Regexp
|
||||||
|
@content =~ conditions
|
||||||
|
when Hash
|
||||||
|
conditions = validate_conditions(conditions)
|
||||||
|
|
||||||
|
# Text nodes only have :content, :parent, :ancestor
|
||||||
|
unless (conditions.keys - [:content, :parent, :ancestor]).empty?
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
|
||||||
|
match(conditions[:content])
|
||||||
|
else
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def ==(node)
|
||||||
|
return false unless super
|
||||||
|
content == node.content
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# A CDATA node is simply a text node with a specialized way of displaying
|
||||||
|
# itself.
|
||||||
|
class CDATA < Text #:nodoc:
|
||||||
|
def to_s
|
||||||
|
"<![CDATA[#{super}]>"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# A Tag is any node that represents markup. It may be an opening tag, a
|
||||||
|
# closing tag, or a self-closing tag. It has a name, and may have a hash of
|
||||||
|
# attributes.
|
||||||
|
class Tag < Node #:nodoc:
|
||||||
|
|
||||||
|
# Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
|
||||||
|
attr_reader :closing
|
||||||
|
|
||||||
|
# Either +nil+, or a hash of attributes for this node.
|
||||||
|
attr_reader :attributes
|
||||||
|
|
||||||
|
# The name of this tag.
|
||||||
|
attr_reader :name
|
||||||
|
|
||||||
|
# Create a new node as a child of the given parent, using the given content
|
||||||
|
# to describe the node. It will be parsed and the node name, attributes and
|
||||||
|
# closing status extracted.
|
||||||
|
def initialize(parent, line, pos, name, attributes, closing)
|
||||||
|
super(parent, line, pos)
|
||||||
|
@name = name
|
||||||
|
@attributes = attributes
|
||||||
|
@closing = closing
|
||||||
|
end
|
||||||
|
|
||||||
|
# A convenience for obtaining an attribute of the node. Returns +nil+ if
|
||||||
|
# the node has no attributes.
|
||||||
|
def [](attr)
|
||||||
|
@attributes ? @attributes[attr] : nil
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns non-+nil+ if this tag can contain child nodes.
|
||||||
|
def childless?(xml = false)
|
||||||
|
return false if xml && @closing.nil?
|
||||||
|
!@closing.nil? ||
|
||||||
|
@name =~ /^(img|br|hr|link|meta|area|base|basefont|
|
||||||
|
col|frame|input|isindex|param)$/ox
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns a textual representation of the node
|
||||||
|
def to_s
|
||||||
|
if @closing == :close
|
||||||
|
"</#{@name}>"
|
||||||
|
else
|
||||||
|
s = "<#{@name}"
|
||||||
|
@attributes.each do |k,v|
|
||||||
|
s << " #{k}"
|
||||||
|
s << "=\"#{v}\"" if String === v
|
||||||
|
end
|
||||||
|
s << " /" if @closing == :self
|
||||||
|
s << ">"
|
||||||
|
@children.each { |child| s << child.to_s }
|
||||||
|
s << "</#{@name}>" if @closing != :self && !@children.empty?
|
||||||
|
s
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# If either the node or any of its children meet the given conditions, the
|
||||||
|
# matching node is returned. Otherwise, +nil+ is returned. (See the
|
||||||
|
# description of the valid conditions in the +match+ method.)
|
||||||
|
def find(conditions)
|
||||||
|
match(conditions) && self || super
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns +true+, indicating that this node represents an HTML tag.
|
||||||
|
def tag?
|
||||||
|
true
|
||||||
|
end
|
||||||
|
|
||||||
|
# Returns +true+ if the node meets any of the given conditions. The
|
||||||
|
# +conditions+ parameter must be a hash of any of the following keys
|
||||||
|
# (all are optional):
|
||||||
|
#
|
||||||
|
# * <tt>:tag</tt>: the node name must match the corresponding value
|
||||||
|
# * <tt>:attributes</tt>: a hash. The node's values must match the
|
||||||
|
# corresponding values in the hash.
|
||||||
|
# * <tt>:parent</tt>: a hash. The node's parent must match the
|
||||||
|
# corresponding hash.
|
||||||
|
# * <tt>:child</tt>: a hash. At least one of the node's immediate children
|
||||||
|
# must meet the criteria described by the hash.
|
||||||
|
# * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
|
||||||
|
# meet the criteria described by the hash.
|
||||||
|
# * <tt>:descendant</tt>: a hash. At least one of the node's descendants
|
||||||
|
# must meet the criteria described by the hash.
|
||||||
|
# * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
|
||||||
|
# meet the criteria described by the hash.
|
||||||
|
# * <tt>:after</tt>: a hash. The node must be after any sibling meeting
|
||||||
|
# the criteria described by the hash, and at least one sibling must match.
|
||||||
|
# * <tt>:before</tt>: a hash. The node must be before any sibling meeting
|
||||||
|
# the criteria described by the hash, and at least one sibling must match.
|
||||||
|
# * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
|
||||||
|
# keys:
|
||||||
|
# ** <tt>:count</tt>: either a number or a range which must equal (or
|
||||||
|
# include) the number of children that match.
|
||||||
|
# ** <tt>:less_than</tt>: the number of matching children must be less than
|
||||||
|
# this number.
|
||||||
|
# ** <tt>:greater_than</tt>: the number of matching children must be
|
||||||
|
# greater than this number.
|
||||||
|
# ** <tt>:only</tt>: another hash consisting of the keys to use
|
||||||
|
# to match on the children, and only matching children will be
|
||||||
|
# counted.
|
||||||
|
#
|
||||||
|
# Conditions are matched using the following algorithm:
|
||||||
|
#
|
||||||
|
# * if the condition is a string, it must be a substring of the value.
|
||||||
|
# * if the condition is a regexp, it must match the value.
|
||||||
|
# * if the condition is a number, the value must match number.to_s.
|
||||||
|
# * if the condition is +true+, the value must not be +nil+.
|
||||||
|
# * if the condition is +false+ or +nil+, the value must be +nil+.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
#
|
||||||
|
# # test if the node is a "span" tag
|
||||||
|
# node.match :tag => "span"
|
||||||
|
#
|
||||||
|
# # test if the node's parent is a "div"
|
||||||
|
# node.match :parent => { :tag => "div" }
|
||||||
|
#
|
||||||
|
# # test if any of the node's ancestors are "table" tags
|
||||||
|
# node.match :ancestor => { :tag => "table" }
|
||||||
|
#
|
||||||
|
# # test if any of the node's immediate children are "em" tags
|
||||||
|
# node.match :child => { :tag => "em" }
|
||||||
|
#
|
||||||
|
# # test if any of the node's descendants are "strong" tags
|
||||||
|
# node.match :descendant => { :tag => "strong" }
|
||||||
|
#
|
||||||
|
# # test if the node has between 2 and 4 span tags as immediate children
|
||||||
|
# node.match :children => { :count => 2..4, :only => { :tag => "span" } }
|
||||||
|
#
|
||||||
|
# # get funky: test to see if the node is a "div", has a "ul" ancestor
|
||||||
|
# # and an "li" parent (with "class" = "enum"), and whether or not it has
|
||||||
|
# # a "span" descendant that contains # text matching /hello world/:
|
||||||
|
# node.match :tag => "div",
|
||||||
|
# :ancestor => { :tag => "ul" },
|
||||||
|
# :parent => { :tag => "li",
|
||||||
|
# :attributes => { :class => "enum" } },
|
||||||
|
# :descendant => { :tag => "span",
|
||||||
|
# :child => /hello world/ }
|
||||||
|
def match(conditions)
|
||||||
|
conditions = validate_conditions(conditions)
|
||||||
|
# check content of child nodes
|
||||||
|
if conditions[:content]
|
||||||
|
if children.empty?
|
||||||
|
return false unless match_condition("", conditions[:content])
|
||||||
|
else
|
||||||
|
return false unless children.find { |child| child.match(conditions[:content]) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# test the name
|
||||||
|
return false unless match_condition(@name, conditions[:tag]) if conditions[:tag]
|
||||||
|
|
||||||
|
# test attributes
|
||||||
|
(conditions[:attributes] || {}).each do |key, value|
|
||||||
|
return false unless match_condition(self[key], value)
|
||||||
|
end
|
||||||
|
|
||||||
|
# test parent
|
||||||
|
return false unless parent.match(conditions[:parent]) if conditions[:parent]
|
||||||
|
|
||||||
|
# test children
|
||||||
|
return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child]
|
||||||
|
|
||||||
|
# test ancestors
|
||||||
|
if conditions[:ancestor]
|
||||||
|
return false unless catch :found do
|
||||||
|
p = self
|
||||||
|
throw :found, true if p.match(conditions[:ancestor]) while p = p.parent
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# test descendants
|
||||||
|
if conditions[:descendant]
|
||||||
|
return false unless children.find do |child|
|
||||||
|
# test the child
|
||||||
|
child.match(conditions[:descendant]) ||
|
||||||
|
# test the child's descendants
|
||||||
|
child.match(:descendant => conditions[:descendant])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# count children
|
||||||
|
if opts = conditions[:children]
|
||||||
|
matches = children.select do |c|
|
||||||
|
(c.kind_of?(HTML::Tag) and (c.closing == :self or ! c.childless?))
|
||||||
|
end
|
||||||
|
|
||||||
|
matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
|
||||||
|
opts.each do |key, value|
|
||||||
|
next if key == :only
|
||||||
|
case key
|
||||||
|
when :count
|
||||||
|
if Integer === value
|
||||||
|
return false if matches.length != value
|
||||||
|
else
|
||||||
|
return false unless value.include?(matches.length)
|
||||||
|
end
|
||||||
|
when :less_than
|
||||||
|
return false unless matches.length < value
|
||||||
|
when :greater_than
|
||||||
|
return false unless matches.length > value
|
||||||
|
else raise "unknown count condition #{key}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# test siblings
|
||||||
|
if conditions[:sibling] || conditions[:before] || conditions[:after]
|
||||||
|
siblings = parent ? parent.children : []
|
||||||
|
self_index = siblings.index(self)
|
||||||
|
|
||||||
|
if conditions[:sibling]
|
||||||
|
return false unless siblings.detect do |s|
|
||||||
|
s != self && s.match(conditions[:sibling])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if conditions[:before]
|
||||||
|
return false unless siblings[self_index+1..-1].detect do |s|
|
||||||
|
s != self && s.match(conditions[:before])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if conditions[:after]
|
||||||
|
return false unless siblings[0,self_index].detect do |s|
|
||||||
|
s != self && s.match(conditions[:after])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
true
|
||||||
|
end
|
||||||
|
|
||||||
|
def ==(node)
|
||||||
|
return false unless super
|
||||||
|
return false unless closing == node.closing && self.name == node.name
|
||||||
|
attributes == node.attributes
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
# Match the given value to the given condition.
|
||||||
|
def match_condition(value, condition)
|
||||||
|
case condition
|
||||||
|
when String
|
||||||
|
value && value == condition
|
||||||
|
when Regexp
|
||||||
|
value && value.match(condition)
|
||||||
|
when Numeric
|
||||||
|
value == condition.to_s
|
||||||
|
when true
|
||||||
|
!value.nil?
|
||||||
|
when false, nil
|
||||||
|
value.nil?
|
||||||
|
else
|
||||||
|
false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -6,7 +6,7 @@ module Sanitize
|
||||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||||
|
|
||||||
require 'html/tokenizer'
|
require 'html/tokenizer'
|
||||||
require 'html/node'
|
require 'node'
|
||||||
|
|
||||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||||
|
@ -125,14 +125,15 @@ module Sanitize
|
||||||
new_text = ""
|
new_text = ""
|
||||||
|
|
||||||
while token = tokenizer.next
|
while token = tokenizer.next
|
||||||
node = HTML::Node.parse(nil, 0, 0, token, false)
|
node = XHTML::Node.parse(nil, 0, 0, token, false)
|
||||||
new_text << case node
|
new_text << case node.tag?
|
||||||
when HTML::Tag
|
when true
|
||||||
if ALLOWED_ELEMENTS.include?(node.name)
|
if ALLOWED_ELEMENTS.include?(node.name)
|
||||||
if node.closing != :close
|
if node.closing != :close
|
||||||
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
||||||
%w(href src).each do |attr|
|
%w(href src).each do |attr|
|
||||||
node.attributes.delete attr if node.attributes[attr] =~ /^\s*javascript:/i
|
val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'')
|
||||||
|
node.attributes.delete attr if val_unescaped =~ /^javascript:/i
|
||||||
end
|
end
|
||||||
if node.attributes['style']
|
if node.attributes['style']
|
||||||
node.attributes['style'] = sanitize_css(node.attributes['style'])
|
node.attributes['style'] = sanitize_css(node.attributes['style'])
|
||||||
|
@ -174,7 +175,7 @@ module Sanitize
|
||||||
goodval = true
|
goodval = true
|
||||||
val.split().each do |keyword|
|
val.split().each do |keyword|
|
||||||
if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
|
if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
|
||||||
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
|
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
|
||||||
goodval = false
|
goodval = false
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
143
test/unit/sanitize.rb
Normal file
143
test/unit/sanitize.rb
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
|
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
|
||||||
|
require 'sanitize'
|
||||||
|
|
||||||
|
class SanitizeTest < Test::Unit::TestCase
|
||||||
|
include Sanitize
|
||||||
|
|
||||||
|
def setup
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||||
|
define_method "test_should_allow_#{tag_name}_tag" do
|
||||||
|
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz</#{tag_name}>",
|
||||||
|
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||||
|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
||||||
|
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
|
||||||
|
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||||
|
define_method "test_should_allow_#{attribute_name}_attribute" do
|
||||||
|
assert_equal "<p #{attribute_name}=\"display: none;\">foo <bad>bar</bad> baz</p>",
|
||||||
|
sanitize_html("<p #{attribute_name}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||||
|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
||||||
|
assert_equal "<p>foo <bad>bar</bad> baz</p>",
|
||||||
|
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_allow_anchors
|
||||||
|
assert_equal "<a href=\"foo\"><script>baz</script></a>",
|
||||||
|
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
|
||||||
|
end
|
||||||
|
|
||||||
|
%w(src width height alt).each do |img_attr|
|
||||||
|
define_method "test_should_allow_image_#{img_attr}_attribute" do
|
||||||
|
assert_equal "<img #{img_attr}=\"foo\" />",
|
||||||
|
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_handle_non_html
|
||||||
|
assert_equal 'abc', sanitize_html("abc")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_handle_blank_text
|
||||||
|
assert_equal '', sanitize_html('')
|
||||||
|
end
|
||||||
|
|
||||||
|
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||||
|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
|
||||||
|
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||||
|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
|
||||||
|
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
[%(<img src="javascript:alert('XSS');" />),
|
||||||
|
%(<img src=javascript:alert('XSS') />),
|
||||||
|
%(<img src="JaVaScRiPt:alert('XSS')" />),
|
||||||
|
%(<img src='javascript:alert("XSS")' />),
|
||||||
|
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src="jav\tascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav	ascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav
ascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav
ascript:alert('XSS');" />),
|
||||||
|
%(<img src="  javascript:alert('XSS');" />),
|
||||||
|
%(<img src=" javascript:alert('XSS');" />),
|
||||||
|
%(<img src=" javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
|
||||||
|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
|
||||||
|
assert_equal "<img />", sanitize_html(img_hack)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_tag_broken_up_by_null
|
||||||
|
assert_equal "<scr>alert(\"XSS\")</scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_invalid_script_tag
|
||||||
|
assert_equal "<script /></script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_script_tag_with_multiple_open_brackets
|
||||||
|
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
|
||||||
|
assert_equal %(<iframe src="http:" /><), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_unclosed_script
|
||||||
|
assert_equal "<script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_half_open_scripts
|
||||||
|
assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_not_fall_for_ridiculous_hack
|
||||||
|
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
|
||||||
|
assert_equal "<img />", sanitize_html(img_hack)
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_platypus
|
||||||
|
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
|
||||||
|
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_xul
|
||||||
|
assert_equal %(<p style="">fubar</p>),
|
||||||
|
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_input_image
|
||||||
|
assert_equal %(<input type="image" />),
|
||||||
|
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_non_alpha_non_digit
|
||||||
|
assert_equal "<script /></script>",
|
||||||
|
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
|
||||||
|
assert_equal "<a>foo</a>",
|
||||||
|
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
|
||||||
|
assert_equal "<img />",
|
||||||
|
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
Loading…
Reference in a new issue