XSS Security fixes

2007-02-25 15:13:50 +00:00 · 2007-02-25 15:13:50 +00:00 · 552cf4cff0
commit 552cf4cff0
parent c9a9b7d315
4 changed files with 938 additions and 4 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -24,36 +24,48 @@ module Engines
  end

  class Textile < AbstractEngine
+    require_dependency 'sanitize'
+    include Sanitize
+    # Renders @content as Textile through RedCloth (hard breaks plus the
+    # configured :engine_opts) and returns the markup after it has been
+    # passed through sanitize_html.
    def mask
      require_dependency 'redcloth'
      redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
      redcloth.filter_html = false
      redcloth.no_span_caps = false  
-      redcloth.to_html(:textile)
+      html = redcloth.to_html(:textile)
+      sanitize_html(html)
    end
  end

  class Markdown < AbstractEngine
+    require_dependency 'sanitize'
+    include Sanitize
+    # Renders @content with BlueCloth, using the configured :engine_opts, and
+    # returns the markup after it has been passed through sanitize_html.
    def mask
      require_dependency 'bluecloth_tweaked'
-      BlueCloth.new(@content, @content.options[:engine_opts]).to_html
+      html = BlueCloth.new(@content, @content.options[:engine_opts]).to_html
+      sanitize_html(html)
    end
  end

  class Mixed < AbstractEngine
+    require_dependency 'sanitize'
+    include Sanitize
+    # Renders @content through RedCloth with the configured :engine_opts,
+    # calling to_html without an explicit rule set, and returns the markup
+    # after it has been passed through sanitize_html.
    def mask
      require_dependency 'redcloth'
      redcloth = RedCloth.new(@content, @content.options[:engine_opts])
      redcloth.filter_html = false
      redcloth.no_span_caps = false
-      redcloth.to_html
+      html = redcloth.to_html
+      sanitize_html(html)
    end
  end

  class RDoc < AbstractEngine
+    require_dependency 'sanitize'
+    include Sanitize
+    # Renders @content with RDocSupport::RDocFormatter and returns the markup
+    # after it has been passed through sanitize_html.
    def mask
      require_dependency 'rdocsupport'
-      RDocSupport::RDocFormatter.new(@content).to_html
+      html = RDocSupport::RDocFormatter.new(@content).to_html
+      sanitize_html(html)
    end
  end

--- a/lib/node.rb
+++ b/lib/node.rb
@ -0,0 +1,530 @@
+require 'strscan'
+
+module XHTML #:nodoc:
+  
+  # A Hash of node-matching conditions in normalised form: top-level keys are
+  # symbols, the keys of <tt>:attributes</tt> are strings, and every nested
+  # set of conditions (<tt>:parent</tt>, <tt>:child</tt>, <tt>:ancestor</tt>,
+  # <tt>:descendant</tt>, <tt>:sibling</tt>, <tt>:before</tt>, <tt>:after</tt>
+  # and <tt>:children => { :only => ... }</tt>) is itself a Conditions.
+  class Conditions < Hash #:nodoc:
+    # Builds the conditions from +hash+. Anything that is not a Hash is taken
+    # to be the value of the <tt>:content</tt> condition. Raises a
+    # RuntimeError for any key that is not recognised.
+    def initialize(hash)
+      super()
+      hash = { :content => hash } unless Hash === hash
+      normalized = keys_to_symbols(hash)
+      normalized.keys.each do |key|
+        value = normalized[key]
+        case key
+        when :tag, :content
+          # stored exactly as given
+        when :attributes
+          normalized[key] = keys_to_strings(value)
+        when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
+          normalized[key] = Conditions.new(value)
+        when :children
+          normalized[key] = children_conditions(value)
+        else
+          raise "illegal key #{key.inspect} => #{value.inspect}"
+        end
+      end
+      update normalized
+    end
+
+    private
+
+      # Normalises the value given for <tt>:children</tt>: keys become
+      # symbols, <tt>:only</tt> is wrapped in a Conditions, and any key other
+      # than :count, :greater_than, :less_than or :only raises.
+      def children_conditions(counting)
+        counting = keys_to_symbols(counting)
+        counting.keys.each do |key|
+          case key
+          when :count, :greater_than, :less_than
+            # stored exactly as given
+          when :only
+            counting[key] = Conditions.new(counting[key])
+          else
+            raise "illegal key #{key.inspect} => #{counting[key].inspect}"
+          end
+        end
+        counting
+      end
+
+      # Returns a new hash with every key of +hash+ converted by +to_s+.
+      def keys_to_strings(hash)
+        stringified = {}
+        hash.keys.each { |key| stringified[key.to_s] = hash[key] }
+        stringified
+      end
+
+      # Returns a new hash with every key of +hash+ converted by +to_sym+.
+      # Raises if a key cannot be converted.
+      def keys_to_symbols(hash)
+        symbolized = {}
+        hash.keys.each do |key|
+          raise "illegal key #{key.inspect}" unless key.respond_to?(:to_sym)
+          symbolized[key.to_sym] = hash[key]
+        end
+        symbolized
+      end
+  end
+
+  # The base class of all nodes, textual and otherwise, in an HTML document.
+  class Node #:nodoc:
+    # The array of children of this node. Not all nodes have children.
+    attr_reader :children
+
+    # The parent node of this node. All nodes have a parent, except for the
+    # root node.
+    attr_reader :parent
+
+    # The line number of the input where this node was begun
+    attr_reader :line
+
+    # The byte position in the input where this node was begun
+    attr_reader :position
+
+    # Create a new node as a child of the given parent.
+    def initialize(parent, line=0, pos=0)
+      @parent = parent
+      @children = []
+      @line, @position = line, pos
+    end
+
+    # Return a textual representation of the node: the concatenation of the
+    # textual representations of its children.
+    def to_s
+      s = ""
+      @children.each { |child| s << child.to_s }
+      s
+    end
+
+    # Return false (subclasses must override this to provide specific matching
+    # behavior.) +conditions+ may be of any type.
+    def match(conditions)
+      false
+    end
+
+    # Search the children of this node for the first node for which #find
+    # returns non +nil+. Returns the result of the #find call that succeeded,
+    # or +nil+ when no child produced one.
+    def find(conditions)
+      conditions = validate_conditions(conditions)
+      @children.each do |child|
+        node = child.find(conditions)
+        return node if node
+      end
+      nil
+    end
+
+    # Search for all nodes that match the given conditions, and return them
+    # as an array. The node itself is included when it matches.
+    def find_all(conditions)
+      conditions = validate_conditions(conditions)
+
+      matches = []
+      matches << self if match(conditions)
+      @children.each do |child|
+        matches.concat child.find_all(conditions)
+      end
+      matches
+    end
+
+    # Returns +false+. Subclasses may override this if they define a kind of
+    # tag.
+    def tag?
+      false
+    end
+
+    # Returns +conditions+ unchanged when it already is a Conditions instance,
+    # and wraps it in one otherwise.
+    def validate_conditions(conditions)
+      Conditions === conditions ? conditions : Conditions.new(conditions)
+    end
+
+    # Two nodes are equal when they are of the same class, have the same
+    # number of children, and their children are pairwise equal.
+    def ==(node)
+      return false unless self.class == node.class && children.size == node.children.size
+
+      children.zip(node.children).all? { |mine, theirs| mine == theirs }
+    end
+
+    class <<self
+      # Builds the node described by +content+: a Text node when +content+
+      # does not begin a line with "<" followed by a non-space character, a
+      # CDATA node for a <tt><![CDATA[</tt> section, and a Tag otherwise.
+      #
+      # With +strict+ set, malformed markup raises a RuntimeError. Without it
+      # the parser never raises on malformed input: content that cannot be
+      # read as a tag comes back as a Text node, an unterminated CDATA
+      # section runs to the end of +content+, and anything left before the
+      # closing ">" is thrown away.
+      def parse(parent, line, pos, content, strict=true)
+        if content !~ /^<\S/
+          Text.new(parent, line, pos, content)
+        else
+          scanner = StringScanner.new(content)
+
+          unless scanner.skip(/</)
+            if strict
+              raise "expected <"
+            else
+              return Text.new(parent, line, pos, content)
+            end
+          end
+
+          if scanner.skip(/!\[CDATA\[/)
+            # Without a terminating "]]>" the scan fails and pre_match would
+            # be nil; in lenient mode scan to the end of the input instead so
+            # that pre_match holds everything that was seen.
+            unless scanner.skip_until(/\]\]>/)
+              if strict
+                raise "expected ]]> (got #{scanner.rest.inspect} for #{content})"
+              else
+                scanner.skip_until(/\z/)
+              end
+            end
+            return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
+          end
+
+          closing = ( scanner.scan(/\//) ? :close : nil )
+          return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:-]+/)
+
+          unless closing
+            scanner.skip(/\s*/)
+            attributes = {}
+            while attr = scanner.scan(/[-\w:]+/)
+              # an attribute without "=value" is recorded as +true+
+              value = true
+              if scanner.scan(/\s*=\s*/)
+                if delim = scanner.scan(/['"]/)
+                  value = ""
+                  while text = scanner.scan(/[^#{delim}\\]+|./)
+                    case text
+                      when "\\" then
+                        value << text
+                        # a trailing backslash has nothing after it to copy
+                        break if scanner.eos?
+                        value << scanner.getch
+                      when delim
+                        break
+                      else value << text
+                    end
+                  end
+                else
+                  value = scanner.scan(/[^\s>\/]+/)
+                end
+              end
+              attributes[attr] = value
+              scanner.skip(/\s*/)
+            end
+
+            closing = ( scanner.scan(/\//) ? :self : nil )
+          end
+
+          unless scanner.scan(/\s*>/)
+            if strict
+              raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})" 
+            else
+              # throw away all text until we find what we're looking for
+              scanner.skip_until(/>/) or scanner.terminate
+            end
+          end
+
+          Tag.new(parent, line, pos, name, attributes, closing)
+        end
+      end
+    end
+  end
+
+  # A node that represents text, rather than markup.
+  class Text < Node #:nodoc:
+
+    attr_reader :content
+
+    # Creates a new text node as a child of the given parent, with the given
+    # content.
+    def initialize(parent, line, pos, content)
+      super(parent, line, pos)
+      @content = content
+    end
+
+    # Returns the content of this node.
+    def to_s
+      @content
+    end
+
+    # Returns +self+ if this node meets the given conditions, and the false
+    # or +nil+ result of #match otherwise. Text nodes support conditions of
+    # the following kinds:
+    #
+    # * if +conditions+ is a string, it must be equal to the node's content
+    # * if +conditions+ is a regular expression, it must match the node's
+    #   content
+    # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
+    #   is either a string or a regexp, and which is interpreted as described
+    #   above.
+    def find(conditions)
+      matched = match(conditions)
+      matched && self
+    end
+
+    # Returns a true value if this node meets the given conditions, or +nil+
+    # / +false+ otherwise. See the discussion of #find for the valid
+    # conditions.
+    def match(conditions)
+      return @content == conditions if String === conditions
+      return @content =~ conditions if Regexp === conditions
+      return nil unless Hash === conditions
+
+      conditions = validate_conditions(conditions)
+
+      # Text nodes only have :content, :parent, :ancestor
+      supported = [:content, :parent, :ancestor]
+      return false if conditions.keys.any? { |key| !supported.include?(key) }
+
+      match(conditions[:content])
+    end
+
+    # Text nodes are equal when Node#== holds and their content is equal.
+    def ==(node)
+      super ? content == node.content : false
+    end
+  end
+  
+  # A CDATA node is simply a text node with a specialized way of displaying
+  # itself.
+  class CDATA < Text #:nodoc:
+    # Returns the node's content wrapped in a CDATA section. The section is
+    # closed with the full "]]>" terminator.
+    def to_s
+      "<![CDATA[#{super}]]>"
+    end
+  end
+
+  # A Tag is any node that represents markup. It may be an opening tag, a
+  # closing tag, or a self-closing tag. It has a name, and may have a hash of
+  # attributes.
+  class Tag < Node #:nodoc:
+
+    # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
+    attr_reader :closing
+
+    # Either +nil+, or a hash of attributes for this node.
+    attr_reader :attributes
+
+    # The name of this tag.
+    attr_reader :name
+
+    # Create a new node as a child of the given parent, with the given name,
+    # attributes (a hash, or +nil+) and closing status (+nil+,
+    # <tt>:close</tt> or <tt>:self</tt>).
+    def initialize(parent, line, pos, name, attributes, closing)
+      super(parent, line, pos)
+      @name = name
+      @attributes = attributes
+      @closing = closing
+    end
+
+    # A convenience for obtaining an attribute of the node. Returns +nil+ if
+    # the node has no attributes.
+    def [](attr)
+      @attributes ? @attributes[attr] : nil
+    end
+
+    # Returns a true value if this tag cannot contain child nodes: it is a
+    # closing or self-closing tag or, unless +xml+ is set, its name is one of
+    # the elements listed below. With +xml+ set, a tag with no closing status
+    # is never childless.
+    def childless?(xml = false)
+      return false if xml && @closing.nil?
+      !@closing.nil? ||
+        @name =~ /^(img|br|hr|link|meta|area|base|basefont|
+                    col|frame|input|isindex|param)$/ox
+    end
+
+    # Returns a textual representation of the node. A double quote inside an
+    # attribute value is written as <tt>&quot;</tt>, so that a value which
+    # was delimited by single quotes in the input cannot end its own
+    # attribute here and introduce further ones.
+    def to_s
+      if @closing == :close
+        "</#{@name}>"
+      else
+        s = "<#{@name}"
+        (@attributes || {}).each do |k,v|
+          s << " #{k}"
+          if String === v
+            escaped = v.gsub('"', '&quot;')
+            s << "=\"#{escaped}\""
+          end
+        end
+        s << " /" if @closing == :self
+        s << ">"
+        @children.each { |child| s << child.to_s }
+        s << "</#{@name}>" if @closing != :self && !@children.empty?
+        s
+      end
+    end
+
+    # If either the node or any of its children meet the given conditions, the
+    # matching node is returned. Otherwise, +nil+ is returned. (See the
+    # description of the valid conditions in the +match+ method.)
+    def find(conditions)
+      match(conditions) && self || super
+    end
+
+    # Returns +true+, indicating that this node represents an HTML tag.
+    def tag?
+      true
+    end
+
+    # Returns +true+ if the node meets all of the given conditions. The
+    # +conditions+ parameter must be a hash of any of the following keys
+    # (all are optional):
+    #
+    # * <tt>:tag</tt>: the node name must match the corresponding value
+    # * <tt>:attributes</tt>: a hash. The node's values must match the
+    #   corresponding values in the hash.
+    # * <tt>:parent</tt>: a hash. The node's parent must match the
+    #   corresponding hash.
+    # * <tt>:child</tt>: a hash. At least one of the node's immediate children
+    #   must meet the criteria described by the hash.
+    # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
+    #   meet the criteria described by the hash.
+    # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
+    #   must meet the criteria described by the hash.
+    # * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
+    #   meet the criteria described by the hash.
+    # * <tt>:after</tt>: a hash. The node must be after any sibling meeting
+    #   the criteria described by the hash, and at least one sibling must match.
+    # * <tt>:before</tt>: a hash. The node must be before any sibling meeting
+    #   the criteria described by the hash, and at least one sibling must match.
+    # * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
+    #   keys:
+    # ** <tt>:count</tt>: either a number or a range which must equal (or
+    #    include) the number of children that match.
+    # ** <tt>:less_than</tt>: the number of matching children must be less than
+    #    this number.
+    # ** <tt>:greater_than</tt>: the number of matching children must be
+    #    greater than this number.
+    # ** <tt>:only</tt>: another hash consisting of the keys to use
+    #    to match on the children, and only matching children will be
+    #    counted.
+    #
+    # Conditions are matched using the following algorithm:
+    #
+    # * if the condition is a string, it must be equal to the value.
+    # * if the condition is a regexp, it must match the value.
+    # * if the condition is a number, the value must match number.to_s.
+    # * if the condition is +true+, the value must not be +nil+.
+    # * if the condition is +false+ or +nil+, the value must be +nil+.
+    #
+    # A node without a parent never satisfies <tt>:parent</tt>,
+    # <tt>:sibling</tt>, <tt>:before</tt> or <tt>:after</tt>.
+    #
+    # Usage:
+    #
+    #   # test if the node is a "span" tag
+    #   node.match :tag => "span"
+    #
+    #   # test if the node's parent is a "div"
+    #   node.match :parent => { :tag => "div" }
+    #
+    #   # test if any of the node's ancestors are "table" tags
+    #   node.match :ancestor => { :tag => "table" }
+    #
+    #   # test if any of the node's immediate children are "em" tags
+    #   node.match :child => { :tag => "em" }
+    #
+    #   # test if any of the node's descendants are "strong" tags
+    #   node.match :descendant => { :tag => "strong" }
+    #
+    #   # test if the node has between 2 and 4 span tags as immediate children
+    #   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
+    #
+    #   # get funky: test to see if the node is a "div", has a "ul" ancestor
+    #   # and an "li" parent (with "class" = "enum"), and whether or not it has
+    #   # a "span" descendant that contains text matching /hello world/:
+    #   node.match :tag => "div",
+    #              :ancestor => { :tag => "ul" },
+    #              :parent => { :tag => "li",
+    #                           :attributes => { :class => "enum" } },
+    #              :descendant => { :tag => "span",
+    #                               :child => /hello world/ }
+    def match(conditions)
+      conditions = validate_conditions(conditions)
+      # check content of child nodes
+      if conditions[:content]
+        if children.empty?
+          return false unless match_condition("", conditions[:content])
+        else
+          return false unless children.find { |child| child.match(conditions[:content]) }
+        end
+      end
+
+      # test the name
+      return false unless match_condition(@name, conditions[:tag]) if conditions[:tag]
+
+      # test attributes
+      (conditions[:attributes] || {}).each do |key, value|
+        return false unless match_condition(self[key], value)
+      end
+
+      # test parent (a node without one cannot satisfy the condition)
+      if conditions[:parent]
+        return false unless parent && parent.match(conditions[:parent])
+      end
+
+      # test children
+      return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child]
+
+      # test ancestors: walk up from the parent until one matches or the
+      # chain runs out
+      if conditions[:ancestor]
+        ancestor = parent
+        ancestor = ancestor.parent until ancestor.nil? || ancestor.match(conditions[:ancestor])
+        return false unless ancestor
+      end
+
+      # test descendants
+      if conditions[:descendant]
+        return false unless children.find do |child|
+          # test the child
+          child.match(conditions[:descendant]) ||
+          # test the child's descendants
+          child.match(:descendant => conditions[:descendant])
+        end
+      end
+
+      # count children; only tags of this module are counted, and of those
+      # only self-closing ones or ones that are not childless
+      if opts = conditions[:children]
+        matches = children.select do |c|
+          (c.kind_of?(Tag) and (c.closing == :self or ! c.childless?))
+        end
+
+        matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
+        opts.each do |key, value|
+          next if key == :only
+          case key
+            when :count
+              if Integer === value
+                return false if matches.length != value
+              else
+                return false unless value.include?(matches.length)
+              end
+            when :less_than
+              return false unless matches.length < value
+            when :greater_than
+              return false unless matches.length > value
+            else raise "unknown count condition #{key}"
+          end
+        end
+      end
+
+      # test siblings
+      if conditions[:sibling] || conditions[:before] || conditions[:after]
+        siblings = parent ? parent.children : []
+        self_index = siblings.index(self)
+
+        if conditions[:sibling]
+          return false unless siblings.detect do |s|
+            s != self && s.match(conditions[:sibling])
+          end
+        end
+
+        # a node that is not found among its parent's children has nothing
+        # before or after it
+        return false if self_index.nil? && (conditions[:before] || conditions[:after])
+
+        if conditions[:before]
+          return false unless siblings[self_index+1..-1].detect do |s|
+            s != self && s.match(conditions[:before])
+          end
+        end
+
+        if conditions[:after]
+          return false unless siblings[0,self_index].detect do |s|
+            s != self && s.match(conditions[:after])
+          end
+        end
+      end
+
+      true
+    end
+
+    # Tags are equal when Node#== holds and their closing status, name and
+    # attributes are equal.
+    def ==(node)
+      return false unless super
+      return false unless closing == node.closing && self.name == node.name
+      attributes == node.attributes
+    end
+
+    private
+      # Match the given value to the given condition. A String condition must
+      # equal the value, a Regexp must match it, a Numeric is compared as a
+      # string, +true+ requires a non-nil value and +false+/+nil+ a nil one.
+      # Any other kind of condition never matches.
+      def match_condition(value, condition)
+        case condition
+          when String
+            value && value == condition
+          when Regexp
+            value && value.match(condition)
+          when Numeric
+            value == condition.to_s
+          when true
+            !value.nil?
+          when false, nil
+            value.nil?
+          else
+            false
+        end
+      end
+  end
+end
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -0,0 +1,205 @@
+module Sanitize
+
+# This module provides sanitization of XHTML+MathML+SVG
+# and of inline style attributes.
+#
+# Based heavily on Sam Ruby's code in the Universal FeedParser.
+
+  require 'html/tokenizer'
+  require 'node'
+
+  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
+      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
+      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
+      'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+      'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
+      'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
+      'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
+      'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
+      'ul', 'var']
+      
+  mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
+      'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
+      'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
+      'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+      'munderover', 'none']
+      
+  svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
+      'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
+      'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
+      'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
+      'switch', 'text', 'title', 'use']
+      
+  acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
+      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
+      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
+      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
+      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
+      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
+      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+      'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
+      'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
+
+
+  mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+      'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+      'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
+      'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
+      'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
+      'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
+      'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+      'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
+      'xlink:type', 'xmlns', 'xmlns:xlink']
+
+      
+  svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+       'arabic-form', 'ascent', 'attributeName', 'attributeType',
+       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
+       'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
+       'font-size', 'font-stretch', 'font-style', 'font-variant',
+       'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 
+       'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
+       'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
+       'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
+       'origin', 'overline-position', 'overline-thickness', 'panose-1',
+       'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
+       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 
+       'stop-color', 'stop-opacity', 'strikethrough-position',
+       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+       'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
+       'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+       'underline-position', 'underline-thickness', 'unicode',
+       'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
+       'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
+       'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
+       'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
+       'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+
+  attr_val_is_uri = ['href', 'src', 'action', 'longdesc', 'xlink:href']
+  
+  acceptable_css_properties = ['azimuth', 'background-color',
+      'border-bottom-color', 'border-collapse', 'border-color',
+      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+      'white-space', 'width']
+
+  acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+      'transparent', 'underline', 'white', 'yellow']
+
+  acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+      'stroke-opacity']
+
+  acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
+      'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
+      'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
+      'ssh', 'sftp', 'rtsp', 'afs' ]
+
+      # The whitelists consulted by sanitize_html and sanitize_css. Each
+      # constant is assigned only when it is not already defined, so a list
+      # that exists before this point is left as it is.
+      ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements  unless defined?(ALLOWED_ELEMENTS)
+      ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
+      ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
+      ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
+      ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
+      ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
+      ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
+
+      # Returns +html+ with everything outside the whitelists neutralised.
+      #
+      # The input is tokenized and each token handled on its own:
+      # * a tag whose name is in ALLOWED_ELEMENTS is kept; unless it is a
+      #   closing tag, attributes not in ALLOWED_ATTRIBUTES are dropped,
+      #   attributes in ATTR_VAL_IS_URI are dropped when they carry a URI
+      #   scheme that is not in ALLOWED_PROTOCOLS, and a +style+ attribute is
+      #   replaced by the result of sanitize_css;
+      # * any other token is emitted with every "<" turned into "&lt;".
+      #
+      # Input that contains no "<" at all is returned untouched.
+      # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. 
+      #
+      #   sanitize_html('<script> do_nasty_stuff() </script>')
+      #    => &lt;script> do_nasty_stuff() &lt;/script>
+      #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+      #    => <a>Click here for $100</a>
+      def sanitize_html(html)
+        return html unless html.index("<")
+
+        tokenizer = HTML::Tokenizer.new(html)
+        sanitized = ""
+
+        while token = tokenizer.next
+          node = XHTML::Node.parse(nil, 0, 0, token, false)
+
+          # text, CDATA and tags that are not whitelisted are defused as a whole
+          unless node.tag? == true && ALLOWED_ELEMENTS.include?(node.name)
+            sanitized << node.to_s.gsub(/</, "&lt;")
+            next
+          end
+
+          if node.closing != :close
+            attributes = node.attributes
+            attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
+
+            ATTR_VAL_IS_URI.each do |attr|
+              # compare the scheme with entities decoded, control characters
+              # and whitespace removed, and case folded
+              val_unescaped = CGI.unescapeHTML(attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
+              has_scheme = val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/
+              attributes.delete attr if has_scheme && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
+            end
+
+            attributes['style'] = sanitize_css(attributes['style']) if attributes['style']
+          end
+
+          sanitized << node.to_s
+        end
+
+        sanitized
+      end
+      
+      # Returns a cleaned copy of the inline +style+ declarations.
+      #
+      # <tt>url(...)</tt> values are removed first. The whole string must then
+      # pass two checks (allowed characters only, and a well-formed list of
+      # <tt>property: value</tt> declarations); if either fails, the empty
+      # string is returned. Of the remaining declarations only those are kept
+      # whose property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES,
+      # or is a background/border/margin/padding property whose value is made
+      # up solely of ALLOWED_CSS_KEYWORDS, colours and simple lengths.
+      #
+      # Both checks are anchored with \A and \z. In Ruby ^ and $ match at every
+      # line boundary, so with them a single harmless line was enough to let
+      # the rest of a multi-line value through unchecked.
+      def sanitize_css(style)
+        # disallow urls
+        style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+        # gauntlet: every character of the whole string must be acceptable ...
+        return '' if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
+        # ... and the whole string must be a list of declarations; trailing
+        # whitespace after a declaration is tolerated so that a final newline
+        # is still accepted now that the end anchor is strict
+        return '' if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
+
+        clean = []
+        style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
+          if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
+            clean <<  prop + ': ' + val + ';'
+          elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
+            # shorthand properties: every keyword of the value has to be known
+            goodval = val.split.all? do |keyword|
+              ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
+            end
+            clean <<  prop + ': ' + val + ';' if goodval
+          elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
+            clean <<  prop + ': ' + val + ';'
+          end
+        end
+
+        clean.join(' ')
+      end
+end      
--- a/test/unit/sanitize_test.rb
+++ b/test/unit/sanitize_test.rb
@ -0,0 +1,187 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
+require 'sanitize'
+
+class SanitizeTest < Test::Unit::TestCase
+  include Sanitize
+
+  def setup
+
+  end
+
+  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_allow_#{tag_name}_tag" do
+      assert_equal "<#{tag_name} title=\"1\">foo &lt;bad>bar&lt;/bad> baz</#{tag_name}>",
+        sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
+    end
+  end
+
+  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+      assert_equal "&lt;#{tag_name.upcase} title=\"1\">foo &lt;bad>bar&lt;/bad> baz&lt;/#{tag_name.upcase}>",
+        sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
+    end
+  end
+
+  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    if attribute_name != 'style'
+      define_method "test_should_allow_#{attribute_name}_attribute" do
+        assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad>bar&lt;/bad> baz</p>",
+          sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
+      end
+    end
+  end
+
+  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+      assert_equal "<p>foo &lt;bad>bar&lt;/bad> baz</p>",
+        sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
+    end
+  end
+
+  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol}">foo</a>))
+    end
+  end
+
+  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
+        sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
+    end
+  end
+
+  def test_should_allow_anchors
+    assert_equal "<a href=\"foo\">&lt;script>baz&lt;/script></a>",
+     sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
+  end
+
+  # RFC 3986, sec 4.2
+  def test_allow_colons_in_path_component
+    assert_equal "<a href=\"./this:that\">foo</a>",
+      sanitize_html("<a href=\"./this:that\">foo</a>")
+  end
+
+  %w(src width height alt).each do |img_attr|
+    define_method "test_should_allow_image_#{img_attr}_attribute" do
+      assert_equal "<img #{img_attr}=\"foo\" />",
+        sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
+    end
+  end
+
+  def test_should_handle_non_html
+    assert_equal 'abc',  sanitize_html("abc")
+  end
+
+  def test_should_handle_blank_text
+    assert_equal '', sanitize_html('')
+  end
+
+  [%w(img src), %w(a href)].each do |(tag, attr)|
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
+      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
+    end
+  end
+
+  [%w(img src), %w(a href)].each do |(tag, attr)|
+    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
+      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
+    end
+  end
+
+  [%(<img src="javascript:alert('XSS');" />), 
+   %(<img src=javascript:alert('XSS') />), 
+   %(<img src="JaVaScRiPt:alert('XSS')" />), 
+   %(<img src='javascript:alert(&quot;XSS&quot;)' />),
+   %(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
+   %(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
+   %(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
+   %(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
+   %(<img src="jav\tascript:alert('XSS');" />),
+   %(<img src="jav&#x09;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0A;ascript:alert('XSS');" />),
+   %(<img src="jav&#x0D;ascript:alert('XSS');" />),
+   %(<img src=" &#14;  javascript:alert('XSS');" />),
+   %(<img src="&#x20;javascript:alert('XSS');" />),
+   %(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
+    define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
+      assert_equal "<img />", sanitize_html(img_hack)
+    end
+  end
+
+  def test_should_sanitize_tag_broken_up_by_null
+    assert_equal "&lt;scr>alert(\"XSS\")&lt;/scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
+  end
+  
+  def test_should_sanitize_invalid_script_tag
+    assert_equal "&lt;script />&lt;/script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
+  end
+  
+  def test_should_sanitize_script_tag_with_multiple_open_brackets
+    assert_equal "&lt;&lt;script>alert(\"XSS\");//&lt;&lt;/script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
+    assert_equal %(&lt;iframe src="http:" />&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
+  end
+  
+  def test_should_sanitize_unclosed_script
+    assert_equal "&lt;script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
+  end
+  
+  def test_should_sanitize_half_open_scripts
+    assert_equal  "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
+  end
+  
+  def test_should_not_fall_for_ridiculous_hack
+    img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
+    assert_equal "<img />", sanitize_html(img_hack)
+  end
+
+  def test_platypus
+    assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
+       sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
+  end
+
+  def test_xul
+    assert_equal %(<p style="">fubar</p>),
+     sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
+  end
+
+  def test_input_image
+    assert_equal %(<input type="image" />),
+      sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
+  end
+
+  def test_non_alpha_non_digit
+    assert_equal "&lt;script />&lt;/script>",
+      sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
+    assert_equal "<a>foo</a>",
+      sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
+    assert_equal "<img />",
+      sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
+  end
+
+  def test_img_dynsrc_lowsrc
+     assert_equal "<img />",
+       sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
+     assert_equal "<img />",
+       sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
+  end
+
+  def test_div_background_image_unicode_encoded
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
+  end
+
+  def test_div_expression
+    assert_equal '<div style="">foo</div>',
+      sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
+  end
+
+  def test_img_vbscript
+     assert_equal '<img />',
+       sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
+  end
+
+end