REXML Trees

Synced with latest HTML5lib.
Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
Jacques Distler 2007-06-05 16:34:49 -05:00
parent 4dd70af5ae
commit bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions

View file

@ -1,4 +1,3 @@
require 'html5lib/tokenizer'
require 'cgi'
module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, is allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
def process_token(token)
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
return token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
return token
end
else
yield token
return token
end
end
end
def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
style = clean.join(' ')
end
end
# Filter variant of the sanitizer: wraps an upstream token source (@source)
# and passes every token through process_token before handing it on.
class HTMLSanitizeFilter < Filter
  include HTMLSanitizeModule

  # Iterates over @source, yielding the result of process_token for each
  # token to the caller's block.
  def each
    @source.each { |raw_token| yield process_token(raw_token) }
  end
end
# Tokenizer variant of the sanitizer: tokens produced by the parent
# HTMLTokenizer#each are passed through process_token before being yielded.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Delegates to the superclass iteration and yields the result of
  # process_token for each token to the caller's block.
  def each
    super { |raw_token| yield process_token(raw_token) }
  end
end
end