REXML Trees
Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
parent
4dd70af5ae
commit
bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions
|
@ -1,4 +1,3 @@
|
|||
require 'html5lib/tokenizer'
|
||||
require 'cgi'
|
||||
|
||||
module HTML5lib
|
||||
|
@ -6,7 +5,7 @@ module HTML5lib
|
|||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
|
||||
class HTMLSanitizer < HTMLTokenizer
|
||||
module HTMLSanitizeModule
|
||||
|
||||
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
|
||||
button caption center cite code col colgroup dd del dfn dir div dl dt
|
||||
|
@ -96,19 +95,7 @@ module HTML5lib
|
|||
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
|
||||
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => &lt;script> do_nasty_stuff() &lt;/script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def each
|
||||
super do |token|
|
||||
def process_token(token)
|
||||
case token[:type]
|
||||
when :StartTag, :EndTag, :EmptyTag
|
||||
if ALLOWED_ELEMENTS.include?(token[:name])
|
||||
|
@ -126,7 +113,7 @@ module HTML5lib
|
|||
end
|
||||
token[:data] = attrs.map {|k,v| [k,v]}
|
||||
end
|
||||
yield token
|
||||
return token
|
||||
else
|
||||
if token[:type] == :EndTag
|
||||
token[:data] = "</#{token[:name]}>"
|
||||
|
@ -139,12 +126,11 @@ module HTML5lib
|
|||
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
|
||||
token[:type] = :Characters
|
||||
token.delete(:name)
|
||||
yield token
|
||||
return token
|
||||
end
|
||||
else
|
||||
yield token
|
||||
return token
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def sanitize_css(style)
|
||||
|
@ -174,4 +160,23 @@ module HTML5lib
|
|||
style = clean.join(' ')
|
||||
end
|
||||
end
|
||||
|
||||
# Filter variant of the sanitizer: wraps an upstream token source and
# hands every token to +process_token+ (expected to come from the included
# HTMLSanitizeModule) before passing it on.
class HTMLSanitizeFilter < Filter
  include HTMLSanitizeModule

  # Iterates over the tokens of @source, yielding the result of
  # +process_token+ for each one.
  def each
    @source.each { |raw_token| yield process_token(raw_token) }
  end
end
|
||||
|
||||
# Tokenizer variant of the sanitizer: every token emitted by the
# HTMLTokenizer superclass is run through +process_token+ (expected to come
# from the included HTMLSanitizeModule) before being yielded.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Delegates tokenization to the superclass and yields the result of
  # +process_token+ for each token it produces.
  def each
    super { |raw_token| yield process_token(raw_token) }
  end
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue