4dd70af5ae
Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer.
156 lines
4.5 KiB
Ruby
156 lines
4.5 KiB
Ruby
module HTML5lib
|
|
# Base class for helper objects that implement each phase of processing.
|
|
#
|
|
# Handler methods should be in the following order (they can be omitted):
|
|
#
|
|
# * EOF
|
|
# * Comment
|
|
# * Doctype
|
|
# * SpaceCharacters
|
|
# * Characters
|
|
# * StartTag
|
|
# - startTag* methods
|
|
# * EndTag
|
|
# - endTag* methods
|
|
#
|
|
class Phase
|
|
|
|
# The following example call:
|
|
#
|
|
# tag_handlers('startTag', 'html', %( base link meta ), %( li dt dd ) => 'ListItem')
|
|
#
|
|
# ...would return a hash equal to this:
|
|
#
|
|
# { 'html' => 'startTagHtml',
|
|
# 'base' => 'startTagBaseLinkMeta',
|
|
# 'link' => 'startTagBaseLinkMeta',
|
|
# 'meta' => 'startTagBaseLinkMeta',
|
|
# 'li' => 'startTagListItem',
|
|
# 'dt' => 'startTagListItem',
|
|
# 'dd' => 'startTagListItem' }
|
|
#
|
|
def self.tag_handlers(prefix, *tags)
|
|
mapping = {}
|
|
if tags.last.is_a?(Hash)
|
|
tags.pop.each do |names, handler_method_suffix|
|
|
handler_method = prefix + handler_method_suffix
|
|
Array(names).each { |name| mapping[name] = handler_method }
|
|
end
|
|
end
|
|
tags.each do |names|
|
|
names = Array(names)
|
|
handler_method = prefix + names.map { |name| name.capitalize }.join
|
|
names.each { |name| mapping[name] = handler_method }
|
|
end
|
|
return mapping
|
|
end
|
|
|
|
def self.start_tag_handlers
|
|
@start_tag_handlers ||= Hash.new('startTagOther')
|
|
end
|
|
|
|
# Declare what start tags this Phase handles. Can be called more than once.
|
|
#
|
|
# Example usage:
|
|
#
|
|
# handle_start 'html'
|
|
# # html start tags will be handled by a method named 'startTagHtml'
|
|
#
|
|
# handle_start %( base link meta )
|
|
# # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
|
|
#
|
|
# handle_start %( li dt dd ) => 'ListItem'
|
|
# # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
|
|
#
|
|
def self.handle_start(*tags)
|
|
start_tag_handlers.update tag_handlers('startTag', *tags)
|
|
end
|
|
|
|
def self.end_tag_handlers
|
|
@end_tag_handlers ||= Hash.new('endTagOther')
|
|
end
|
|
|
|
# Declare what end tags this Phase handles. Behaves like handle_start.
|
|
#
|
|
def self.handle_end(*tags)
|
|
end_tag_handlers.update tag_handlers('endTag', *tags)
|
|
end
|
|
|
|
def initialize(parser, tree)
|
|
@parser, @tree = parser, tree
|
|
end
|
|
|
|
def processEOF
|
|
@tree.generateImpliedEndTags
|
|
|
|
if @tree.openElements.length > 2
|
|
@parser.parseError(_('Unexpected end of file. Missing closing tags.'))
|
|
elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
|
|
# This happens for framesets or something?
|
|
@parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
|
|
elsif @parser.innerHTML and @tree.openElements.length > 1
|
|
# XXX This is not what the specification says. Not sure what to do here.
|
|
@parser.parseError(_('XXX innerHTML EOF'))
|
|
end
|
|
# Betting ends.
|
|
end
|
|
|
|
def processComment(data)
|
|
# For most phases the following is correct. Where it's not it will be
|
|
# overridden.
|
|
@tree.insertComment(data, @tree.openElements[-1])
|
|
end
|
|
|
|
def processDoctype(name, error)
|
|
@parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
|
|
end
|
|
|
|
def processSpaceCharacters(data)
|
|
@tree.insertText(data)
|
|
end
|
|
|
|
def processStartTag(name, attributes)
|
|
send self.class.start_tag_handlers[name], name, attributes
|
|
end
|
|
|
|
def startTagHtml(name, attributes)
|
|
if @parser.firstStartTag == false and name == 'html'
|
|
@parser.parseError(_('html needs to be the first start tag.'))
|
|
end
|
|
# XXX Need a check here to see if the first start tag token emitted is
|
|
# this token... If it's not, invoke @parser.parseError.
|
|
attributes.each do |attr, value|
|
|
unless @tree.openElements[0].attributes.has_key?(attr)
|
|
@tree.openElements[0].attributes[attr] = value
|
|
end
|
|
end
|
|
@parser.firstStartTag = false
|
|
end
|
|
|
|
def processEndTag(name)
|
|
send self.class.end_tag_handlers[name], name
|
|
end
|
|
|
|
def _(string)
|
|
string
|
|
end
|
|
|
|
def assert(value)
|
|
throw AssertionError.new unless value
|
|
end
|
|
|
|
def in_scope?(*args)
|
|
@tree.elementInScope(*args)
|
|
end
|
|
|
|
def remove_open_elements_until(name=nil)
|
|
finished = false
|
|
until finished
|
|
element = @tree.openElements.pop
|
|
finished = name.nil?? yield(element) : element.name == name
|
|
end
|
|
return element
|
|
end
|
|
|
|
end
|
|
end |