a6429f8c22
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
610 lines
20 KiB
Ruby
610 lines
20 KiB
Ruby
require 'html5/html5parser/phase'
|
|
|
|
module HTML5
|
|
class InBodyPhase < Phase
|
|
|
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
|
|
|
# Start-tag dispatch. A bare tag name is registered without a handler
# suffix; `tags => 'Suffix'` registers every listed tag under that suffix.
# NOTE(review): handle_start/handle_end are defined outside this file --
# presumably they route to the startTag<Suffix>/endTag<Suffix> methods
# of this class; confirm against Phase.
handle_start 'html'
handle_start %w(base link meta script style) => 'ProcessInHead'
handle_start 'title'

handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'

handle_start 'input', 'textarea', 'select', 'isindex', %w(marquee object)

handle_start %w(li dd dt) => 'ListItem'

handle_start %w(address blockquote center dir div dl fieldset listing menu ol p pre ul) => 'CloseP'

handle_start %w(b big em font i s small strike strong tt u) => 'Formatting'
handle_start 'nobr'

handle_start %w(area basefont bgsound br embed img param spacer wbr) => 'VoidFormatting'

handle_start %w(iframe noembed noframes noscript) => 'Cdata', HEADING_ELEMENTS => 'Heading'

handle_start %w(caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr) => 'Misplaced'

handle_start %w(event-source section nav article aside header footer datagrid command) => 'New'

# End-tag dispatch, same registration conventions as above.
handle_end 'p', 'body', 'html', 'form', %w(button marquee object), %w(dd dt li) => 'ListItem'

handle_end %w(address blockquote center div dl fieldset listing menu ol pre ul) => 'Block'

handle_end HEADING_ELEMENTS => 'Heading'

handle_end %w(a b big em font i nobr s small strike strong tt u) => 'Formatting'

handle_end %w(head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th) => 'Misplaced'

handle_end 'br'

handle_end %w(area basefont bgsound embed hr image img input isindex param spacer wbr frame) => 'None'

handle_end %w(noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'

handle_end %w(event-source section nav article aside header footer datagrid command) => 'New'
|
|
|
|
# Sets up the phase and remembers the stock whitespace handler.
#
# parser, tree - passed through unchanged to the superclass initializer.
def initialize(parser, tree)
  super(parser, tree)

  # for special handling of whitespace in <pre>: keep the regular
  # processSpaceCharacters reachable under a second name on this instance
  # so it can be re-installed after a temporary per-instance override.
  class << self
    alias processSpaceCharactersNonPre processSpaceCharacters
  end
end
|
|
|
|
# One-shot whitespace handler: sometimes (start of <pre> blocks) we want to
# drop a leading newline. It first re-installs processSpaceCharactersNonPre
# as processSpaceCharacters on this instance, then inserts the data minus a
# leading "\n" when the current node is a still-empty pre/textarea.
#
# data - String of space characters.
def processSpaceCharactersDropNewline(data)
  class << self
    begin
      remove_method :processSpaceCharacters
    rescue NameError
      # No per-instance override was installed; nothing to remove.
    end
    alias processSpaceCharacters processSpaceCharactersNonPre
  end

  current = @tree.open_elements.last
  if data.length > 0 && data[0, 1] == "\n" &&
     %w[pre textarea].include?(current.name) && !current.hasContent
    data = data[1..-1]
  end

  # Nothing left after stripping the newline: insert nothing.
  return if data.length == 0

  @tree.reconstructActiveFormattingElements
  @tree.insertText(data)
end
|
|
|
|
# Default whitespace handler: reconstructs the active formatting elements,
# then inserts the text into the tree.
#
# data - String of space characters.
def processSpaceCharacters(data)
  @tree.reconstructActiveFormattingElements
  @tree.insertText(data)
end
|
|
|
|
# Inserts character data after reconstructing the active formatting elements.
#
# data - String of characters.
def processCharacters(data)
  # XXX The specification says to do this for every character at the
  # moment, but apparently that doesn't match the real world so we don't
  # do it for space characters.
  @tree.reconstructActiveFormattingElements
  @tree.insertText(data)
end
|
|
|
|
# Forwards the start tag to the :inHead phase unchanged.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagProcessInHead(name, attributes)
  @parser.phases[:inHead].processStartTag(name, attributes)
end
|
|
|
|
# <title> inside the body: records a parse error, then lets the :inHead
# phase process the tag.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagTitle(name, attributes)
  parse_error("unexpected-start-tag-out-of-my-head", {"name" => name})
  @parser.phases[:inHead].processStartTag(name, attributes)
end
|
|
|
|
# A second <body> start tag: always a parse error. When the second open
# element is the body, attributes it does not already have are copied onto
# it; otherwise the code only asserts that the parser is in inner_html mode.
#
# name       - tag name (the error always reports "body").
# attributes - attribute collection of the stray tag.
def startTagBody(name, attributes)
  parse_error("unexpected-start-tag", {"name" => "body"})

  body = @tree.open_elements[1]
  # nil covers a stack of fewer than two elements (same guard style as
  # endTagBody), so a missing body never raises NoMethodError here.
  if body.nil? || body.name != 'body'
    assert @parser.inner_html
  else
    attributes.each do |attr, value|
      body.attributes[attr] = value unless body.attributes.has_key?(attr)
    end
  end
end
|
|
|
|
# Start tag of an element that closes an open <p> first, then is inserted.
# For <pre> the whitespace handler is swapped, on this instance only, for
# processSpaceCharactersDropNewline.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagCloseP(name, attributes)
  endTagP('p') if in_scope?('p')
  @tree.insert_element(name, attributes)
  if name == 'pre'
    class << self
      begin
        remove_method :processSpaceCharacters
      rescue NameError
        # No per-instance override installed yet; nothing to remove.
      end
      alias processSpaceCharacters processSpaceCharactersDropNewline
    end
  end
end
|
|
|
|
# <form> start tag. With a form pointer already set this is a parse error
# and nothing is inserted; otherwise an open <p> is closed, the element is
# inserted and becomes the tree's form pointer.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagForm(name, attributes)
  if @tree.formPointer
    parse_error("unexpected-start-tag", {"name" => name})
  else
    endTagP('p') if in_scope?('p')
    @tree.insert_element(name, attributes)
    @tree.formPointer = @tree.open_elements.last
  end
end
|
|
|
|
# <li>/<dd>/<dt> start tag. Closes an open <p>, then searches the stack of
# open elements from the top for a matching open list item (li for li;
# dd or dt for dd/dt). If one is found it is popped together with
# everything above it, and the elements above it are reported as missing
# their end tags. The search stops at the first special/scoping element
# other than address/div. The new element is always inserted.
#
# name       - 'li', 'dd' or 'dt'.
# attributes - attribute collection for the tag.
def startTagListItem(name, attributes)
  endTagP('p') if in_scope?('p')
  stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
  stopName = stopNames[name]

  # Iterate over a reversed copy so popping from the live stack is safe.
  @tree.open_elements.reverse.each_with_index do |node, i|
    if stopName.include?(node.name)
      poppedNodes = (0..i).collect { @tree.open_elements.pop }
      if i >= 1
        # The last popped node is the list item being closed implicitly;
        # only the i elements above it are missing an end tag.
        unclosed = poppedNodes[0...-1].collect { |n| n.name }.join(", ")
        parse_error(i == 1 ? "missing-end-tag" : "missing-end-tags", {"name" => unclosed})
      end
      break
    end

    # Phrasing elements are all non special, non scoping, non
    # formatting elements
    break if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) && !%w[address div].include?(node.name)
  end

  # Always insert an <li> element.
  @tree.insert_element(name, attributes)
end
|
|
|
|
# <plaintext> start tag: closes an open <p>, inserts the element and
# switches the tokenizer's content model flag to :PLAINTEXT.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagPlaintext(name, attributes)
  endTagP('p') if in_scope?('p')
  @tree.insert_element(name, attributes)
  @parser.tokenizer.content_model_flag = :PLAINTEXT
end
|
|
|
|
# Heading start tag: closes an open <p>, then inserts the element.
#
# name       - heading tag name.
# attributes - attribute collection for the tag.
def startTagHeading(name, attributes)
  endTagP('p') if in_scope?('p')

  # Uncomment the following for IE7 behavior:
  # HEADING_ELEMENTS.each do |element|
  #   if in_scope?(element)
  #     parse_error("unexpected-start-tag", {"name" => name})
  #
  #     remove_open_elements_until do |element|
  #       HEADING_ELEMENTS.include?(element.name)
  #     end
  #
  #     break
  #   end
  # end
  @tree.insert_element(name, attributes)
end
|
|
|
|
# <a> start tag. If an 'a' is already among the active formatting elements
# this is a parse error: it is closed via endTagFormatting and then removed
# from both the open-element stack and the active formatting list. The new
# element is then added as a formatting element.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagA(name, attributes)
  afeAElement = @tree.elementInActiveFormattingElements('a')
  if afeAElement
    parse_error("unexpected-start-tag-implies-end-tag", {"startName" => "a", "endName" => "a"})
    endTagFormatting('a')
    # delete is a no-op when the element is already gone.
    @tree.open_elements.delete(afeAElement)
    @tree.activeFormattingElements.delete(afeAElement)
  end
  @tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
|
|
|
|
# Formatting-element start tag: reconstructs the active formatting elements
# and adds the new element as a formatting element.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagFormatting(name, attributes)
  @tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
|
|
|
|
# <nobr> start tag. A <nobr> already in scope is a parse error and is
# closed through processEndTag before the new one is added.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagNobr(name, attributes)
  @tree.reconstructActiveFormattingElements
  if in_scope?('nobr')
    parse_error("unexpected-start-tag-implies-end-tag", {"startName" => "nobr", "endName" => "nobr"})
    processEndTag('nobr')
    # XXX Need tests that trigger the following
    @tree.reconstructActiveFormattingElements
  end
  addFormattingElement(name, attributes)
end
|
|
|
|
# <button> start tag. With a button already in scope: parse error, close it
# through processEndTag, then hand the tag to the parser's current phase.
# Otherwise insert the element and push a Marker onto the active
# formatting elements.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagButton(name, attributes)
  if in_scope?('button')
    parse_error("unexpected-start-tag-implies-end-tag", {"startName" => "button", "endName" => "button"})
    processEndTag('button')
    @parser.phase.processStartTag(name, attributes)
  else
    @tree.reconstructActiveFormattingElements
    @tree.insert_element(name, attributes)
    @tree.activeFormattingElements.push(Marker)
  end
end
|
|
|
|
# <marquee>/<object> start tag: inserts the element and pushes a Marker
# onto the active formatting elements.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagMarqueeObject(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
  @tree.activeFormattingElements.push(Marker)
end
|
|
|
|
# <xmp> start tag: inserts the element and switches the tokenizer's content
# model flag to :CDATA.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagXmp(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
  @parser.tokenizer.content_model_flag = :CDATA
end
|
|
|
|
# <table> start tag: closes an open <p> through processEndTag, inserts the
# element and switches the parser to the :inTable phase.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagTable(name, attributes)
  processEndTag('p') if in_scope?('p')
  @tree.insert_element(name, attributes)
  @parser.phase = @parser.phases[:inTable]
end
|
|
|
|
# Start tag of an element that is inserted and immediately popped off the
# stack of open elements again (it never stays open).
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagVoidFormatting(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
  @tree.open_elements.pop
end
|
|
|
|
# <hr> start tag: closes an open <p>, inserts the element and pops it right
# away so it never stays on the stack of open elements.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagHr(name, attributes)
  endTagP('p') if in_scope?('p')
  @tree.insert_element(name, attributes)
  @tree.open_elements.pop
end
|
|
|
|
# <image> start tag: parse error, then processed as if it were <img>.
#
# name       - tag name (unused; the replacement is always 'img').
# attributes - attribute collection, passed on to the img tag.
def startTagImage(name, attributes)
  # No really...
  parse_error("unexpected-start-tag-treated-as", {"originalName" => "image", "newName" => "img"})
  processStartTag('img', attributes)
end
|
|
|
|
# <input> start tag: inserts the element and pops it right away. The form
# pointer is consulted but nothing is done with it yet (see XXX below).
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagInput(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
  if @tree.formPointer
    # XXX Not exactly sure what to do here
    # @tree.open_elements[-1].form = @tree.formPointer
  end
  @tree.open_elements.pop
end
|
|
|
|
# <isindex> start tag: parse error; ignored when a form pointer is set.
# Otherwise it is expanded into the token sequence
# form, hr, p, label, prompt text, input, /label, /p, hr, /form.
#
# name       - tag name.
# attributes - attribute collection; its 'name' entry is set to 'isindex'
#              and the collection is passed on to the input tag.
def startTagIsindex(name, attributes)
  parse_error("deprecated-tag", {"name" => "isindex"})
  return if @tree.formPointer

  processStartTag('form', {})
  processStartTag('hr', {})
  processStartTag('p', {})
  processStartTag('label', {})
  # XXX Localization ...
  processCharacters('This is a searchable index. Insert your search keywords here: ')
  attributes['name'] = 'isindex'
  processStartTag('input', attributes)
  processEndTag('label')
  processEndTag('p')
  processStartTag('hr', {})
  processEndTag('form')
end
|
|
|
|
# <textarea> start tag: inserts the element, switches the tokenizer's
# content model flag to :RCDATA and installs
# processSpaceCharactersDropNewline as this instance's whitespace handler.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagTextarea(name, attributes)
  # XXX Form element pointer checking here as well...
  @tree.insert_element(name, attributes)
  @parser.tokenizer.content_model_flag = :RCDATA
  class << self
    begin
      remove_method :processSpaceCharacters
    rescue NameError
      # No per-instance override installed yet; nothing to remove.
    end
    alias processSpaceCharacters processSpaceCharactersDropNewline
  end
end
|
|
|
|
# iframe, noembed noframes, noscript(if scripting enabled)
|
|
# Start tag of an element whose content is tokenized as CDATA: inserts the
# element and sets the tokenizer's content model flag to :CDATA.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagCdata(name, attributes)
  @tree.insert_element(name, attributes)
  @parser.tokenizer.content_model_flag = :CDATA
end
|
|
|
|
# <select> start tag: inserts the element and switches the parser to the
# :inSelect phase.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagSelect(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
  @parser.phase = @parser.phases[:inSelect]
end
|
|
|
|
# Start tag that does not belong in this phase: recorded as a parse error
# and otherwise ignored.
#
# name       - tag name, reported in the error.
# attributes - attribute collection (unused).
def startTagMisplaced(name, attributes)
  # Elements that should be children of other elements that have a
  # different insertion mode; here they are ignored
  # "caption", "col", "colgroup", "frame", "frameset", "head",
  # "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
  # "tr", "noscript"
  parse_error("unexpected-start-tag-ignored", {"name" => name})
end
|
|
|
|
# Start tag of one of the new HTML5 elements; currently handled exactly
# like any other start tag.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagNew(name, attributes)
  # New HTML5 elements, "event-source", "section", "nav",
  # "article", "aside", "header", "footer", "datagrid", "command"
  # $stderr.puts("Warning: Undefined behaviour for start tag #{name}")
  startTagOther(name, attributes)
  #raise NotImplementedError
end
|
|
|
|
# Fallback start-tag handler: reconstructs the active formatting elements
# and inserts the element.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def startTagOther(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, attributes)
end
|
|
|
|
# </p> end tag. With a <p> in scope, elements are popped until no <p> is in
# scope any more. With none in scope, an empty <p> is opened via
# startTagCloseP and the method recurses once to close it. A parse error is
# recorded whenever the current node is not a <p>.
#
# name - tag name (the code always operates on 'p').
def endTagP(name)
  @tree.generateImpliedEndTags('p') if in_scope?('p')
  parse_error("unexpected-end-tag", {"name" => "p"}) unless @tree.open_elements.last.name == 'p'
  if in_scope?('p')
    @tree.open_elements.pop while in_scope?('p')
  else
    startTagCloseP('p', {})
    endTagP('p')
  end
end
|
|
|
|
# </body> end tag. When the second open element is not the body (inner_html
# case) the tag is a parse error and ignored. Otherwise a parse error is
# recorded if the current node is not the body, and the parser moves to the
# :afterBody phase.
#
# name - tag name (unused; messages always refer to 'body').
def endTagBody(name)
  # XXX Need to take open <p> tags into account here. We shouldn't imply
  # </p> but we should not throw a parse error either. Specification is
  # likely to be updated.
  body = @tree.open_elements[1]
  unless body && body.name == 'body'
    # inner_html case
    parse_error("unexpected-end-tag", {"name" => "body"})
    return
  end

  current = @tree.open_elements.last
  unless current.name == 'body'
    parse_error("expected-one-end-tag-but-got-another",
                {"expectedName" => "body", "gotName" => current.name})
  end
  @parser.phase = @parser.phases[:afterBody]
end
|
|
|
|
# </html> end tag: handled as </body>, then (unless parsing inner_html) the
# tag is passed to whatever phase the parser is in afterwards.
#
# name - tag name.
def endTagHtml(name)
  endTagBody(name)
  @parser.phase.processEndTag(name) unless @parser.inner_html
end
|
|
|
|
# End tag of a block element. When the element is in scope, implied end
# tags are generated and the stack is popped up to and including it. A
# parse error is recorded when the current node is not that element.
#
# name - tag name.
def endTagBlock(name)
  @tree.generateImpliedEndTags if in_scope?(name)

  unless @tree.open_elements.last.name == name
    parse_error("end-tag-too-early", {"name" => name})
  end

  remove_open_elements_until(name) if in_scope?(name)
end
|
|
|
|
# </form> end tag. The form is popped only when it is the current node;
# otherwise a parse error is recorded and the stack is left alone. The
# tree's form pointer is cleared in both cases.
#
# name - tag name.
def endTagForm(name)
  @tree.generateImpliedEndTags if in_scope?(name)

  if @tree.open_elements.last.name != name
    parse_error("end-tag-too-early-ignored", {"name" => "form"})
  else
    @tree.open_elements.pop
  end
  @tree.formPointer = nil
end
|
|
|
|
# </li>, </dd>, </dt> end tag. Same shape as endTagBlock, except that the
# element's own name is passed to generateImpliedEndTags.
#
# name - tag name.
def endTagListItem(name)
  # AT Could merge this with the Block case
  @tree.generateImpliedEndTags(name) if in_scope?(name)

  unless @tree.open_elements.last.name == name
    parse_error("end-tag-too-early", {"name" => name})
  end

  remove_open_elements_until(name) if in_scope?(name)
end
|
|
|
|
# Heading end tag. If any heading element is in scope, implied end tags are
# generated and the stack is popped up to and including the nearest open
# heading (of any level). A parse error is recorded when the current node
# is not the named heading.
#
# name - heading tag name.
def endTagHeading(name)
  @tree.generateImpliedEndTags if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }

  unless @tree.open_elements.last.name == name
    parse_error("end-tag-too-early", {"name" => name})
  end

  # Scope is deliberately re-checked after the implied end tags ran.
  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    remove_open_elements_until { |node| HEADING_ELEMENTS.include?(node.name) }
  end
end
|
|
|
|
# The much-feared adoption agency algorithm
|
|
# End tag of a formatting element, implemented with the adoption agency
# algorithm:
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# The outer loop repeats until one of the early exits (steps 1 and 3)
# returns. Step numbers below are the original author's markers.
# XXX Better parse_error messages appreciated.
#
# name - tag name of the formatting element being closed.
def endTagFormatting(name)
  while true
    # Step 1 paragraph 1
    afeElement = @tree.elementInActiveFormattingElements(name)
    if !afeElement || (@tree.open_elements.include?(afeElement) && !in_scope?(afeElement.name))
      parse_error("adoption-agency-1.1", {"name" => name})
      return
    # Step 1 paragraph 2
    elsif !@tree.open_elements.include?(afeElement)
      parse_error("adoption-agency-1.2", {"name" => name})
      @tree.activeFormattingElements.delete(afeElement)
      return
    end

    # Step 1 paragraph 3
    if afeElement != @tree.open_elements.last
      parse_error("adoption-agency-1.3", {"name" => name})
    end

    # Step 2
    # Start of the adoption agency algorithm proper
    afeIndex = @tree.open_elements.index(afeElement)
    # First special/scoping element at or above the formatting element.
    furthestBlock = @tree.open_elements[afeIndex..-1].find do |candidate|
      (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(candidate.name)
    end

    # Step 3
    if furthestBlock.nil?
      removed = remove_open_elements_until { |popped| popped == afeElement }
      @tree.activeFormattingElements.delete(removed)
      return
    end
    commonAncestor = @tree.open_elements[afeIndex - 1]

    # Step 5
    furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent

    # Step 6
    # The bookmark is supposed to help us identify where to reinsert
    # nodes in step 12. We have to ensure that we reinsert nodes after
    # the node before the active formatting element. Note the bookmark
    # can move in step 7.4
    bookmark = @tree.activeFormattingElements.index(afeElement)

    # Step 7
    lastNode = node = furthestBlock
    while true
      # AT replace this with a function and recursion?
      # Node is element before node in open elements
      node = @tree.open_elements[@tree.open_elements.index(node) - 1]
      # Drop every non-formatting element on the way down the stack.
      until @tree.activeFormattingElements.include?(node)
        tmpNode = node
        node = @tree.open_elements[@tree.open_elements.index(node) - 1]
        @tree.open_elements.delete(tmpNode)
      end
      # Step 7.3
      break if node == afeElement
      # Step 7.4
      if lastNode == furthestBlock
        # XXX should this be index(node) or index(node)+1
        # Anne: I think +1 is ok. Given x = [2,3,4,5]
        # x.index(3) gives 1 and then x[1 +1] gives 4...
        bookmark = @tree.activeFormattingElements.index(node) + 1
      end
      # Step 7.5
      if node.hasContent
        clone = node.cloneNode
        # Replace node with clone
        @tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
        @tree.open_elements[@tree.open_elements.index(node)] = clone
        node = clone
      end
      # Step 7.6
      # Remove lastNode from its parents, if any
      lastNode.parent.removeChild(lastNode) if lastNode.parent
      node.appendChild(lastNode)
      # Step 7.7
      lastNode = node
      # End of inner loop
    end

    # Step 8
    lastNode.parent.removeChild(lastNode) if lastNode.parent
    commonAncestor.appendChild(lastNode)

    # Step 9
    clone = afeElement.cloneNode

    # Step 10
    furthestBlock.reparentChildren(clone)

    # Step 11
    furthestBlock.appendChild(clone)

    # Step 12
    @tree.activeFormattingElements.delete(afeElement)
    @tree.activeFormattingElements.insert([bookmark, @tree.activeFormattingElements.length].min, clone)

    # Step 13
    @tree.open_elements.delete(afeElement)
    @tree.open_elements.insert(@tree.open_elements.index(furthestBlock) + 1, clone)
  end
end
|
|
|
|
# </button>, </marquee>, </object> end tag. When the element is in scope,
# the stack is popped up to and including it and
# clearActiveFormattingElements is called on the tree. A parse error is
# recorded when the current node is not that element.
#
# name - tag name.
def endTagButtonMarqueeObject(name)
  @tree.generateImpliedEndTags if in_scope?(name)

  unless @tree.open_elements.last.name == name
    parse_error("end-tag-too-early", {"name" => name})
  end

  if in_scope?(name)
    remove_open_elements_until(name)

    @tree.clearActiveFormattingElements
  end
end
|
|
|
|
# End tag that belongs to another insertion mode: parse error, nothing else.
#
# name - tag name, reported in the error.
def endTagMisplaced(name)
  # This handles elements with end tags in other insertion modes.
  parse_error("unexpected-end-tag", {"name" => name})
end
|
|
|
|
# </br> end tag: parse error, then an attribute-less element of the given
# name is inserted and immediately popped.
#
# name - tag name used for the inserted element.
def endTagBr(name)
  parse_error("unexpected-end-tag-treated-as",
              {"originalName" => "br", "newName" => "br element"})
  @tree.reconstructActiveFormattingElements
  @tree.insert_element(name, {})
  @tree.open_elements.pop
end
|
|
|
|
# End tag for an element that has no end tag: parse error, nothing else.
#
# name - tag name, reported in the error.
def endTagNone(name)
  # This handles elements with no end tag.
  parse_error("no-end-tag", {"name" => name})
end
|
|
|
|
# End tag of a CDATA/RCDATA element: popped when it is the current node,
# otherwise a parse error and the stack is left alone.
#
# name - tag name.
def endTagCdataTextAreaXmp(name)
  if @tree.open_elements.last.name == name
    @tree.open_elements.pop
  else
    parse_error("unexpected-end-tag", {"name" => name})
  end
end
|
|
|
|
# End tag of one of the new HTML5 elements; currently handled exactly like
# any other end tag.
#
# name - tag name.
def endTagNew(name)
  # New HTML5 elements, "event-source", "section", "nav",
  # "article", "aside", "header", "footer", "datagrid", "command"
  # STDERR.puts "Warning: Undefined behaviour for end tag #{name}"
  endTagOther(name)
  #raise NotImplementedError
end
|
|
|
|
# Fallback end-tag handler. Walks the stack of open elements from the top:
# on the first element with the given name, implied end tags are generated
# and the stack is popped up to and including that element (parse error if
# it was not the current node). If a special/scoping element is reached
# first, the tag is a parse error and ignored.
#
# name - tag name.
def endTagOther(name)
  # XXX This logic should be moved into the treebuilder
  # Iterate over a reversed copy; the live stack is mutated below.
  @tree.open_elements.reverse.each do |node|
    if node.name == name
      @tree.generateImpliedEndTags

      unless @tree.open_elements.last.name == name
        parse_error("unexpected-end-tag", {"name" => name})
      end

      remove_open_elements_until { |popped| popped == node }

      break
    elsif (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
      parse_error("unexpected-end-tag", {"name" => name})
      break
    end
  end
end
|
|
|
|
protected
|
|
|
|
# Inserts the element and appends the resulting current node to the list of
# active formatting elements.
#
# name       - tag name.
# attributes - attribute collection for the tag.
def addFormattingElement(name, attributes)
  @tree.insert_element(name, attributes)
  @tree.activeFormattingElements.push(@tree.open_elements.last)
end
|
|
|
|
end
|
|
end
|