Merged with latest trunk.

This commit is contained in:
Jason Blevins 2007-06-04 22:47:59 -04:00
commit aadfb55342
38 changed files with 4839 additions and 4849 deletions

View file

@ -8,7 +8,19 @@ OPTIONS = {
:ip => "0.0.0.0", :ip => "0.0.0.0",
:environment => "production", :environment => "production",
:server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"), :server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"),
:server_type => WEBrick::SimpleServer :server_type => WEBrick::SimpleServer,
:mime_types => WEBrick::HTTPUtils::DefaultMimeTypes.merge({
'avi' => 'video/x-msvideo',
'gz' => 'application/x-gzip',
'js' => 'application/x-javascript',
'nb' => 'application/mathematica',
'pdf' => 'application/pdf',
'svg' => 'application/svg+xml',
'tar' => 'application/x-tar',
'tex' => 'application/x-tex',
'xml' => 'application/xml',
'xslt' => 'application/xslt+xml'
})
} }
ARGV.options do |opts| ARGV.options do |opts|

View file

@ -2,6 +2,6 @@ require 'rake'
require 'rake/testtask' require 'rake/testtask'
Rake::TestTask.new do |task| Rake::TestTask.new do |task|
task.pattern = 'tests/test_*.rb' task.pattern = 'tests/test_*.rb'
task.verbose = true task.verbose = true
end end

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the parser's "after body" state. Comments are attached
  # to the root of the open-element stack; stray characters and tags are
  # reported as parse errors and then re-dispatched through the :inBody phase.
  class AfterBodyPhase < Phase

    handle_end 'html'

    # Insert the comment under the first open element (the <html> element)
    # rather than under whatever element is currently open.
    def processComment(data)
      root = @tree.openElements[0]
      @tree.insertComment(data, root)
    end

    # Report the characters, switch to :inBody and let that phase handle them.
    def processCharacters(data)
      message = _('Unexpected non-space characters in the after body phase.')
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processCharacters(data)
    end

    # Report the start tag, switch to :inBody and let that phase handle it.
    def processStartTag(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the after body phase.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processStartTag(name, attributes)
    end

    # </html>: a parse error in the innerHTML case, otherwise remember the
    # current phase and move on to :trailingEnd.
    def endTagHtml(name)
      return @parser.parseError if @parser.innerHTML

      # XXX: This may need to be done, not sure
      # Don't set lastPhase to the current phase but to the inBody phase
      # instead. No need for extra parse errors if there's something after
      # </html>. Try "<!doctype html>X</html>X" for instance.
      @parser.lastPhase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Report the end tag, switch to :inBody and let that phase handle it.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the after body phase.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processEndTag(name)
    end
  end
end

View file

@ -0,0 +1,34 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "after frameset" state. Everything except
  # <noframes> and </html> is reported as a parse error and ignored.
  class AfterFramesetPhase < Phase
    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    handle_start 'html', 'noframes'
    handle_end 'html'

    # Character data is not allowed here: report it and drop it.
    def processCharacters(data)
      message = _('Unexpected non-space characters in the after frameset phase. Ignored.')
      @parser.parseError(message)
    end

    # <noframes> is delegated to the :inBody phase without switching phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag (#{name}) in the after frameset phase. Ignored.")
      @parser.parseError(message)
    end

    # </html>: remember the current phase and switch to :trailingEnd.
    def endTagHtml(name)
      @parser.lastPhase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}) in the after frameset phase. Ignored.")
      @parser.parseError(message)
    end
  end
end

View file

@ -0,0 +1,50 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "after head" state: waits for <body> or <frameset>
  # and, for any other token, inserts an implied <body> before handing the
  # token to the phase that is then current.
  class AfterHeadPhase < Phase

    handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

    # End of input: imply <body>, then let the new phase see the EOF.
    def processEOF
      anythingElse
      @parser.phase.processEOF
    end

    # Character data: imply <body>, then reprocess the characters.
    def processCharacters(data)
      anythingElse
      @parser.phase.processCharacters(data)
    end

    # An explicit <body>: insert it and continue in :inBody.
    def startTagBody(name, attributes)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inBody]
    end

    # An explicit <frameset>: insert it and continue in :inFrameset.
    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inFrameset]
    end

    # Head-only elements seen after the head: report them, switch back to
    # :inHead and reprocess the tag there.
    def startTagFromHead(name, attributes)
      message = _("Unexpected start tag (#{name}) that can be in head. Moved.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inHead]
      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag: imply <body>, then reprocess the tag.
    def startTagOther(name, attributes)
      anythingElse
      @parser.phase.processStartTag(name, attributes)
    end

    # Every end tag: imply <body>, then reprocess the tag.
    def processEndTag(name)
      anythingElse
      @parser.phase.processEndTag(name)
    end

    # Insert an attribute-less <body> element and switch to :inBody.
    def anythingElse
      @tree.insertElement('body', {})
      @parser.phase = @parser.phases[:inBody]
    end
  end
end

View file

@ -0,0 +1,41 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "before head" state. Every token except a stray end
  # tag causes a <head> element (explicit or implied) to be inserted, after
  # which the token is reprocessed by the phase that is then current.
  class BeforeHeadPhase < Phase

    handle_start 'html', 'head'
    handle_end 'html'

    # End of input: imply <head>, then let the new phase see the EOF.
    def processEOF
      startTagHead('head', {})
      @parser.phase.processEOF
    end

    # Character data: imply <head>, then reprocess the characters.
    def processCharacters(data)
      startTagHead('head', {})
      @parser.phase.processCharacters(data)
    end

    # Insert the head element, remember it as the tree's head pointer and
    # continue in :inHead.
    def startTagHead(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.headPointer = @tree.openElements[-1]
      @parser.phase = @parser.phases[:inHead]
    end

    # Any other start tag: imply <head>, then reprocess the tag.
    def startTagOther(name, attributes)
      startTagHead('head', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # </html>: imply <head>, then reprocess the end tag.
    def endTagHtml(name)
      startTagHead('head', {})
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is reported as a parse error and dropped.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}) after the (implied) root element.")
      @parser.parseError(message)
    end
  end
end

View file

@ -0,0 +1,548 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
def initialize(parser, tree)
super(parser, tree)
# for special handling of whitespace in <pre>
@processSpaceCharactersPre = false
end
# Whitespace handler used for the first whitespace token after a <pre> start
# tag. Clears the one-shot flag, drops a single leading newline when the
# current node is an empty <pre>, and inserts whatever text remains.
def processSpaceCharactersPre(data)
  @processSpaceCharactersPre = false
  if data.length > 0 && data[0] == ?\n
    current = @tree.openElements[-1]
    # Only strip the newline directly after <pre>, before any content.
    data = data[1..-1] if current.name == 'pre' && !current.hasContent
  end
  @tree.insertText(data) unless data.length == 0
end
def processSpaceCharacters(data)
if @processSpaceCharactersPre
processSpaceCharactersPre(data)
else
super(data)
end
end
def processCharacters(data)
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
# do it for space characters.
@tree.reconstructActiveFormattingElements
@tree.insertText(data)
end
def startTagScriptStyle(name, attributes)
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes)
end
# A second <body> start tag is always a parse error. If a body element is
# the second entry of the open-element stack, attributes it does not already
# have are copied onto it; otherwise only the innerHTML case is expected.
def startTagBody(name, attributes)
  @parser.parseError(_('Unexpected start tag (body).'))
  stack = @tree.openElements
  if stack.length == 1 || stack[1].name != 'body'
    assert @parser.innerHTML
  else
    attributes.each do |key, val|
      existing = stack[1].attributes
      # Attributes already present on the body element win.
      existing[key] = val unless existing.has_key?(key)
    end
  end
end
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@processSpaceCharactersPre = true if name == 'pre'
end
# Handle a <form> start tag. When the tree already has a form pointer the
# tag is reported and ignored; otherwise an open <p> is closed, the element
# is inserted and it becomes the tree's form pointer.
def startTagForm(name, attributes)
  if @tree.formPointer
    # Route the message through _() like every other parse-error message.
    @parser.parseError(_('Unexpected start tag (form). Ignored.'))
  else
    endTagP('p') if in_scope?('p')
    @tree.insertElement(name, attributes)
    @tree.formPointer = @tree.openElements[-1]
  end
end
def startTagListItem(name, attributes)
endTagP('p') if in_scope?('p')
stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
stopName = stopNames[name]
@tree.openElements.reverse.each_with_index do |node, i|
if stopName.include?(node.name)
(i + 1).times { @tree.openElements.pop }
break
end
# Phrasing elements are all non special, non scoping, non
# formatting elements
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
not ['address', 'div'].include?(node.name))
end
# Always insert an <li> element.
@tree.insertElement(name, attributes)
end
def startTagPlaintext(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :PLAINTEXT
end
def startTagHeading(name, attributes)
endTagP('p') if in_scope?('p')
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@parser.parseError(_("Unexpected start tag (#{name})."))
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
break
end
end
@tree.insertElement(name, attributes)
end
def startTagA(name, attributes)
if afeAElement = @tree.elementInActiveFormattingElements('a')
@parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
endTagFormatting('a')
@tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement)
@tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
end
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagButton(name, attributes)
if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
processEndTag('button')
@parser.phase.processStartTag(name, attributes)
else
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
end
def startTagMarqueeObject(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
def startTagXmp(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagTable(name, attributes)
processEndTag('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inTable]
end
def startTagVoidFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagHr(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagImage(name, attributes)
# No really...
@parser.parseError(_('Unexpected start tag (image). Treated as img.'))
processStartTag('img', attributes)
end
def startTagInput(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
if @tree.formPointer
# XXX Not exactly sure what to do here
# @tree.openElements[-1].form = @tree.formPointer
end
@tree.openElements.pop
end
# Handle the obsolete <isindex> start tag by synthesising the equivalent
# form / hr / p / label / input token sequence. Always a parse error; nothing
# is synthesised when the tree already has a form pointer.
def startTagIsindex(name, attributes)
  @parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
  return if @tree.formPointer
  processStartTag('form', {})
  processStartTag('hr', {})
  processStartTag('p', {})
  processStartTag('label', {})
  # XXX Localization ...
  processCharacters('This is a searchable index. Insert your search keywords here:')
  # Give <input> its own attribute hash so the caller's hash is neither
  # mutated nor shared with the new element.
  processStartTag('input', attributes.merge('name' => 'isindex'))
  processEndTag('label')
  processEndTag('p')
  processStartTag('hr', {})
  processEndTag('form')
end
def startTagTextarea(name, attributes)
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
end
# iframe, noembed noframes, noscript(if scripting enabled)
def startTagCdata(name, attributes)
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagSelect(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inSelect]
end
def startTagMisplaced(name, attributes)
# Elements that should be children of other elements that have a
# different insertion mode; here they are ignored
# "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript"
@parser.parseError(_("Unexpected start tag (#{name}). Ignored."))
end
# Start tags of the new HTML5 elements ("event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"). Their
# handling is not specified yet, so a warning is written to standard error
# and the tag is treated like any other start tag.
def startTagNew(name, attributes)
  # Was `sys.stderr.write`, a Python leftover that raises NameError in Ruby.
  $stderr.puts "Warning: Undefined behaviour for start tag #{name}"
  startTagOther(name, attributes)
  #raise NotImplementedError
end
def startTagOther(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
end
# Handle </p>: generate implied end tags (except for p itself) when a p is
# in scope, report a parse error unless the current node is a p, then pop
# the open-element stack until no p remains in scope.
def endTagP(name)
  @tree.generateImpliedEndTags('p') if in_scope?('p')
  unless @tree.openElements[-1].name == 'p'
    # Route the message through _() like every other parse-error message.
    @parser.parseError(_('Unexpected end tag (p).'))
  end
  @tree.openElements.pop while in_scope?('p')
end
# Handle </body>. When the second entry of the open-element stack is not a
# body element (innerHTML case) the tag is reported and ignored; otherwise a
# parse error is raised if other elements are still open and the parser
# moves to the :afterBody phase.
def endTagBody(name)
  # XXX Need to take open <p> tags into account here. We shouldn't imply
  # </p> but we should not throw a parse error either. Specification is
  # likely to be updated.
  body = @tree.openElements[1]
  # The stack may hold only the root element (startTagBody guards the same
  # case), so check for nil before reading the name.
  if body.nil? || body.name != 'body'
    # innerHTML case
    @parser.parseError
    return
  end
  current = @tree.openElements[-1]
  unless current.name == 'body'
    @parser.parseError(_("Unexpected end tag (body). Missing end tag (#{current.name})."))
  end
  @parser.phase = @parser.phases[:afterBody]
end
def endTagHtml(name)
endTagBody(name)
@parser.phase.processEndTag(name) unless @parser.innerHTML
end
# Handle the end tag of a block-level element (also used for </form>).
# Resets the <pre> whitespace flag, generates implied end tags when the
# element is in scope, reports a parse error when the current node is a
# different element, and unwinds the stack up to the named element.
def endTagBlock(name)
  # Put us back in the right whitespace handling mode
  @processSpaceCharactersPre = false if name == 'pre'
  @tree.generateImpliedEndTags if in_scope?(name)
  unless @tree.openElements[-1].name == name
    # The original had doubled parentheses here where the _() wrapper
    # used by the other messages was dropped.
    @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
  end
  remove_open_elements_until(name) if in_scope?(name)
end
def endTagForm(name)
endTagBlock(name)
@tree.formPointer = nil
end
# Handle </li>, </dd> and </dt>. When the element is in scope, implied end
# tags (except for the element itself) are generated, a parse error is
# reported if the current node is a different element, and the stack is
# unwound up to the named element. Out-of-scope end tags do nothing.
def endTagListItem(name)
  # AT Could merge this with the Block case
  if in_scope?(name)
    @tree.generateImpliedEndTags(name)
    unless @tree.openElements[-1].name == name
      # The original had doubled parentheses here where the _() wrapper
      # used by the other messages was dropped.
      @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
    end
  end
  remove_open_elements_until(name) if in_scope?(name)
end
# Handle </h1> .. </h6>. When any heading element is in scope, implied end
# tags are generated; a parse error is reported if the current node is not
# the named element; then, if a heading is (still) in scope, the stack is
# unwound up to the nearest heading element of any level.
def endTagHeading(name)
  @tree.generateImpliedEndTags if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
  unless @tree.openElements[-1].name == name
    # The original had doubled parentheses here where the _() wrapper
    # used by the other messages was dropped.
    @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
  end
  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    # Block parameter renamed so it no longer shadows the loop variable.
    remove_open_elements_until { |node| HEADING_ELEMENTS.include?(node.name) }
  end
end
# The much-feared adoption agency algorithm
#
# Handles the end tags registered with handle_end ... => 'Formatting'
# (a, b, big, em, font, ...) and is also called directly by startTagA.
# The step numbers in the comments below refer to the algorithm at the URL
# inside the method. The outer loop repeats the whole procedure until one of
# the `return` statements is reached.
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while true
# Step 1 paragraph 1
# Nothing to do if no matching active formatting element exists, or if it
# is on the open-element stack but not in scope.
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
# The element is in the active formatting list but no longer open: just
# drop it from that list.
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
# Mis-nested but recoverable: report it and carry on.
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
# furthestBlock is the first element at or after afeElement on the stack
# whose name is in SPECIAL_ELEMENTS + SCOPING_ELEMENTS.
afeIndex = @tree.openElements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
end
end
# Step 3
# No furthest block: unwind the stack through afeElement and remove it from
# the active formatting list.
# NOTE(review): presumably remove_open_elements_until returns the element
# for which the block was true -- confirm in Phase.
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
# Presumably step 4 (unlabelled here): the element just below afeElement on
# the stack. NOTE(review): if afeIndex were 0 the index -1 would select the
# last stack entry -- confirm this cannot happen.
commonAncestor = @tree.openElements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = @tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
# Walk down the stack, deleting every element that is not in the active
# formatting list.
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
# Step 7.4
if lastNode == furthestBlock
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = @tree.activeFormattingElements.index(node) + 1
end
# Step 7.5
# NOTE(review): `cite` is assigned but never read again in this method.
cite = node.parent
if node.hasContent
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
node = clone
end
# Step 7.6
# Remove lastNode from its parents, if any
lastNode.parent.removeChild(lastNode) if lastNode.parent
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
end
# Step 8
# Re-attach the chain built in step 7 under the common ancestor.
lastNode.parent.removeChild(lastNode) if lastNode.parent
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
# Swap afeElement for its clone in the active formatting list, at the
# bookmark position clamped to the list length.
@tree.activeFormattingElements.delete(afeElement)
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
# Swap afeElement for its clone on the stack, directly after furthestBlock.
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
end
end
def endTagButtonMarqueeObject(name)
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
end
if in_scope?(name)
remove_open_elements_until(name)
@tree.clearActiveFormattingElements
end
end
def endTagMisplaced(name)
# This handles elements with end tags in other insertion modes.
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagNone(name)
# This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag"))
end
def endTagCdataTextAreaXmp(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagNew(name)
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"
STDERR.puts "Warning: Undefined behaviour for end tag #{name}"
endTagOther(name)
#raise NotImplementedError
end
def endTagOther(name)
# XXX This logic should be moved into the treebuilder
@tree.openElements.reverse.each do |node|
if node.name == name
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name})."))
end
remove_open_elements_until { |element| element == node }
break
else
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
break
end
end
end
end
protected
def addFormattingElement(name, attributes)
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(@tree.openElements[-1])
end
end
end

View file

@ -0,0 +1,68 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "in caption" state. Table-structure tags close the
  # caption first; everything else is delegated to the :inBody phase.
  class InCaptionPhase < Phase
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

    handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableElement'
    handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # True when a </caption> would be ignored, i.e. no caption is in table scope.
    def ignoreEndTagCaption
      !in_scope?('caption', true)
    end

    # Character data is handled exactly as in the :inBody phase.
    def processCharacters(data)
      @parser.phases[:inBody].processCharacters(data)
    end

    # A table-structure start tag implies </caption>; the tag is reprocessed
    # unless that implied end tag was ignored.
    def startTagTableElement(name, attributes)
      @parser.parseError
      #XXX Have to duplicate logic here to find out if the tag is ignored
      caption_ignored = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      @parser.phase.processStartTag(name, attributes) unless caption_ignored
    end

    # Any other start tag is handled as in the :inBody phase.
    def startTagOther(name, attributes)
      @parser.phases[:inBody].processStartTag(name, attributes)
    end

    # </caption>: close the caption and return to :inTable, or report a
    # parse error in the innerHTML case where no caption is in scope.
    def endTagCaption(name)
      if ignoreEndTagCaption
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      # AT this code is quite similar to endTagTable in "InTable"
      @tree.generateImpliedEndTags
      if @tree.openElements[-1].name != 'caption'
        @parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
      end
      remove_open_elements_until('caption')
      @tree.clearActiveFormattingElements
      @parser.phase = @parser.phases[:inTable]
    end

    # </table> implies </caption>; the tag is reprocessed unless that
    # implied end tag was ignored.
    def endTagTable(name)
      @parser.parseError
      caption_ignored = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      @parser.phase.processEndTag(name) unless caption_ignored
    end

    # End tags that make no sense inside a caption are reported and dropped.
    def endTagIgnore(name)
      message = _("Unexpected end tag (#{name}). Ignored.")
      @parser.parseError(message)
    end

    # Any other end tag is handled as in the :inBody phase.
    def endTagOther(name)
      @parser.phases[:inBody].processEndTag(name)
    end
  end
end

View file

@ -0,0 +1,78 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'
handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'
handle_end %w( table tbody tfoot thead tr ) => 'Imply'
def processCharacters(data)
@parser.phases[:inBody].processCharacters(data)
end
def startTagTableOther(name, attributes)
if in_scope?('td', true) or in_scope?('th', true)
closeCell
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
end
end
def startTagOther(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
# Handle </td> and </th>. When the cell is in table scope, implied end tags
# (except for the cell itself) are generated, the stack is unwound to the
# cell (with a parse error if other elements were still open), the active
# formatting elements are cleared and the parser returns to :inRow.
# Otherwise the tag is reported and ignored.
def endTagTableCell(name)
  if in_scope?(name, true)
    @tree.generateImpliedEndTags(name)
    if @tree.openElements[-1].name != name
      # Route the message through _() like every other parse-error message.
      @parser.parseError(_("Got table cell end tag (#{name}) while required end tags are missing."))
      remove_open_elements_until(name)
    else
      @tree.openElements.pop
    end
    @tree.clearActiveFormattingElements
    @parser.phase = @parser.phases[:inRow]
  else
    @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
  end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagImply(name)
if in_scope?(name, true)
closeCell
@parser.phase.processEndTag(name)
else
# sometimes innerHTML case
@parser.parseError
end
end
def endTagOther(name)
@parser.phases[:inBody].processEndTag(name)
end
protected
def closeCell
if in_scope?('td', true)
endTagTableCell('td')
elsif in_scope?('th', true)
endTagTableCell('th')
end
end
end
end

View file

@ -0,0 +1,55 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "in column group" state. Only <col> and </colgroup>
  # are handled here; every other token implies </colgroup> and is then
  # reprocessed unless that implied end tag had to be ignored.
  class InColumnGroupPhase < Phase
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    handle_start 'html', 'col'
    handle_end 'colgroup', 'col'

    # True when the current node is the root <html> element, in which case
    # a </colgroup> is ignored.
    def ignoreEndTagColgroup
      current = @tree.openElements[-1]
      current.name == 'html'
    end

    # Character data implies </colgroup> and is then reprocessed.
    def processCharacters(data)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup("colgroup")
      @parser.phase.processCharacters(data) unless colgroup_ignored
    end

    # <col> is inserted and immediately popped again.
    def startTagCol(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # Any other start tag implies </colgroup> and is then reprocessed.
    def startTagOther(name, attributes)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      @parser.phase.processStartTag(name, attributes) unless colgroup_ignored
    end

    # </colgroup>: pop the column group and return to :inTable, or report a
    # parse error in the innerHTML case.
    def endTagColgroup(name)
      if ignoreEndTagColgroup
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      @tree.openElements.pop
      @parser.phase = @parser.phases[:inTable]
    end

    # </col> is always a parse error.
    def endTagCol(name)
      message = _('Unexpected end tag (col). col has no end tag.')
      @parser.parseError(message)
    end

    # Any other end tag implies </colgroup> and is then reprocessed.
    def endTagOther(name)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      @parser.phase.processEndTag(name) unless colgroup_ignored
    end
  end
end

View file

@ -0,0 +1,57 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "in frameset" state. Only frameset, frame and
  # noframes tags are meaningful; everything else is reported and ignored.
  class InFramesetPhase < Phase
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    handle_start 'html', 'frameset', 'frame', 'noframes'
    handle_end 'frameset', 'noframes'

    # Character data is reported and dropped.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      @parser.parseError(message)
    end

    # A nested <frameset> is simply inserted.
    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
    end

    # <frame> is inserted and immediately popped again.
    def startTagFrame(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # <noframes> is delegated to the :inBody phase without switching phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
      @parser.parseError(message)
    end

    # </frameset>: pop the current frameset (a parse error if the current
    # node is the root element) and, outside innerHTML mode, switch to
    # :afterFrameset once the current node is no longer a frameset.
    def endTagFrameset(name)
      if @tree.openElements[-1].name == 'html'
        # innerHTML case
        @parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
      else
        @tree.openElements.pop
      end

      # Stay in this phase while in innerHTML mode or while the current node
      # is still a frameset element.
      return if @parser.innerHTML || @tree.openElements[-1].name == 'frameset'
      @parser.phase = @parser.phases[:afterFrameset]
    end

    # </noframes> is delegated to the :inBody phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
      @parser.parseError(message)
    end
  end
end

View file

@ -0,0 +1,120 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "in head" state: inserts head-level elements and
  # leaves the head (via anythingElse) as soon as any other token arrives.
  class InHeadPhase < Phase

    handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
    handle_end 'head', 'html', %w( title style script )

    # End of input: close an unfinished title/style/script with a parse
    # error, leave the head and let the new phase see the EOF.
    def processEOF
      name = @tree.openElements[-1].name
      if %w( title style script ).include?(name)
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
        @tree.openElements.pop
      end
      anythingElse
      @parser.phase.processEOF
    end

    # Text inside title/style/script is inserted; any other text ends the
    # head and is reprocessed by the new phase.
    def processCharacters(data)
      current_name = @tree.openElements[-1].name
      return @tree.insertText(data) if %w( title style script ).include?(current_name)

      anythingElse
      @parser.phase.processCharacters(data)
    end

    # A second <head> is reported and ignored.
    def startTagHead(name, attributes)
      message = _('Unexpected start tag head in existing head. Ignored')
      @parser.parseError(message)
    end

    # <title>: append to the head, open it and switch the tokenizer to RCDATA.
    def startTagTitle(name, attributes)
      element = @tree.createElement(name, attributes)
      appendToHead(element)
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :RCDATA
    end

    # <style>: append to the head when a head pointer exists and this is the
    # current phase, otherwise to the current node; then switch the tokenizer
    # to CDATA.
    def startTagStyle(name, attributes)
      element = @tree.createElement(name, attributes)
      in_head = !@tree.headPointer.nil? && @parser.phase == @parser.phases[:inHead]
      if in_head
        appendToHead(element)
      else
        @tree.openElements[-1].appendChild(element)
      end
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    # <script>: as for <style>, with the element additionally flagged as
    # "parser-inserted".
    def startTagScript(name, attributes)
      #XXX Inner HTML case may be wrong
      element = @tree.createElement(name, attributes)
      element._flags.push("parser-inserted")
      in_head = !@tree.headPointer.nil? && @parser.phase == @parser.phases[:inHead]
      if in_head
        appendToHead(element)
      else
        @tree.openElements[-1].appendChild(element)
      end
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    # <base>, <link> and <meta> are appended to the head and never opened.
    def startTagBaseLinkMeta(name, attributes)
      element = @tree.createElement(name, attributes)
      appendToHead(element)
    end

    # Any other start tag ends the head and is reprocessed.
    def startTagOther(name, attributes)
      anythingElse
      @parser.phase.processStartTag(name, attributes)
    end

    # </head>: pop the head when it is the current node (a parse error
    # otherwise) and move to :afterHead in either case.
    def endTagHead(name)
      if @tree.openElements[-1].name == 'head'
        @tree.openElements.pop
      else
        @parser.parseError(_("Unexpected end tag (head). Ignored."))
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    # </html> ends the head and is reprocessed.
    def endTagHtml(name)
      anythingElse
      @parser.phase.processEndTag(name)
    end

    # </title>, </style>, </script>: pop the element when it is the current
    # node, otherwise report and ignore the tag.
    def endTagTitleStyleScript(name)
      return @tree.openElements.pop if @tree.openElements[-1].name == name

      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}). Ignored.")
      @parser.parseError(message)
    end

    # Leave the head: act as if </head> had been seen when the head is the
    # current node, otherwise just switch to :afterHead.
    def anythingElse
      if @tree.openElements[-1].name == 'head'
        endTagHead('head')
      else
        @parser.phase = @parser.phases[:afterHead]
      end
    end

    protected

    # Append element to the head element, or to the current node when no
    # head pointer exists (expected only in the innerHTML case).
    def appendToHead(element)
      head = @tree.headPointer
      if head.nil?
        assert @parser.innerHTML
        @tree.openElements[-1].appendChild(element)
      else
        head.appendChild(element)
      end
    end
  end
end

View file

@ -0,0 +1,87 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object for the "in row" state. Cells open the :inCell phase,
  # table-structure tags close the row first, and everything else is
  # delegated to the :inTable phase.
  class InRowPhase < Phase
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'
    handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'

    # Character data is handled as in the :inTable phase.
    def processCharacters(data)
      @parser.phases[:inTable].processCharacters(data)
    end

    # <td>/<th>: unwind to the row, insert the cell, switch to :inCell and
    # push a marker onto the active formatting elements.
    def startTagTableCell(name, attributes)
      clearStackToTableRowContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCell]
      @tree.activeFormattingElements.push(Marker)
    end

    # Table-structure start tags imply </tr>; the tag is reprocessed unless
    # that implied end tag was ignored.
    def startTagTableOther(name, attributes)
      row_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the innerHTML case?
      @parser.phase.processStartTag(name, attributes) unless row_ignored
    end

    # Any other start tag is handled as in the :inTable phase.
    def startTagOther(name, attributes)
      @parser.phases[:inTable].processStartTag(name, attributes)
    end

    # </tr>: unwind to the row, pop it and return to :inTableBody, or report
    # a parse error in the innerHTML case.
    def endTagTr(name)
      if ignoreEndTagTr
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      clearStackToTableRowContext
      @tree.openElements.pop
      @parser.phase = @parser.phases[:inTableBody]
    end

    # </table> implies </tr>; the tag is reprocessed if the tr end tag was
    # not ignored.
    def endTagTable(name)
      row_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the innerHTML case?
      @parser.phase.processEndTag(name) unless row_ignored
    end

    # </tbody>, </tfoot>, </thead>: when the group is in table scope, close
    # the row and reprocess the tag; otherwise it is a parse error.
    def endTagTableRowGroup(name)
      unless in_scope?(name, true)
        # innerHTML case
        return @parser.parseError
      end

      endTagTr('tr')
      @parser.phase.processEndTag(name)
    end

    # End tags that make no sense inside a row are reported and dropped.
    def endTagIgnore(name)
      message = _("Unexpected end tag (#{name}) in the row phase. Ignored.")
      @parser.parseError(message)
    end

    # Any other end tag is handled as in the :inTable phase.
    def endTagOther(name)
      @parser.phases[:inTable].processEndTag(name)
    end

    protected

    # XXX unify this with other table helper methods
    # Pop open elements, reporting each one, until the current node is a
    # <tr> or the root <html> element.
    def clearStackToTableRowContext
      while true
        name = @tree.openElements[-1].name
        break if ['tr', 'html'].include?(name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
        @tree.openElements.pop
      end
    end

    # True when a </tr> would be ignored, i.e. no tr is in table scope.
    def ignoreEndTagTr
      !in_scope?('tr', :tableVariant => true)
    end
  end
end

View file

@ -0,0 +1,84 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens while the parser is inside a <select> element.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text via the tree builder.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # Inserts an <option>, first closing a currently open <option>.
    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.insertElement(name, attributes)
    end

    # Inserts an <optgroup>, first closing a currently open <option> and then
    # a currently open <optgroup>.
    def startTagOptgroup(name, attributes)
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
      @tree.insertElement(name, attributes)
    end

    # A nested <select> start tag is a parse error and is treated as </select>.
    def startTagSelect(name, attributes)
      @parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      # Double quotes are required here so that #{name} is interpolated into
      # the message instead of being emitted literally.
      @parser.parseError(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # Closes the current <option>; reports a parse error if the current node
    # is not an <option>.
    def endTagOption(name)
      if @tree.openElements[-1].name == 'option'
        @tree.openElements.pop
      else
        @parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    # Closes the current <optgroup>, including an <option> directly inside it.
    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
        @tree.openElements.pop
      end
      # It also closes </optgroup>
      if @tree.openElements[-1].name == 'optgroup'
        @tree.openElements.pop
      # But nothing else
      else
        @parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # Pops the stack up to and including the <select> and resets the
    # insertion mode. If no <select> is in table scope, reports a parse error.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.resetInsertionMode
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # A table-related end tag is always a parse error. If the named element is
    # in table scope, the <select> is closed and the end tag is reprocessed in
    # the phase the parser ends up in.
    def endTagTableElements(name)
      @parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end

  end
end

View file

@ -0,0 +1,83 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens while the parser is inside a table row group
  # (tbody, thead or tfoot).
  class InTableBodyPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'

    # The suffix must be 'Ignore' so these tags dispatch to endTagIgnore
    # below; a misspelt suffix would name a handler method that does not exist.
    handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'

    # Character data is handled exactly as in the :inTable phase.
    def processCharacters(data)
      @parser.phases[:inTable].processCharacters(data)
    end

    # Opens a row: clears the stack back to the row group, inserts the <tr>
    # and switches to the :inRow phase.
    def startTagTr(name, attributes)
      clearStackToTableBodyContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inRow]
    end

    # A cell directly inside a row group is a parse error that implies a <tr>
    # with no attributes; the cell start tag is then reprocessed in the new
    # phase.
    def startTagTableCell(name, attributes)
      @parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
      startTagTr('tr', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # Closes the current row group (if one is in table scope) and reprocesses
    # the start tag in the resulting phase.
    def startTagTableOther(name, attributes)
      # XXX AT Any ideas on how to share this with endTagTable?
      if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processStartTag(name, attributes)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Any other start tag is handled by the :inTable phase.
    def startTagOther(name, attributes)
      @parser.phases[:inTable].processStartTag(name, attributes)
    end

    # Closes the named row group when it is in table scope and returns to the
    # :inTable phase; otherwise reports a parse error and ignores the tag.
    def endTagTableRowGroup(name)
      if in_scope?(name, true)
        clearStackToTableBodyContext
        @tree.openElements.pop
        @parser.phase = @parser.phases[:inTable]
      else
        @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
      end
    end

    # </table> closes the current row group (if one is in table scope) and is
    # then reprocessed in the resulting phase.
    def endTagTable(name)
      if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processEndTag(name)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Reports the end tag as a parse error and otherwise ignores it.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
    end

    # Any other end tag is handled by the :inTable phase.
    def endTagOther(name)
      @parser.phases[:inTable].processEndTag(name)
    end

    protected

    # Pops open elements until the current node is tbody, tfoot, thead or
    # html, reporting a parse error for each element removed.
    def clearStackToTableBodyContext
      until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
        @tree.openElements.pop
      end
    end

  end
end

View file

@ -0,0 +1,110 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens while the parser is directly inside a <table>.
  class InTablePhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table

    handle_start 'html', 'caption', 'colgroup', 'col', 'table'

    handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'

    handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # Non-space characters in a table are a parse error; they are processed
    # by the :inBody phase with the tree's insertFromTable flag set.
    def processCharacters(data)
      @parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the character in the "in body" mode
      @parser.phases[:inBody].processCharacters(data)
      @tree.insertFromTable = false
    end

    # Opens a <caption> and switches to the :inCaption phase.
    def startTagCaption(name, attributes)
      clearStackToTableContext
      @tree.activeFormattingElements.push(Marker)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCaption]
    end

    # Opens a <colgroup> and switches to the :inColumnGroup phase.
    def startTagColgroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inColumnGroup]
    end

    # A <col> implies a <colgroup> with no attributes; the <col> is then
    # reprocessed in the new phase.
    def startTagCol(name, attributes)
      startTagColgroup('colgroup', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # Opens a row group and switches to the :inTableBody phase.
    def startTagRowGroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inTableBody]
    end

    # A row or cell start tag implies a <tbody> with no attributes; the tag is
    # then reprocessed in the new phase.
    def startTagImplyTbody(name, attributes)
      startTagRowGroup('tbody', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # A nested <table> start tag is a parse error that implies </table>. The
    # start tag is reprocessed afterwards unless parsing innerHTML.
    def startTagTable(name, attributes)
      @parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
      @parser.phase.processEndTag('table')
      @parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
    end

    # Any other start tag is a parse error; it is processed by the :inBody
    # phase with the tree's insertFromTable flag set.
    def startTagOther(name, attributes)
      @parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the start tag in the "in body" mode
      @parser.phases[:inBody].processStartTag(name, attributes)
      @tree.insertFromTable = false
    end

    # Closes the table when one is in table scope: generates implied end tags,
    # reports a parse error if the current node is not the table, pops
    # through the table and resets the insertion mode.
    def endTagTable(name)
      if in_scope?('table', true)
        @tree.generateImpliedEndTags
        unless @tree.openElements[-1].name == 'table'
          @parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
        end
        remove_open_elements_until('table')
        @parser.resetInsertionMode
      else
        # innerHTML case
        assert @parser.innerHTML
        @parser.parseError
      end
    end

    # Reports the end tag as a parse error and otherwise ignores it.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Any other end tag is a parse error; it is processed by the :inBody
    # phase with the tree's insertFromTable flag set.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in.
      # The flag lives on the tree builder (as in processCharacters and
      # startTagOther), not on the parser.
      @tree.insertFromTable = true
      # Process the end tag in the "in body" mode
      @parser.phases[:inBody].processEndTag(name)
      @tree.insertFromTable = false
    end

    protected

    def clearStackToTableContext
      # "clear the stack back to a table context"
      until ['table', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
        @tree.openElements.pop
      end
      # When the current node is <html> it's an innerHTML case
    end

  end
end

View file

@ -0,0 +1,49 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # First phase of parsing, in which a DOCTYPE is expected.
  class InitialPhase < Phase

    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define this.

    # End of input before any DOCTYPE: report it, then let the :rootElement
    # phase handle the EOF.
    def processEOF
      @parser.parseError(_('Unexpected End of file. Expected DOCTYPE.'))
      root_phase = @parser.phases[:rootElement]
      @parser.phase = root_phase
      @parser.phase.processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      document = @tree.document
      @tree.insertComment(data, document)
    end

    # Inserts the DOCTYPE (reporting a parse error first when +error+ is
    # truthy) and moves on to the :rootElement phase.
    def processDoctype(name, error)
      if error
        @parser.parseError(_('Erroneous DOCTYPE.'))
      end
      @tree.insertDoctype(name)
      @parser.phase = @parser.phases[:rootElement]
    end

    # Space characters are inserted as text on the document itself.
    def processSpaceCharacters(data)
      document = @tree.document
      @tree.insertText(data, document)
    end

    # Non-space characters before a DOCTYPE: report, then reprocess them in
    # the :rootElement phase.
    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.'))
      root_phase = @parser.phases[:rootElement]
      @parser.phase = root_phase
      @parser.phase.processCharacters(data)
    end

    # A start tag before a DOCTYPE: report, then reprocess it in the
    # :rootElement phase.
    def processStartTag(name, attributes)
      @parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
      root_phase = @parser.phases[:rootElement]
      @parser.phase = root_phase
      @parser.phase.processStartTag(name, attributes)
    end

    # An end tag before a DOCTYPE: report, then reprocess it in the
    # :rootElement phase.
    def processEndTag(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
      root_phase = @parser.phases[:rootElement]
      @parser.phase = root_phase
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -0,0 +1,156 @@
module HTML5lib
  # Base class for helper objects that implement each phase of processing.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase

    # Builds a hash mapping tag names to handler method names.
    #
    # +prefix+ is prepended to every handler name. Each remaining argument is
    # either a tag name, an array of tag names (the handler name is the prefix
    # followed by the capitalized names joined together), or -- as the last
    # argument only -- a hash mapping a name or array of names to an explicit
    # handler suffix.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    def self.tag_handlers(prefix, *tags)
      mapping = {}
      # A trailing hash carries explicit handler suffixes.
      if tags.last.is_a?(Hash)
        tags.pop.each do |names, handler_method_suffix|
          handler_method = prefix + handler_method_suffix
          Array(names).each { |name| mapping[name] = handler_method }
        end
      end
      # Everything else derives its handler name from the tag names.
      tags.each do |names|
        names = Array(names)
        handler_method = prefix + names.map { |name| name.capitalize }.join
        names.each { |name| mapping[name] = handler_method }
      end
      return mapping
    end

    # Start tag dispatch table of this class; unknown tags map to
    # 'startTagOther'.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # End tag dispatch table of this class; unknown tags map to 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    # Stores the parser and tree builder this phase operates on.
    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Generates implied end tags and reports a parse error when elements are
    # still left open at end of input.
    def processEOF
      @tree.generateImpliedEndTags

      if @tree.openElements.length > 2
        @parser.parseError(_('Unexpected end of file. Missing closing tags.'))
      elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
        # This happens for framesets or something?
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
      elsif @parser.innerHTML and @tree.openElements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        @parser.parseError(_('XXX innerHTML EOF'))
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insertComment(data, @tree.openElements[-1])
    end

    # A DOCTYPE is reported as a parse error and ignored by default.
    def processDoctype(name, error)
      @parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
    end

    # Space characters are inserted as text by default.
    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches a start tag to the handler registered for +name+ on this
    # class (see handle_start).
    def processStartTag(name, attributes)
      send self.class.start_tag_handlers[name], name, attributes
    end

    # Handles an <html> start tag: copies attributes that the root element
    # does not already have onto it, and clears the parser's firstStartTag flag.
    def startTagHtml(name, attributes)
      if @parser.firstStartTag == false and name == 'html'
        @parser.parseError(_('html needs to be the first start tag.'))
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke @parser.parseError.
      attributes.each do |attr, value|
        unless @tree.openElements[0].attributes.has_key?(attr)
          @tree.openElements[0].attributes[attr] = value
        end
      end
      @parser.firstStartTag = false
    end

    # Dispatches an end tag to the handler registered for +name+ on this
    # class (see handle_end).
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Hook for message translation; currently returns the string unchanged.
    def _(string)
      string
    end

    # Raises AssertionError unless +value+ is truthy.
    #
    # AssertionError is not defined in this file; it is presumably an
    # exception class declared elsewhere in the library -- TODO confirm.
    def assert(value)
      # raise (not throw): throw is catch/throw control flow and, with no
      # matching catch, would surface as an unrelated error instead of an
      # AssertionError.
      raise AssertionError unless value
    end

    # Shorthand for @tree.elementInScope.
    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one named +name+ has been popped, or -- when
    # +name+ is nil -- until the given block returns a truthy value for the
    # popped element. Returns the last element popped.
    def remove_open_elements_until(name=nil)
      finished = false
      until finished
        element = @tree.openElements.pop
        finished = name.nil? ? yield(element) : element.name == name
      end
      return element
    end

  end
end

View file

@ -0,0 +1,43 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase that creates the root <html> element. Every token other than a
  # comment or space characters inserts the element and is then reprocessed
  # in the phase the parser has switched to.
  class RootElementPhase < Phase

    def processEOF
      insertHtmlElement
      @parser.phase.processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      document = @tree.document
      @tree.insertComment(data, document)
    end

    # Space characters are inserted as text on the document itself.
    def processSpaceCharacters(data)
      document = @tree.document
      @tree.insertText(data, document)
    end

    def processCharacters(data)
      insertHtmlElement
      @parser.phase.processCharacters(data)
    end

    # Records on the parser that 'html' was the first start tag before
    # inserting the root element and reprocessing the tag.
    def processStartTag(name, attributes)
      if name == 'html'
        @parser.firstStartTag = true
      end
      insertHtmlElement
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      insertHtmlElement
      @parser.phase.processEndTag(name)
    end

    # Creates an attribute-less <html> element, pushes it on the stack of open
    # elements, appends it to the document and switches to :beforeHead.
    def insertHtmlElement
      html = @tree.createElement('html', {})
      @tree.openElements.push(html)
      @tree.document.appendChild(html)
      @parser.phase = @parser.phases[:beforeHead]
    end

  end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase in which only end of file is expected. Any other content (apart
  # from comments and space characters) is a parse error and sends the
  # parser back to its previous phase (@parser.lastPhase).
  class TrailingEndPhase < Phase

    # Nothing to do: end of file is what this phase expects.
    def processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Space characters are handled by the previous phase without switching
    # back to it.
    def processSpaceCharacters(data)
      @parser.lastPhase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.lastPhase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double quotes are required so that #{name} is interpolated.
      @parser.parseError(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double quotes are required so that #{name} is interpolated.
      @parser.parseError(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -3,14 +3,14 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# Provides a unicode stream of characters to the HTMLTokenizer. # Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing # This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking. # incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream class HTMLInputStream
attr_accessor :queue, :charEncoding attr_accessor :queue, :char_encoding
# Initialises the HTMLInputStream. # Initialises the HTMLInputStream.
# #
@ -27,523 +27,524 @@ class HTMLInputStream
# parseMeta - Look for a <meta> element containing encoding information # parseMeta - Look for a <meta> element containing encoding information
def initialize(source, options = {}) def initialize(source, options = {})
@encoding = nil @encoding = nil
@parseMeta = true @parse_meta = true
@chardet = true @chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur # List of where new lines occur
@newLines = [] @new_lines = []
# Raw Stream # Raw Stream
@rawStream = openStream(source) @raw_stream = open_stream(source)
# Encoding Information # Encoding Information
#Number of bytes to use when looking for a meta element with #Number of bytes to use when looking for a meta element with
#encoding information #encoding information
@NUM_BYTES_META = 512 @NUM_BYTES_META = 512
#Encoding to use if no other information can be found #Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252' @DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding) if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
@charEncoding = detectEncoding @char_encoding = detect_encoding
else else
@charEncoding = @encoding @char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
unless @char_encoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
rescue
end end
end
# Read bytes from stream decoding them into Unicode # Normalize newlines and null characters
uString = @rawStream.read uString.gsub!(/\r\n?/, "\n")
unless @charEncoding == 'utf-8' uString.gsub!("\x00", [0xFFFD].pack('U'))
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
rescue
end
end
# Normalize newlines and null characters # Convert the unicode string into a list to be used as the data stream
uString.gsub!(/\r\n?/, "\n") @data_stream = uString
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream @queue = []
@dataStream = uString
@queue = [] # Reset position in the list to read from
reset
# Reset position in the list to read from
reset
end end
# Produces a file object from source. # Produces a file object from source.
# #
# source can be either a file object, local filename or a string. # source can be either a file object, local filename or a string.
def openStream(source) def open_stream(source)
# Already an IO like object # Already an IO like object
if source.respond_to?(:read) if source.respond_to?(:read)
@stream = source @stream = source
else else
# Treat source as a string and wrap in StringIO # Treat source as a string and wrap in StringIO
@stream = StringIO.new(source) @stream = StringIO.new(source)
end end
return @stream return @stream
end end
def detectEncoding def detect_encoding
#First look for a BOM #First look for a BOM
#This will also read past the BOM if present #This will also read past the BOM if present
encoding = detectBOM encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding #If there is no BOM need to look for meta elements with encoding
#information #information
if encoding.nil? and @parseMeta if encoding.nil? and @parse_meta
encoding = detectEncodingMeta encoding = detect_encoding_meta
end end
#Guess with chardet, if available #Guess with chardet, if available
if encoding.nil? and @chardet if encoding.nil? and @chardet
begin begin
require 'rubygems' require 'rubygems'
require 'UniversalDetector' # gem install chardet require 'UniversalDetector' # gem install chardet
buffer = @rawStream.read buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding'] encoding = UniversalDetector::chardet(buffer)['encoding']
@rawStream = openStream(buffer) @raw_stream = open_stream(buffer)
rescue LoadError rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings: #Substitute for equivalent encodings:
encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'} encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
if encodingSub.has_key?(encoding.downcase) if encoding_sub.has_key?(encoding.downcase)
encoding = encodingSub[encoding.downcase] encoding = encoding_sub[encoding.downcase]
end end
return encoding return encoding
end end
# Attempts to detect a BOM at the start of the stream. If # Attempts to detect a BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the # an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil # encoding otherwise return nil
def detectBOM def detect_bom
bomDict = { bom_dict = {
"\xef\xbb\xbf" => 'utf-8', "\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le', "\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be', "\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le', "\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be' "\x00\x00\xfe\xff" => 'utf-32-be'
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@rawStream.seek(0) @raw_stream.seek(0)
string = @rawStream.read(4) string = @raw_stream.read(4)
return nil unless string return nil unless string
# Try detecting the BOM using bytes from the string # Try detecting the BOM using bytes from the string
encoding = bomDict[string[0...3]] # UTF-8 encoding = bom_dict[string[0...3]] # UTF-8
seek = 3 seek = 3
unless encoding
# Need to detect UTF-32 before UTF-16
encoding = bom_dict[string] # UTF-32
seek = 4
unless encoding unless encoding
# Need to detect UTF-32 before UTF-16 encoding = bom_dict[string[0...2]] # UTF-16
encoding = bomDict[string] # UTF-32 seek = 2
seek = 4
unless encoding
encoding = bomDict[string[0...2]] # UTF-16
seek = 2
end
end end
end
#AT - move this to the caller? #AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
@rawStream.seek(encoding ? seek : 0) @raw_stream.seek(encoding ? seek : 0)
return encoding return encoding
end end
# Report the encoding declared by the meta element # Report the encoding declared by the meta element
def detectEncodingMeta def detect_encoding_meta
parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META)) parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
@rawStream.seek(0) @raw_stream.seek(0)
return parser.getEncoding return parser.get_encoding
end end
def determineNewLines def determine_new_lines
# Looks through the stream to find where new lines occur so # Looks through the stream to find where new lines occur so
# the position method can tell where it is. # the position method can tell where it is.
@newLines.push(0) @new_lines.push(0)
(0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n } (0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
end end
# Returns (line, col) of the current position in the stream. # Returns (line, col) of the current position in the stream.
def position def position
# Generate list of new lines first time around # Generate list of new lines first time around
determineNewLines if @newLines.empty? determine_new_lines if @new_lines.empty?
line = 0 line = 0
tell = @tell tell = @tell
@newLines.each do |pos| @new_lines.each do |pos|
break unless pos < tell break unless pos < tell
line += 1 line += 1
end end
col = tell - @newLines[line-1] - 1 col = tell - @new_lines[line-1] - 1
return [line, col] return [line, col]
end end
# Resets the position in the stream back to the start. # Resets the position in the stream back to the start.
def reset def reset
@tell = 0 @tell = 0
end end
# Read one character from the stream or queue if available. Return # Read one character from the stream or queue if available. Return
# EOF when EOF is reached. # EOF when EOF is reached.
def char def char
unless @queue.empty? unless @queue.empty?
return @queue.shift return @queue.shift
else else
begin begin
@tell += 1 @tell += 1
return @dataStream[@tell - 1].chr return @data_stream[@tell - 1].chr
rescue rescue
return :EOF return :EOF
end
end end
end
end end
# Returns a string of characters from the stream up to but not # Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be # including any character in characters or EOF. characters can be
# any container that supports the in method being called on it. # any container that supports the in method being called on it.
def charsUntil(characters, opposite = false) def chars_until(characters, opposite=false)
charStack = [char] char_stack = [char]
unless charStack[0] == :EOF unless char_stack[0] == :EOF
while (characters.include? charStack[-1]) == opposite while (characters.include? char_stack[-1]) == opposite
unless @queue.empty? unless @queue.empty?
# First from the queue # First from the queue
charStack.push(@queue.shift) char_stack.push(@queue.shift)
break if charStack[-1] == :EOF break if char_stack[-1] == :EOF
else else
# Then the rest # Then the rest
begin begin
charStack.push(@dataStream[@tell].chr) char_stack.push(@data_stream[@tell].chr)
@tell += 1 @tell += 1
rescue rescue
charStack.push(:EOF) char_stack.push(:EOF)
break break
end
end
end end
end
end end
end
# Put the character stopped on back to the front of the queue # Put the character stopped on back to the front of the queue
# from where it came. # from where it came.
@queue.insert(0, charStack.pop) @queue.insert(0, char_stack.pop)
return charStack.join('') return char_stack.join('')
end end
end end
# String-like object with an associated position and various extra methods # String-like object with an associated position and various extra methods
# If the position is ever greater than the string length then an exception is raised # If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String class EncodingBytes < String
attr_accessor :position attr_accessor :position
def initialize(value) def initialize(value)
super(value) super(value)
@position = -1 @position = -1
end end
def each def each
while @position < length while @position < length
@position += 1 @position += 1
yield self[@position] yield self[@position]
end end
rescue EOF rescue EOF
end end
def currentByte def current_byte
raise EOF if @position >= length raise EOF if @position >= length
return self[@position].chr return self[@position].chr
end end
# Skip past a list of characters # Skip past a list of characters
def skip(chars = SPACE_CHARACTERS) def skip(chars=SPACE_CHARACTERS)
while chars.include?(currentByte) while chars.include?(current_byte)
@position += 1 @position += 1
end end
end end
# Look for a sequence of bytes at the start of a string. If the bytes # Look for a sequence of bytes at the start of a string. If the bytes
# are found return true and advance the position to the byte after the # are found return true and advance the position to the byte after the
# match. Otherwise return false and leave the position alone # match. Otherwise return false and leave the position alone
def matchBytes(bytes, lower = false) def match_bytes(bytes, lower=false)
data = self[position ... position+bytes.length] data = self[position ... position+bytes.length]
data.downcase! if lower data.downcase! if lower
rv = (data == bytes) rv = (data == bytes)
@position += bytes.length if rv == true @position += bytes.length if rv == true
return rv return rv
end end
# Look for the next sequence of bytes matching a given sequence. If # Look for the next sequence of bytes matching a given sequence. If
# a match is found advance the position to the last byte of the match # a match is found advance the position to the last byte of the match
def jumpTo(bytes) def jump_to(bytes)
newPosition = self[position .. -1].index(bytes) new_position = self[position .. -1].index(bytes)
if newPosition if new_position
@position += (newPosition + bytes.length-1) @position += (new_position + bytes.length-1)
return true return true
else else
raise EOF raise EOF
end end
end end
# Move the pointer so it points to the next byte in a set of possible # Move the pointer so it points to the next byte in a set of possible
# bytes # bytes
def findNext(byteList) def find_next(byte_list)
until byteList.include?(currentByte) until byte_list.include?(current_byte)
@position += 1 @position += 1
end end
end end
end end
# Mini parser for detecting character encoding from meta elements # Mini parser for detecting character encoding from meta elements
class EncodingParser class EncodingParser
# string - the data to work on for encoding detection # string - the data to work on for encoding detection
def initialize(data) def initialize(data)
@data = EncodingBytes.new(data.to_s) @data = EncodingBytes.new(data.to_s)
@encoding = nil @encoding = nil
end end
@@method_dispatch = [ @@method_dispatch = [
['<!--', :handleComment], ['<!--', :handle_comment],
['<meta', :handleMeta], ['<meta', :handle_meta],
['</', :handlePossibleEndTag], ['</', :handle_possible_end_tag],
['<!', :handleOther], ['<!', :handle_other],
['<?', :handleOther], ['<?', :handle_other],
['<', :handlePossibleStartTag] ['<', :handle_possible_start_tag]
] ]
def getEncoding def get_encoding
@data.each do |byte| @data.each do |byte|
keepParsing = true keep_parsing = true
@@method_dispatch.each do |(key, method)| @@method_dispatch.each do |(key, method)|
if @data.matchBytes(key, lower = true) if @data.match_bytes(key, lower = true)
keepParsing = send(method) keep_parsing = send(method)
break break
end end
end
break unless keepParsing
end end
@encoding = @encoding.strip unless @encoding.nil? break unless keep_parsing
return @encoding end
@encoding = @encoding.strip unless @encoding.nil?
return @encoding
end end
# Skip over comments # Skip over comments
def handleComment def handle_comment
return @data.jumpTo('-->') return @data.jump_to('-->')
end end
def handleMeta def handle_meta
# if we have <meta not followed by a space so just keep going # if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.currentByte) return true unless SPACE_CHARACTERS.include?(@data.current_byte)
#We have a valid meta element we want to search for attributes #We have a valid meta element we want to search for attributes
while true while true
#Try to find the next attribute after the current position #Try to find the next attribute after the current position
attr = getAttribute attr = get_attribute
return true if attr.nil? return true if attr.nil?
if attr[0] == 'charset' if attr[0] == 'charset'
tentativeEncoding = attr[1] tentative_encoding = attr[1]
if HTML5lib.isValidEncoding(tentativeEncoding) if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentativeEncoding @encoding = tentative_encoding
return false return false
end end
elsif attr[0] == 'content' elsif attr[0] == 'content'
contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1])) content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentativeEncoding = contentParser.parse tentative_encoding = content_parser.parse
if HTML5lib.isValidEncoding(tentativeEncoding) if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentativeEncoding @encoding = tentative_encoding
return false return false
end end
end
end end
end
end end
def handlePossibleStartTag def handle_possible_start_tag
return handlePossibleTag(false) return handle_possible_tag(false)
end end
def handlePossibleEndTag def handle_possible_end_tag
@data.position+=1 @data.position += 1
return handlePossibleTag(true) return handle_possible_tag(true)
end end
def handlePossibleTag(endTag) def handle_possible_tag(end_tag)
unless ASCII_LETTERS.include?(@data.currentByte) unless ASCII_LETTERS.include?(@data.current_byte)
#If the next byte is not an ascii letter either ignore this #If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to #fragment (possible start tag case) or treat it according to
#handleOther #handleOther
if endTag if end_tag
@data.position -= 1 @data.position -= 1
handleOther handle_other
end
return true
end
@data.findNext(SPACE_CHARACTERS + ['<', '>'])
if @data.currentByte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until getAttribute.nil?
end end
return true return true
end
@data.find_next(SPACE_CHARACTERS + ['<', '>'])
if @data.current_byte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until get_attribute.nil?
end
return true
end end
def handleOther def handle_other
return @data.jumpTo('>') return @data.jump_to('>')
end end
# Return a name,value pair for the next attribute in the stream, # Return a name,value pair for the next attribute in the stream,
# if one is found, or nil # if one is found, or nil
def getAttribute def get_attribute
@data.skip(SPACE_CHARACTERS + ['/']) @data.skip(SPACE_CHARACTERS + ['/'])
if @data.currentByte == '<' if @data.current_byte == '<'
@data.position -= 1 @data.position -= 1
return nil return nil
elsif @data.currentByte == '>' elsif @data.current_byte == '>'
return nil return nil
end end
attrName = [] attr_name = []
attrValue = [] attr_value = []
spaceFound = false space_found = false
#Step 5 attribute name #Step 5 attribute name
while true while true
if @data.currentByte == '=' and attrName: if @data.current_byte == '=' and attr_name:
break break
elsif SPACE_CHARACTERS.include?(@data.currentByte) elsif SPACE_CHARACTERS.include?(@data.current_byte)
spaceFound = true space_found = true
break break
elsif ['/', '<', '>'].include?(@data.currentByte) elsif ['/', '<', '>'].include?(@data.current_byte)
return [attrName.join(''), ''] return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
attrName.push(@data.currentByte.downcase) attr_name.push(@data.current_byte.downcase)
else
attrName.push(@data.currentByte)
end
#Step 6
@data.position += 1
end
#Step 7
if spaceFound
@data.skip
#Step 8
unless @data.currentByte == '='
@data.position -= 1
return [attrName.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
#Step 9
@data.position += 1
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.currentByte)
#11.1
quoteChar = @data.currentByte
while true
@data.position+=1
#11.3
if @data.currentByte == quoteChar
@data.position += 1
return [attrName.join(''), attrValue.join('')]
#11.4
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
#11.5
else
attrValue.push(@data.currentByte)
end
end
elsif ['>', '<'].include?(@data.currentByte)
return [attrName.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
else else
attrValue.push(@data.currentByte) attr_name.push(@data.current_byte)
end end
#Step 6
@data.position += 1
end
#Step 7
if space_found
@data.skip
#Step 8
unless @data.current_byte == '='
@data.position -= 1
return [attr_name.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
#Step 9
@data.position += 1
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.current_byte)
#11.1
quote_char = @data.current_byte
while true while true
@data.position +=1 @data.position+=1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte) #11.3
return [attrName.join(''), attrValue.join('')] if @data.current_byte == quote_char
elsif ASCII_UPPERCASE.include?(@data.currentByte) @data.position += 1
attrValue.push(@data.currentByte.downcase) return [attr_name.join(''), attr_value.join('')]
else #11.4
attrValue.push(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
end attr_value.push(@data.current_byte.downcase)
#11.5
else
attr_value.push(@data.current_byte)
end
end end
elsif ['>', '<'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
while true
@data.position += 1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
return [attr_name.join(''), attr_value.join('')]
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
end
end end
end end
class ContentAttrParser class ContentAttrParser
def initialize(data) def initialize(data)
@data = data @data = data
end end
def parse def parse
begin begin
#Skip to the first ";" #Skip to the first ";"
@data.position = 0 @data.position = 0
@data.jumpTo(';') @data.jump_to(';')
@data.position += 1 @data.position += 1
@data.skip @data.skip
#Check if the attr name is charset #Check if the attr name is charset
#otherwise return #otherwise return
@data.jumpTo('charset') @data.jump_to('charset')
@data.position += 1 @data.position += 1
@data.skip @data.skip
unless @data.currentByte == '=' unless @data.current_byte == '='
#If there is no = sign keep looking for attrs #If there is no = sign keep looking for attrs
return nil return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.currentByte)
quoteMark = @data.currentByte
@data.position += 1
oldPosition = @data.position
@data.jumpTo(quoteMark)
return @data[oldPosition ... @data.position]
else
#Unquoted value
oldPosition = @data.position
begin
@data.findNext(SPACE_CHARACTERS)
return @data[oldPosition ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[oldPosition .. -1]
end
end
rescue EOF
return nil
end end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.current_byte)
quote_mark = @data.current_byte
@data.position += 1
old_position = @data.position
@data.jump_to(quote_mark)
return @data[old_position ... @data.position]
else
#Unquoted value
old_position = @data.position
begin
@data.find_next(SPACE_CHARACTERS)
return @data[old_position ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[old_position .. -1]
end
end
rescue EOF
return nil
end
end end
end end
# Determine if a string is a supported encoding # Determine if a string is a supported encoding
def self.isValidEncoding(encoding) def self.is_valid_encoding(encoding)
(not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip)) (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
end end
end end

View file

@ -16,126 +16,126 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# liberal XML parser # liberal XML parser
class XMLParser < HTMLParser class XMLParser < HTMLParser
def initialize(options={}) def initialize(options = {})
super options super options
@phases[:initial] = XmlRootPhase.new(self, @tree) @phases[:initial] = XmlRootPhase.new(self, @tree)
end end
def normalizeToken(token) def normalizeToken(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag if token[:type] == :StartTag or token[:type] == :EmptyTag
# We need to remove the duplicate attributes and convert attributes # We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten] token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag # For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag if token[:type] == :EmptyTag
@phase.processStartTag(token[:name], token[:data]) @phase.processStartTag(token[:name], token[:data])
token[:data] = {} token[:data] = {}
token[:type] = :EndTag token[:type] = :EndTag
end
elsif token[:type] == :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
elsif token[:type] == :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
token[:data] = token[:data][7 ... -2]
end
end end
return token elsif token[:type] == :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
elsif token[:type] == :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
token[:data] = token[:data][7 ... -2]
end
end
return token
end end
end end
# liberal XMTHML parser # liberal XMTHML parser
class XHTMLParser < XMLParser class XHTMLParser < XMLParser
def initialize(options={}) def initialize(options = {})
super options super options
@phases[:initial] = InitialPhase.new(self, @tree) @phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree) @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end end
def normalizeToken(token) def normalizeToken(token)
super(token) super(token)
# ensure that non-void XHTML elements have content so that separate # ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted # open and close tags are emitted
if token[:type] == :EndTag and \ if token[:type] == :EndTag and \
not VOID_ELEMENTS.include? token[:name] and \ not VOID_ELEMENTS.include? token[:name] and \
token[:name] == @tree.openElements[-1].name and \ token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent not @tree.openElements[-1].hasContent
@tree.insertText('') unless @tree.insertText('') unless
@tree.openElements.any? {|e| @tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
} }
end end
return token return token
end end
end end
class XhmlRootPhase < RootElementPhase class XhmlRootPhase < RootElementPhase
def insertHtmlElement def insertHtmlElement
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'}) element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element) @tree.openElements.push(element)
@tree.document.appendChild(element) @tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead] @parser.phase = @parser.phases[:beforeHead]
end end
end end
class XmlRootPhase < Phase class XmlRootPhase < Phase
# Prime the Xml parser # Prime the Xml parser
@start_tag_handlers = Hash.new(:startTagOther) @start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther) @end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes) def startTagOther(name, attributes)
@tree.openElements.push(@tree.document) @tree.openElements.push(@tree.document)
element = @tree.createElement(name, attributes) element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element) @tree.openElements[-1].appendChild(element)
@tree.openElements.push(element) @tree.openElements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree) @parser.phase = XmlElementPhase.new(@parser,@tree)
end end
def endTagOther(name) def endTagOther(name)
super super
@tree.openElements.pop @tree.openElements.pop
end end
end end
class XmlElementPhase < Phase class XmlElementPhase < Phase
# Generic handling for all XML elements # Generic handling for all XML elements
@start_tag_handlers = Hash.new(:startTagOther) @start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther) @end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes) def startTagOther(name, attributes)
element = @tree.createElement(name, attributes) element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element) @tree.openElements[-1].appendChild(element)
@tree.openElements.push(element) @tree.openElements.push(element)
end end
def endTagOther(name) def endTagOther(name)
for node in @tree.openElements.reverse for node in @tree.openElements.reverse
if node.name == name if node.name == name
{} while @tree.openElements.pop != node {} while @tree.openElements.pop != node
break break
else else
@parser.parseError @parser.parseError
end
end end
end
end end
def processCharacters(data) def processCharacters(data)
@tree.insertText(data) @tree.insertText(data)
end end
end end
end end

View file

@ -6,87 +6,87 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer class HTMLSanitizer < HTMLTokenizer
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt button caption center cite code col colgroup dd del dfn dir div dl dt
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
legend li map menu ol optgroup option p pre q s samp select small span legend li map menu ol optgroup option p pre q s samp select small span
strike strong sub sup table tbody td textarea tfoot th thead tr tt u strike strong sub sup table tbody td textarea tfoot th thead tr tt u
ul var] ul var]
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
msubsup msup mtable mtd mtext mtr munder munderover none] msubsup msup mtable mtd mtext mtr munder munderover none]
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
circle defs desc ellipse font-face font-face-name font-face-src g circle defs desc ellipse font-face font-face-name font-face-src g
glyph hkern image linearGradient line marker metadata missing-glyph glyph hkern image linearGradient line marker metadata missing-glyph
mpath path polygon polyline radialGradient rect set stop svg switch mpath path polygon polyline radialGradient rect set stop svg switch
text title tspan use] text title tspan use]
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
align alt axis border cellpadding cellspacing char charoff charset align alt axis border cellpadding cellspacing char charoff charset
checked cite class clear cols colspan color compact coords datetime checked cite class clear cols colspan color compact coords datetime
dir disabled enctype for frame headers height href hreflang hspace id dir disabled enctype for frame headers height href hreflang hspace id
ismap label lang longdesc maxlength media method multiple name nohref ismap label lang longdesc maxlength media method multiple name nohref
noshade nowrap prompt readonly rel rev rows rowspan rules scope noshade nowrap prompt readonly rel rev rows rowspan rules scope
selected shape size span src start style summary tabindex target title selected shape size span src start style summary tabindex target title
type usemap valign value vspace width xml:lang] type usemap valign value vspace width xml:lang]
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
columnalign columnlines columnspacing columnspan depth display columnalign columnlines columnspacing columnspan depth display
displaystyle equalcolumns equalrows fence fontstyle fontweight frame displaystyle equalcolumns equalrows fence fontstyle fontweight frame
height linethickness lspace mathbackground mathcolor mathvariant height linethickness lspace mathbackground mathcolor mathvariant
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
rowspacing rowspan rspace scriptlevel selection separator stretchy rowspacing rowspan rspace scriptlevel selection separator stretchy
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink] width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
arabic-form ascent attributeName attributeType baseProfile bbox begin arabic-form ascent attributeName attributeType baseProfile bbox begin
by calcMode cap-height class color color-rendering content cx cy d dx by calcMode cap-height class color color-rendering content cx cy d dx
dy descent display dur end fill fill-rule font-family font-size dy descent display dur end fill fill-rule font-family font-size
font-stretch font-style font-variant font-weight from fx fy g1 g2 font-stretch font-style font-variant font-weight from fx fy g1 g2
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
ideographic k keyPoints keySplines keyTimes lang marker-end ideographic k keyPoints keySplines keyTimes lang marker-end
marker-mid marker-start markerHeight markerUnits markerWidth marker-mid marker-start markerHeight markerUnits markerWidth
mathematical max min name offset opacity orient origin mathematical max min name offset opacity orient origin
overline-position overline-thickness panose-1 path pathLength points overline-position overline-thickness panose-1 path pathLength points
preserveAspectRatio r refX refY repeatCount repeatDur preserveAspectRatio r refX refY repeatCount repeatDur
requiredExtensions requiredFeatures restart rotate rx ry slope stemh requiredExtensions requiredFeatures restart rotate rx ry slope stemh
stemv stop-color stop-opacity strikethrough-position stemv stop-color stop-opacity strikethrough-position
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
stroke-width systemLanguage target text-anchor to transform type u1 stroke-width systemLanguage target text-anchor to transform type u1
u2 underline-position underline-thickness unicode unicode-range u2 underline-position underline-thickness unicode unicode-range
units-per-em values version viewBox visibility width widths x units-per-em values version viewBox visibility width widths x
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan] xmlns:xlink y y1 y2 zoomAndPan]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href] ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
border-bottom-color border-collapse border-color border-left-color border-bottom-color border-collapse border-color border-left-color
border-right-color border-top-color clear color cursor direction border-right-color border-top-color clear color cursor direction
display elevation float font font-family font-size font-style display elevation float font font-family font-size font-style
font-variant font-weight height letter-spacing line-height overflow font-variant font-weight height letter-spacing line-height overflow
pause pause-after pause-before pitch pitch-range richness speak pause pause-after pause-before pitch pitch-range richness speak
speak-header speak-numeral speak-punctuation speech-rate stress speak-header speak-numeral speak-punctuation speech-rate stress
text-align text-decoration text-indent unicode-bidi vertical-align text-align text-decoration text-indent unicode-bidi vertical-align
voice-family volume white-space width] voice-family volume white-space width]
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
brown center collapse dashed dotted fuchsia gray green !important brown center collapse dashed dotted fuchsia gray green !important
italic left lime maroon medium none navy normal nowrap olive pointer italic left lime maroon medium none navy normal nowrap olive pointer
purple red right solid silver teal top transparent underline white purple red right solid silver teal top transparent underline white
yellow] yellow]
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
stroke-width stroke-linecap stroke-linejoin stroke-opacity] stroke-width stroke-linecap stroke-linejoin stroke-opacity]
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs] telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
# subclasses may define their own versions of these constants # subclasses may define their own versions of these constants
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
@ -104,75 +104,74 @@ class HTMLSanitizer < HTMLTokenizer
# in ALLOWED_PROTOCOLS are allowed. # in ALLOWED_PROTOCOLS are allowed.
# #
# sanitize_html('<script> do_nasty_stuff() </script>') # sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script> # => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def each def each
super do |token| super do |token|
case token[:type] case token[:type]
when :StartTag, :EndTag, :EmptyTag when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name]) if ALLOWED_ELEMENTS.include?(token[:name])
if token.has_key? :data if token.has_key? :data
attrs = Hash[*token[:data].flatten] attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) } attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr| ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr attrs.delete attr
end
end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
end end
else
yield token
end
end
end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
elsif %w[background border margin padding].include?(prop.split('-')[0])
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
end end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
end end
else
style = clean.join(' ') yield token
end
end end
end end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
elsif %w[background border margin padding].include?(prop.split('-')[0])
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
end
end
style = clean.join(' ')
end
end
end end

File diff suppressed because it is too large Load diff

View file

@ -1,21 +1,21 @@
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
def self.getTreeBuilder(name) def self.getTreeBuilder(name)
case name.to_s.downcase case name.to_s.downcase
when 'simpletree' then when 'simpletree' then
require 'html5lib/treebuilders/simpletree' require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder SimpleTree::TreeBuilder
when 'rexml' then when 'rexml' then
require 'html5lib/treebuilders/rexml' require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder REXMLTree::TreeBuilder
when 'hpricot' then when 'hpricot' then
require 'html5lib/treebuilders/hpricot' require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder Hpricot::TreeBuilder
else else
raise "Unknown TreeBuilder #{name}" raise "Unknown TreeBuilder #{name}"
end
end end
end
end end
end end

View file

@ -4,166 +4,166 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# The scope markers are inserted when entering buttons, object elements, # The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees. # from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil Marker = nil
module TreeBuilders module TreeBuilders
module Base module Base
class Node class Node
# The parent of the current node (or nil for the document node) # The parent of the current node (or nil for the document node)
attr_accessor :parent attr_accessor :parent
# a list of child nodes of the current node. This must # a list of child nodes of the current node. This must
# include all elements but not necessarily other node types # include all elements but not necessarily other node types
attr_accessor :childNodes attr_accessor :childNodes
# A list of miscellaneous flags that can be set on the node # A list of miscellaneous flags that can be set on the node
attr_accessor :_flags attr_accessor :_flags
def initialize(name) def initialize(name)
@parent = nil @parent = nil
@childNodes = [] @childNodes = []
@_flags = [] @_flags = []
end
# Insert node as a child of the current node
def appendChild(node)
raise NotImplementedError
end
# Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore = nil)
raise NotImplementedError
end
# Insert node as a child of the current node, before refNode in the
# list of child nodes. Raises ValueError if refNode is not a child of
# the current node
def insertBefore(node, refNode)
raise NotImplementedError
end
# Remove node from the children of the current node
def removeChild(node)
raise NotImplementedError
end
# Move all the children of the current node to newParent.
# This is needed so that trees that don't store text as nodes move the
# text in the correct way
def reparentChildren(newParent)
#XXX - should this method be made more general?
@childNodes.each { |child| newParent.appendChild(child) }
@childNodes = []
end
# Return a shallow copy of the current node i.e. a node with the same
# name and attributes but with no parent or child nodes
def cloneNode
raise NotImplementedError
end
# Return true if the node has children or text, false otherwise
def hasContent
raise NotImplementedError
end
end
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :formPointer
# Class to use for document root
documentClass = nil
# Class to use for HTML elements
elementClass = nil
# Class to use for comments
commentClass = nil
# Class to use for doctypes
doctypeClass = nil
# Fragment class
fragmentClass = nil
def initialize
reset
end
def reset
@openElements = []
@activeFormattingElements = []
#XXX - rename these to headElement, formElement
@headPointer = nil
@formPointer = nil
self.insertFromTable = false
@document = @documentClass.new
end
def elementInScope(target, tableVariant = false)
# Exit early when possible.
return true if @openElements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to
# [-2] at the end...
@openElements.reverse.each do |element|
if element.name == target
return true
elsif element.name == 'table'
return false
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
return false
elsif element.name == 'html'
return false
end
end end
assert false # We should never reach this point
end
def reconstructActiveFormattingElements # Insert node as a child of the current node
# Within this algorithm the order of steps described in the def appendChild(node)
# specification is not quite the same as the order of steps in the raise NotImplementedError
# code. It should still do the same though. end
# Step 1: stop the algorithm when there's nothing to do. # Insert data as text in the current node, positioned before the
return unless @activeFormattingElements # start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore=nil)
raise NotImplementedError
end
# Step 2 and step 3: we start with the last element. So i is -1. # Insert node as a child of the current node, before refNode in the
i = -1 # list of child nodes. Raises ValueError if refNode is not a child of
entry = @activeFormattingElements[i] # the current node
return if entry == Marker or @openElements.include?(entry) def insertBefore(node, refNode)
raise NotImplementedError
end
# Step 6 # Remove node from the children of the current node
until entry == Marker or @openElements.include?(entry) def removeChild(node)
raise NotImplementedError
end
# Move all the children of the current node to newParent.
# This is needed so that trees that don't store text as nodes move the
# text in the correct way
def reparentChildren(newParent)
#XXX - should this method be made more general?
@childNodes.each { |child| newParent.appendChild(child) }
@childNodes = []
end
# Return a shallow copy of the current node i.e. a node with the same
# name and attributes but with no parent or child nodes
def cloneNode
raise NotImplementedError
end
# Return true if the node has children or text, false otherwise
# Abstract hook: this base version always raises NotImplementedError.
# The concrete Node classes later in this file answer it by checking
# whether childNodes is non-empty.
def hasContent
raise NotImplementedError
end
end
# Base treebuilder implementation
class TreeBuilder
# Builder state exposed to callers; every one of these instance variables
# is (re)assigned by #reset.
attr_accessor :openElements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :formPointer
# NOTE(review): the five assignments below create class-body *local*
# variables, which Ruby discards when the class body ends. They do not
# define @documentClass, @elementClass, etc.; the methods of this class
# read those instance variables, and the concrete TreeBuilder subclasses
# in this file set them in their own initialize. These lines therefore
# act only as documentation of what a subclass must provide.
# Class to use for document root
documentClass = nil
# Class to use for HTML elements
elementClass = nil
# Class to use for comments
commentClass = nil
# Class to use for doctypes
doctypeClass = nil
# Fragment class
fragmentClass = nil
# Puts a freshly created builder into the clean state set up by #reset.
# NOTE(review): #reset calls @documentClass.new, and nothing visible in
# this base class assigns @documentClass, so this presumably only works
# once a subclass has set it -- confirm.
def initialize
reset
end
def reset
  # Both stacks start out as separate empty arrays.
  @openElements, @activeFormattingElements = [], []

  #XXX - rename these to headElement, formElement
  @headPointer = @formPointer = nil

  # Goes through the insertFromTable= writer rather than assigning the
  # instance variable directly.
  self.insertFromTable = false

  # A new document root is created on every reset.
  @document = @documentClass.new
end
# Reports whether an element named +target+ is "in scope" on the stack of
# open elements.
#
# target       - tag name to look for.
# tableVariant - when true, only 'table' and 'html' end the search; when
#                false (the default) any name in SCOPING_ELEMENTS ends it
#                as well.
#
# Returns true if +target+ is met before a boundary element, false if a
# boundary element is met first.
# Raises RuntimeError if the whole stack is scanned without meeting either,
# i.e. when there is no 'html' entry on the stack.
def elementInScope(target, tableVariant=false)
  # Walk from the most recently opened element outwards. The first
  # iteration doubles as the "current node already matches" fast path,
  # and reverse_each avoids building a reversed copy of the stack.
  @openElements.reverse_each do |element|
    name = element.name
    return true if name == target
    return false if name == 'table'
    return false if !tableVariant && SCOPING_ELEMENTS.include?(name)
    return false if name == 'html'
  end

  # We should never reach this point. The original `assert false` is not a
  # method available here, so it surfaced as a confusing NoMethodError;
  # fail with an explicit message instead.
  raise 'elementInScope: no <html> element found on the open elements stack'
end
def reconstructActiveFormattingElements
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry)
# Step 6
until entry == Marker or @openElements.include?(entry)
# Step 5: let entry be one earlier in the list. # Step 5: let entry be one earlier in the list.
i -= 1 i -= 1
begin begin
entry = @activeFormattingElements[i] entry = @activeFormattingElements[i]
rescue rescue
# Step 4: at this point we need to jump to step 8. By not doing # Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that. # i += 1 which is also done in step 7 we achieve that.
break break
end end
end end
while true while true
# Step 7 # Step 7
i += 1 i += 1
@ -178,153 +178,153 @@ class TreeBuilder
# Step 11 # Step 11
break if element == @activeFormattingElements[-1] break if element == @activeFormattingElements[-1]
end
end end
end
def clearActiveFormattingElements def clearActiveFormattingElements
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker {} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
end end
# Check if an element exists between the end of the active # Check if an element exists between the end of the active
# formatting elements and the last marker. If it does, return it, else # formatting elements and the last marker. If it does, return it, else
# return false # return false
def elementInActiveFormattingElements(name) def elementInActiveFormattingElements(name)
@activeFormattingElements.reverse.each do |element| @activeFormattingElements.reverse.each do |element|
# Check for Marker first because if it's a Marker it doesn't have a # Check for Marker first because if it's a Marker it doesn't have a
# name attribute. # name attribute.
break if element == Marker break if element == Marker
return element if element.name == name return element if element.name == name
end
return false
end end
return false
end
def insertDoctype(name) def insertDoctype(name)
@document.appendChild(@doctypeClass.new(name)) @document.appendChild(@doctypeClass.new(name))
end end
def insertComment(data, parent = nil) def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data)) parent.appendChild(@commentClass.new(data))
end end
# Create an element but don't insert it anywhere # Create an element but don't insert it anywhere
def createElement(name, attributes) def createElement(name, attributes)
element = @elementClass.new(name) element = @elementClass.new(name)
element.attributes = attributes element.attributes = attributes
return element return element
end end
# Switch the function used to insert an element from the # Switch the function used to insert an element from the
# normal one to the misnested table one and back again # normal one to the misnested table one and back again
def insertFromTable=(value) def insertFromTable=(value)
@insertFromTable = value @insertFromTable = value
@insertElement = value ? :insertElementTable : :insertElementNormal @insertElement = value ? :insertElementTable : :insertElementNormal
end end
def insertElement(name, attributes) def insertElement(name, attributes)
send(@insertElement, name, attributes) send(@insertElement, name, attributes)
end end
def insertElementNormal(name, attributes) def insertElementNormal(name, attributes)
element = @elementClass.new(name) element = @elementClass.new(name)
element.attributes = attributes element.attributes = attributes
@openElements[-1].appendChild(element) @openElements[-1].appendChild(element)
@openElements.push(element) @openElements.push(element)
return element return element
end end
# Create an element and insert it into the tree # Create an element and insert it into the tree
def insertElementTable(name, attributes) def insertElementTable(name, attributes)
element = @elementClass.new(name) element = @elementClass.new(name)
element.attributes = attributes element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name) if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
#special magic element rearranging #special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition parent, insertBefore = getTableMisnestedNodePosition
if insertBefore.nil? if insertBefore.nil?
parent.appendChild(element) parent.appendChild(element)
else else
parent.insertBefore(element, insertBefore) parent.insertBefore(element, insertBefore)
end end
@openElements.push(element) @openElements.push(element)
else else
return insertElementNormal(name, attributes) return insertElementNormal(name, attributes)
end
return element
end end
return element
end
def insertText(data, parent = nil) def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @openElements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name))) if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
parent.insertText(data) parent.insertText(data)
else else
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
#special magic element rearranging #special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition parent, insertBefore = getTableMisnestedNodePosition
parent.insertText(data, insertBefore) parent.insertText(data, insertBefore)
end
end end
end
# Get the foster parent element, and sibling to insert before # Get the foster parent element, and sibling to insert before
# (or nil) when inserting a misnested table node # (or nil) when inserting a misnested table node
def getTableMisnestedNodePosition def getTableMisnestedNodePosition
#The foster parent element is the one which comes before the most #The foster parent element is the one which comes before the most
#recently opened table element #recently opened table element
#XXX - this is really inelegant #XXX - this is really inelegant
lastTable = nil lastTable = nil
fosterParent = nil fosterParent = nil
insertBefore = nil insertBefore = nil
@openElements.reverse.each do |element| @openElements.reverse.each do |element|
if element.name == "table" if element.name == "table"
lastTable = element lastTable = element
break break
end end
end end
if lastTable if lastTable
#XXX - we should really check that this parent is actually a #XXX - we should really check that this parent is actually a
#node here #node here
if lastTable.parent if lastTable.parent
fosterParent = lastTable.parent fosterParent = lastTable.parent
insertBefore = lastTable insertBefore = lastTable
else else
fosterParent = @openElements[@openElements.index(lastTable) - 1] fosterParent = @openElements[@openElements.index(lastTable) - 1]
end end
else else
fosterParent = @openElements[0] fosterParent = @openElements[0]
end
return fosterParent, insertBefore
end end
return fosterParent, insertBefore
end
def generateImpliedEndTags(exclude = nil) def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name name = @openElements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude) if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@openElements.pop @openElements.pop
# XXX This is not entirely what the specification says. We should # XXX This is not entirely what the specification says. We should
# investigate it more closely. # investigate it more closely.
generateImpliedEndTags(exclude) generateImpliedEndTags(exclude)
end
end end
end
def getDocument def getDocument
@document @document
end end
def getFragment def getFragment
#assert @innerHTML #assert @innerHTML
fragment = @fragmentClass.new fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment) @openElements[0].reparentChildren(fragment)
return fragment return fragment
end end
# Serialize the subtree of node in the format required by unit tests # Serialize the subtree of node in the format required by unit tests
# node - the node from which to start serializing # node - the node from which to start serializing
def testSerializer(node) def testSerializer(node)
raise NotImplementedError raise NotImplementedError
end end
end end
end end
end end
end end

View file

@ -3,209 +3,212 @@ require 'hpricot'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module Hpricot module Hpricot
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
def_delegators :@hpricot, :name def_delegators :@hpricot, :name
attr_accessor :hpricot attr_accessor :hpricot
def initialize(name) def initialize(name)
super(name) super(name)
@hpricot = self.class.hpricot_class.new name @hpricot = self.class.hpricot_class.new name
end end
def appendChild(node) def appendChild(node)
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode) if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
else else
childNodes << node childNodes << node
hpricot.children << node.hpricot hpricot.children << node.hpricot
end
node.parent = self
end end
node.parent = self
end
def removeChild(node) def removeChild(node)
childNodes.delete(node) childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot)) hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.parent = nil node.parent = nil
end end
def insertText(data, before = nil) def insertText(data, before=nil)
if before if before
insertBefore(TextNode.new(data), before) insertBefore(TextNode.new(data), before)
else else
appendChild(TextNode.new(data)) appendChild(TextNode.new(data))
end
end end
end
def insertBefore(node, refNode) def insertBefore(node, refNode)
index = childNodes.index(refNode) index = childNodes.index(refNode)
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode) if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else else
childNodes.insert(index, node) childNodes.insert(index, node)
end
end end
end
def hasContent def hasContent
childNodes.any? childNodes.any?
end end
end end
class Element < Node class Element < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::Elem ::Hpricot::Elem
end end
def initialize(name) def initialize(name)
super(name) super(name)
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name)) @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
end end
def name def name
@hpricot.stag.name @hpricot.stag.name
end end
def cloneNode def cloneNode
attributes.inject(self.class.new(name)) do |node, (name, value)| attributes.inject(self.class.new(name)) do |node, (name, value)|
node.hpricot[name] = value node.hpricot[name] = value
node node
end
end end
end
# A call to Hpricot::Elem#raw_attributes is built dynamically, # A call to Hpricot::Elem#raw_attributes is built dynamically,
# so alterations to the returned value (a hash) will be lost. # so alterations to the returned value (a hash) will be lost.
# #
# AttributeProxy works around this by forwarding :[]= calls # AttributeProxy works around this by forwarding :[]= calls
# to the raw_attributes accessor on the element start tag. # to the raw_attributes accessor on the element start tag.
# #
class AttributeProxy class AttributeProxy
def initialize(hpricot) def initialize(hpricot)
@hpricot = hpricot @hpricot = hpricot
end end
def []=(k, v)
def []=(k, v)
@hpricot.stag.send(stag_attributes_method)[k] = v @hpricot.stag.send(stag_attributes_method)[k] = v
end end
def stag_attributes_method
def stag_attributes_method
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5 # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
end end
def method_missing(*a, &b)
def method_missing(*a, &b)
@hpricot.attributes.send(*a, &b) @hpricot.attributes.send(*a, &b)
end
end end
end
def attributes def attributes
AttributeProxy.new(@hpricot) AttributeProxy.new(@hpricot)
end end
def attributes=(attrs) def attributes=(attrs)
attrs.each { |name, value| @hpricot[name] = value } attrs.each { |name, value| @hpricot[name] = value }
end end
def printTree(indent = 0) def printTree(indent=0)
tree = "\n|#{' ' * indent}<#{name}>" tree = "\n|#{' ' * indent}<#{name}>"
indent += 2 indent += 2
attributes.each do |name, value| attributes.each do |name, value|
next if name == 'xmlns' next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\"" tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) } end
end
end
class Document < Node class Document < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::Doc ::Hpricot::Doc
end end
def initialize def initialize
super(nil) super(nil)
end end
def printTree(indent = 0) def printTree(indent=0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) } childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end end
end end
class DocumentType < Node class DocumentType < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::DocType ::Hpricot::DocType
end end
def initialize(name) def initialize(name)
begin begin
super(name) super(name)
rescue ArgumentError # needs 3... rescue ArgumentError # needs 3...
end
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end end
@hpricot = ::Hpricot::DocType.new(name, nil, nil) def printTree(indent=0)
end "\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
# Fragment container; the Hpricot TreeBuilder registers it as its
# @fragmentClass.
class DocumentFragment < Element
  # Built as an Element whose tag name is the empty string.
  def initialize
    super('')
  end

  # Concatenates the printTree output of every child, each rendered two
  # columns deeper than +indent+. The fragment itself adds no line.
  def printTree(indent=0)
    rendered = ''
    childNodes.each do |child|
      rendered += child.printTree(indent+2)
    end
    rendered
  end
end
# Text leaf backed by a ::Hpricot::Text object.
class TextNode < Node
  # Creates the Hpricot text object directly from +data+; the superclass
  # initializer is not invoked.
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # One line for the test serializer: newline, pipe, +indent+ spaces, then
  # the text content wrapped in double quotes.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}\"#{hpricot.content}\""
  end
end
# Comment leaf backed by a ::Hpricot::Comment object.
class CommentNode < Node
  class << self
    # Hpricot class that Node#initialize instantiates for this node type.
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # One line for the test serializer: newline, pipe, +indent+ spaces, then
  # the comment content between "<!-- " and " -->".
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
# Tree builder that produces Hpricot objects.
class TreeBuilder < Base::TreeBuilder
  # Registers the Hpricot-backed node classes used by the base builder.
  # NOTE(review): super is not called, so Base::TreeBuilder#initialize
  # (which calls reset) does not run here -- presumably the parser calls
  # reset before using the builder; confirm.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serializes +node+ with its own printTree.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object held by the document wrapper.
  def getDocument
    @document.hpricot
  end

  # Stores the fragment built by the base class in @document and returns
  # the children of its Hpricot object.
  def getFragment
    @document = super
    @document.hpricot.children
  end
end
def printTree(indent = 0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end end
end end
class DocumentFragment < Element
def initialize
super('')
end
def printTree(indent = 0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end
end
class TextNode < Node
def initialize(data)
@hpricot = ::Hpricot::Text.new(data)
end
def printTree(indent = 0)
"\n|#{' ' * indent}\"#{hpricot.content}\""
end
end
class CommentNode < Node
def self.hpricot_class
::Hpricot::Comment
end
def printTree(indent = 0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer(node)
node.printTree
end
def getDocument
@document.hpricot
end
def getFragment
@document = super
return @document.hpricot.children
end
end
end
end
end end

View file

@ -3,189 +3,189 @@ require 'rexml/document'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module REXMLTree module REXMLTree
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
def_delegators :@rxobj, :name, :attributes def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj attr_accessor :rxobj
def initialize name def initialize name
super name super name
@rxobj = self.class.rxclass.new name @rxobj = self.class.rxclass.new name
end end
def appendChild node def appendChild node
if node.kind_of? TextNode and if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].rxobj.value = childNodes[-1].rxobj.value =
childNodes[-1].rxobj.to_s + node.rxobj.to_s childNodes[-1].rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.raw = true childNodes[-1].rxobj.raw = true
else else
childNodes.push node childNodes.push node
rxobj.add node.rxobj rxobj.add node.rxobj
end
node.parent = self
end end
node.parent = self
end
def removeChild node def removeChild node
childNodes.delete node childNodes.delete node
rxobj.delete node.rxobj rxobj.delete node.rxobj
node.parent = nil node.parent = nil
end end
def insertText data, before=nil def insertText data, before=nil
if before if before
insertBefore TextNode.new(data), before insertBefore TextNode.new(data), before
else else
appendChild TextNode.new(data) appendChild TextNode.new(data)
end
end end
end
def insertBefore node, refNode def insertBefore node, refNode
index = childNodes.index(refNode) index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode childNodes[index-1].kind_of? TextNode
childNodes[index-1].rxobj.value = childNodes[index-1].rxobj.value =
childNodes[index-1].rxobj.to_s + node.rxobj.to_s childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.raw = true childNodes[index-1].rxobj.raw = true
else else
childNodes.insert index, node childNodes.insert index, node
end
end end
end
def hasContent def hasContent
return (childNodes.length > 0) return (childNodes.length > 0)
end end
end end
class Element < Node class Element < Node
def self.rxclass def self.rxclass
REXML::Element REXML::Element
end end
def initialize name def initialize name
super name super name
end end
def cloneNode def cloneNode
newNode = self.class.new name newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value} attributes.each {|name,value| newNode.attributes[name] = value}
newNode newNode
end end
def attributes= value def attributes= value
value.each {|name,value| rxobj.attributes[name]=value} value.each {|name, value| rxobj.attributes[name]=value}
end end
def printTree indent=0 def printTree indent=0
tree = "\n|#{' ' * indent}<#{name}>" tree = "\n|#{' ' * indent}<#{name}>"
indent += 2 indent += 2
for name, value in attributes for name, value in attributes
next if name == 'xmlns' next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\"" tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end end
for child in childNodes for child in childNodes
tree += child.printTree(indent) tree += child.printTree(indent)
end
return tree
end end
return tree end
end
end
class Document < Node class Document < Node
def self.rxclass def self.rxclass
REXML::Document REXML::Document
end end
def initialize def initialize
super nil super nil
end end
def appendChild node def appendChild node
if node.kind_of? Element and node.name == 'html' if node.kind_of? Element and node.name == 'html'
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
end end
super node super node
end end
def printTree indent=0 def printTree indent=0
tree = "#document" tree = "#document"
for child in childNodes for child in childNodes
tree += child.printTree(indent + 2) tree += child.printTree(indent + 2)
end
return tree
end end
return tree end
end
end
class DocumentType < Node class DocumentType < Node
def self.rxclass def self.rxclass
REXML::DocType REXML::DocType
end end
def printTree indent=0 def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>" "\n|#{' ' * indent}<!DOCTYPE #{name}>"
end end
end end
class DocumentFragment < Element class DocumentFragment < Element
def initialize def initialize
super nil super nil
end end
def printTree indent=0 def printTree indent=0
tree = "" tree = ""
for child in childNodes for child in childNodes
tree += child.printTree(indent+2) tree += child.printTree(indent+2)
end
return tree
end end
return tree end
# Text leaf backed by a REXML::Text object.
class TextNode < Node
  # Escapes &, < and > in +data+, then hands the escaped text to
  # REXML::Text with the original constructor flags (true, nil, true).
  # The superclass initializer is not invoked.
  def initialize(data)
    entities = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }
    escaped = data.gsub(/[&<>]/) { |char| entities[char] }
    @rxobj = REXML::Text.new(escaped, true, nil, true)
  end

  # One line for the test serializer: newline, pipe, +indent+ spaces, then
  # the text value wrapped in double quotes.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}\"#{rxobj.value}\""
  end
end
# Comment leaf backed by a REXML::Comment object.
class CommentNode < Node
  class << self
    # REXML class that Node#initialize instantiates for this node type.
    def rxclass
      REXML::Comment
    end
  end

  # One line for the test serializer: newline, pipe, +indent+ spaces, then
  # the comment string between "<!-- " and " -->".
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{rxobj.string} -->"
  end
end
# Tree builder that produces REXML objects.
class TreeBuilder < Base::TreeBuilder
  # Registers the REXML-backed node classes used by the base builder.
  # NOTE(review): super is not called, so Base::TreeBuilder#initialize
  # (which calls reset) does not run here -- presumably the parser calls
  # reset before using the builder; confirm.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serializes +node+ with its own printTree.
  def testSerializer(node)
    node.printTree
  end

  # Returns the REXML object held by the document wrapper.
  def getDocument
    @document.rxobj
  end

  # Stores the fragment built by the base class in @document and returns
  # the children of its REXML object.
  def getFragment
    @document = super
    @document.rxobj.children
  end
end
end end
end end
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\""
end
end
class CommentNode < Node
def self.rxclass
REXML::Comment
end
def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getDocument
@document.rxobj
end
def getFragment
@document = super
return @document.rxobj.children
end
end
end
end
end end

View file

@ -1,178 +1,178 @@
require 'html5lib/treebuilders/base' require 'html5lib/treebuilders/base'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module SimpleTree module SimpleTree
class Node < Base::Node class Node < Base::Node
# Node representing an item in the tree. # Node representing an item in the tree.
# name - The tag name associated with the node # name - The tag name associated with the node
attr_accessor :name attr_accessor :name
# The value of the current node (applies to text nodes and # The value of the current node (applies to text nodes and
# comments # comments
attr_accessor :value attr_accessor :value
# a dict holding name, value pairs for attributes of the node # a dict holding name, value pairs for attributes of the node
attr_accessor :attributes attr_accessor :attributes
def initialize name def initialize name
super super
@name = name @name = name
@value = nil @value = nil
@attributes = {} @attributes = {}
end end
def appendChild node def appendChild node
if node.kind_of? TextNode and if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].value += node.value childNodes[-1].value += node.value
else else
childNodes.push node childNodes.push node
end
node.parent = self
end end
node.parent = self
end
def removeChild node def removeChild node
childNodes.delete node childNodes.delete node
node.parent = nil node.parent = nil
end end
def cloneNode def cloneNode
newNode = self.class.new name newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value} attributes.each {|name,value| newNode.attributes[name] = value}
newNode.value = value newNode.value = value
newNode newNode
end end
def insertText data, before=nil def insertText data, before=nil
if before if before
insertBefore TextNode.new(data), before insertBefore TextNode.new(data), before
else else
appendChild TextNode.new(data) appendChild TextNode.new(data)
end
end end
end
def insertBefore node, refNode def insertBefore node, refNode
index = childNodes.index(refNode) index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode childNodes[index-1].kind_of? TextNode
childNodes[index-1].value += node.value childNodes[index-1].value += node.value
else else
childNodes.insert index, node childNodes.insert index, node
end
end end
end
def printTree indent=0 def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s] tree = "\n|%s%s" % [' '* indent, self.to_s]
for child in childNodes for child in childNodes
tree += child.printTree(indent + 2) tree += child.printTree(indent + 2)
end
return tree
end end
return tree
end
def hasContent def hasContent
return (childNodes.length > 0) return (childNodes.length > 0)
end end
end end
class Element < Node class Element < Node
def to_s def to_s
"<%s>" % name "<%s>" % name
end end
def printTree indent=0 def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s] tree = "\n|%s%s" % [' '* indent, self.to_s]
indent += 2 indent += 2
for name, value in attributes for name, value in attributes
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value] tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
end end
for child in childNodes for child in childNodes
tree += child.printTree(indent) tree += child.printTree(indent)
end
return tree
end end
return tree end
end
end
class Document < Node class Document < Node
def to_s def to_s
"#document" "#document"
end end
def initialize def initialize
super nil super nil
end end
def printTree indent=0 def printTree indent=0
tree = to_s tree = to_s
for child in childNodes for child in childNodes
tree += child.printTree(indent + 2) tree += child.printTree(indent + 2)
end
return tree
end end
return tree end
end
end
class DocumentType < Node class DocumentType < Node
def to_s def to_s
"<!DOCTYPE %s>" % name "<!DOCTYPE %s>" % name
end end
end end
class DocumentFragment < Element class DocumentFragment < Element
def initialize def initialize
super nil super nil
end end
def printTree indent=0 def printTree indent=0
tree = "" tree = ""
for child in childNodes for child in childNodes
tree += child.printTree(indent+2) tree += child.printTree(indent+2)
end
return tree
end end
return tree end
# Text leaf of the simple tree; the text is kept in +value+.
class TextNode < Node
  # The node is created with a nil name, then +value+ is stored.
  def initialize(value)
    super(nil)
    @value = value
  end

  # The value wrapped in double quotes.
  def to_s
    '"%s"' % value
  end
end
# Comment leaf of the simple tree; the comment text is kept in +value+.
class CommentNode < Node
  # The node is created with a nil name, then +value+ is stored.
  def initialize(value)
    super(nil)
    @value = value
  end

  # The value placed between "<!-- " and " -->".
  def to_s
    "<!-- %s -->" % value
  end
end
# Tree builder that produces the plain-Ruby SimpleTree nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the SimpleTree node classes used by the base builder.
  # NOTE(review): super is not called, so Base::TreeBuilder#initialize
  # (which calls reset) does not run here -- presumably the parser calls
  # reset before using the builder; confirm.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serializes +node+ with its own printTree.
  def testSerializer(node)
    node.printTree
  end

  # Stores the fragment built by the base class in @document and returns
  # its child nodes.
  def getFragment
    @document = super
    @document.childNodes
  end
end
end end
end end
class TextNode < Node
def initialize value
super nil
@value = value
end
def to_s
'"%s"' % value
end
end
class CommentNode < Node
def initialize value
super nil
@value = value
end
def to_s
"<!-- %s -->" % value
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getFragment
@document = super
return @document.childNodes
end
end
end
end
end end

View file

@ -7,5 +7,17 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__) $:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory) def html5lib_test_files(subdirectory)
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')] Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
end
begin
require 'jsonx'
rescue LoadError
# Minimal stand-in, defined only when the preceding `require` raises
# LoadError.
class JSON
  # Converts JSON text to Ruby objects by rewriting it into Ruby literal
  # syntax and evaluating the result.
  #
  # json - String of JSON text. It is not modified, so frozen strings and
  #        strings the caller still needs are safe to pass.
  #
  # Returns whatever the rewritten text evaluates to.
  #
  # SECURITY: the rewritten text is passed to eval, so this must only be
  # given trusted input. Do not use it on data from outside the project.
  def self.parse(json)
    # Turn each `"key":` into `"key"=>` so objects read as Hash literals.
    ruby_source = json.gsub(/"\s*:/, '"=>')
    # Replace each \uXXXX escape with the UTF-8 character it names.
    ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) do |escape|
      [escape[2..-1].to_i(16)].pack('U')
    end
    eval ruby_source
  end
end
end end

View file

@ -11,7 +11,7 @@ begin
def test_chardet def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file| File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.charEncoding.downcase assert_equal 'big5', stream.char_encoding.downcase
end end
end end
rescue LoadError rescue LoadError
@ -28,7 +28,7 @@ end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.charEncoding.downcase, input assert_equal encoding.downcase, stream.char_encoding.downcase, input
end end
end end
end end

View file

@ -6,19 +6,19 @@ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>' SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser) def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
document = parser.parse(input.chomp).root document = parser.parse(input.chomp).root
if not expected if not expected
expected = input.chomp.gsub(XMLELEM,SORTATTRS) expected = input.chomp.gsub(XMLELEM,SORTATTRS)
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')} expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS) output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
assert_equal(expected, output) assert_equal(expected, output)
else else
assert_equal(expected, document.to_s.gsub(/'/,'"')) assert_equal(expected, document.to_s.gsub(/'/,'"'))
end end
end end
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser) def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
assert_xml_equal(input, expected, parser) assert_xml_equal(input, expected, parser)
end end
class BasicXhtml5Test < Test::Unit::TestCase class BasicXhtml5Test < Test::Unit::TestCase
@ -27,8 +27,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
assert_xhtml_equal( assert_xhtml_equal(
'<title>Xhtml</title><b><i>content</b></i>', '<title>Xhtml</title><b><i>content</b></i>',
'<html xmlns="http://www.w3.org/1999/xhtml">' + '<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>Xhtml</title></head>' + '<head><title>Xhtml</title></head>' +
'<body><b><i>content</i></b></body>' + '<body><b><i>content</i></b></body>' +
'</html>') '</html>')
end end
@ -36,8 +36,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
assert_xhtml_equal( assert_xhtml_equal(
'<title>mdash</title>A &mdash B', '<title>mdash</title>A &mdash B',
'<html xmlns="http://www.w3.org/1999/xhtml">' + '<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>mdash</title></head>' + '<head><title>mdash</title></head>' +
'<body>A '+ [0x2014].pack('U') + ' B</body>' + '<body>A '+ [0x2014].pack('U') + ' B</body>' +
'</html>') '</html>')
end end
end end
@ -70,24 +70,24 @@ class OpmlTest < Test::Unit::TestCase
def test_mixedCaseElement def test_mixedCaseElement
assert_xml_equal( assert_xml_equal(
'<opml version="1.0">' + '<opml version="1.0">' +
'<head><ownerName>Dave Winer</ownerName></head>' + '<head><ownerName>Dave Winer</ownerName></head>' +
'</opml>') '</opml>')
end end
def test_mixedCaseAttribute def test_mixedCaseAttribute
assert_xml_equal( assert_xml_equal(
'<opml version="1.0">' + '<opml version="1.0">' +
'<body><outline isComment="true"/></body>' + '<body><outline isComment="true"/></body>' +
'</opml>') '</opml>')
end end
def test_malformed def test_malformed
assert_xml_equal( assert_xml_equal(
'<opml version="1.0">' + '<opml version="1.0">' +
'<body><outline text="Odds & Ends"/></body>' + '<body><outline text="Odds & Ends"/></body>' +
'</opml>', '</opml>',
'<opml version="1.0">' + '<opml version="1.0">' +
'<body><outline text="Odds &amp; Ends"/></body>' + '<body><outline text="Odds &amp; Ends"/></body>' +
'</opml>') '</opml>')
end end
end end
@ -100,45 +100,45 @@ class XhtmlTest < Test::Unit::TestCase
<head><title>MathML</title></head> <head><title>MathML</title></head>
<body> <body>
<math xmlns="http://www.w3.org/1998/Math/MathML"> <math xmlns="http://www.w3.org/1998/Math/MathML">
<mrow>
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow> <mrow>
<mi>x</mi> <mrow>
<mo>=</mo> <mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mfrac> <mrow>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>-</mo>
<mrow> <mrow>
<mrow>
<mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mrow> <mn>4</mn>
<msup> <mo>&#8290;</mo>
<mi>b</mi> <mi>a</mi>
<mn>2</mn> <mo>&#8290;</mo>
</msup> <mi>c</mi>
<mo>-</mo>
<mrow>
<mn>4</mn>
<mo>&#8290;</mo>
<mi>a</mi>
<mo>&#8290;</mo>
<mi>c</mi>
</mrow>
</mrow>
</msqrt>
</mrow> </mrow>
<mrow> </mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</msqrt>
</mrow> </mrow>
<mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
</math> </math>
</body></html> </body></html>
EOX EOX
@ -150,11 +150,11 @@ EOX
<head><title>SVG</title></head> <head><title>SVG</title></head>
<body> <body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27 <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371"> c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
</path> </path>
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10"> <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
</circle> </circle>
</svg> </svg>
</body></html> </body></html>
@ -167,24 +167,24 @@ EOX
<head><title>XLINK</title></head> <head><title>XLINK</title></head>
<body> <body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<defs xmlns:l="http://www.w3.org/1999/xlink"> <defs xmlns:l="http://www.w3.org/1999/xlink">
<radialGradient id="s1" fx=".4" fy=".2" r=".7"> <radialGradient id="s1" fx=".4" fy=".2" r=".7">
<stop stop-color="#FE8"/> <stop stop-color="#FE8"/>
<stop stop-color="#D70" offset="1"/> <stop stop-color="#D70" offset="1"/>
</radialGradient> </radialGradient>
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/> <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/> <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/> <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
</defs> </defs>
<g stroke="#940"> <g stroke="#940">
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/> <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/> <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/> <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/> <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/> <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/> <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
</g> </g>
</svg> </svg>
</body></html> </body></html>
EOX EOX

View file

@ -7,8 +7,8 @@ require 'html5lib/html5parser'
$tree_types_to_test = ['simpletree', 'rexml'] $tree_types_to_test = ['simpletree', 'rexml']
begin begin
require 'hpricot' require 'hpricot'
$tree_types_to_test.push('hpricot') $tree_types_to_test.push('hpricot')
rescue LoadError rescue LoadError
end end
@ -19,90 +19,90 @@ puts 'Testing: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase class Html5ParserTestCase < Test::Unit::TestCase
def self.startswith?(a, b) def self.startswith?(a, b)
b[0... a.length] == a b[0... a.length] == a
end end
def self.parseTestcase(data) def self.parseTestcase(data)
innerHTML = nil innerHTML = nil
input = [] input = []
output = [] output = []
errors = [] errors = []
currentList = input currentList = input
data.split(/\n/).each do |line| data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and !startswith?("#document", line) and
!startswith?("#data", line) and !startswith?("#data", line) and
!startswith?("#document-fragment", line) !startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line) if currentList == output and startswith?("|", line)
currentList.push(line[2..-1]) currentList.push(line[2..-1])
else else
currentList.push(line) currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end end
return innerHTML, input.join("\n"), output.join("\n"), errors elsif line == "#errors"
end currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
# convert the output of str(document) to the format used in the testcases if startswith?("#document-fragment", line)
def convertTreeDump(treedump) innerHTML = line[19..-1]
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n") raise AssertionError unless innerHTML
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input,
'Expected:', expected_output,
'Recieved:', actual_output
].join("\n")
if $CHECK_PARSER_ERRORS
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal parser.errors.length, expected_errors.length, [
'Expected errors:', expected_errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")
end
end
end
end end
currentList = output
end
end end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input,
'Expected:', expected_output,
'Recieved:', actual_output
].join("\n")
if $CHECK_PARSER_ERRORS
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal parser.errors.length, expected_errors.length, [
'Expected errors:', expected_errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")
end
end
end
end
end
end end

View file

@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />)) sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end end
def test_should_handle_astral_plane_characters
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
end
end end

View file

@ -4,75 +4,63 @@ require 'html5lib/tokenizer'
require 'tokenizer_test_parser' require 'tokenizer_test_parser'
begin
require 'jsonx'
rescue LoadError
class JSON
def self.parse json
json.gsub! /"\s*:/, '"=>'
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
eval json
end
end
end
class Html5TokenizerTestCase < Test::Unit::TestCase class Html5TokenizerTestCase < Test::Unit::TestCase
def type_of?(token_name, token) def type_of?(token_name, token)
token != 'ParseError' and token_name == token.first token != 'ParseError' and token_name == token.first
end
def convert_attribute_arrays_to_hashes(tokens)
tokens.inject([]) do |tokens, token|
token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
tokens << token
end end
end
def convert_attribute_arrays_to_hashes(tokens) def concatenate_consecutive_characters(tokens)
tokens.inject([]) do |tokens, token| tokens.inject([]) do |tokens, token|
token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token) if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
tokens << token tokens.last[1] = tokens.last[1] + token[1]
end next tokens
end
tokens << token
end end
end
def concatenate_consecutive_characters(tokens) def tokenizer_test(data)
tokens.inject([]) do |tokens, token| (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last) message = [
tokens.last[1] = tokens.last[1] + token[1] 'Description:', data['description'],
next tokens 'Input:', data['input'],
end 'Content Model Flag:', content_model_flag ] * "\n"
tokens << token
end assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym
tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
tokens = TokenizerTestParser.new(tokenizer).parse
actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
expected = concatenate_consecutive_characters(data['output'])
assert_equal expected, actual, message
end
end end
end
def tokenizer_test(data) html5lib_test_files('tokenizer').each do |test_file|
(data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag| test_name = File.basename(test_file).sub('.test', '')
message = [
'Description:', data['description'],
'Input:', data['input'],
'Content Model Flag:', content_model_flag ] * "\n"
assert_nothing_raised message do tests = JSON.parse(File.read(test_file))['tests']
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym tests.each_with_index do |data, index|
define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
tokens = TokenizerTestParser.new(tokenizer).parse
actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
expected = concatenate_consecutive_characters(data['output'])
assert_equal expected, actual, message
end
end
end
html5lib_test_files('tokenizer').each do |test_file|
test_name = File.basename(test_file).sub('.test', '')
tests = JSON.parse(File.read(test_file))['tests']
tests.each_with_index do |data, index|
define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
end
end end
end
end end

View file

@ -1,62 +1,62 @@
require 'html5lib/constants' require 'html5lib/constants'
class TokenizerTestParser class TokenizerTestParser
def initialize(tokenizer) def initialize(tokenizer)
@tokenizer = tokenizer @tokenizer = tokenizer
end
def parse
@outputTokens = []
debug = nil
for token in @tokenizer
debug = token.inspect if token[:type] == :ParseError
send ('process' + token[:type].to_s), token
end end
def parse return @outputTokens
@outputTokens = [] end
debug = nil def processDoctype(token)
for token in @tokenizer @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
debug = token.inspect if token[:type] == :ParseError end
send ('process' + token[:type].to_s), token
end
return @outputTokens def processStartTag(token)
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processEmptyTag(token)
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
@outputTokens.push("ParseError")
end end
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processDoctype(token) def processEndTag(token)
@outputTokens.push(["DOCTYPE", token[:name], token[:data]]) if token[:data].length > 0
self.processParseError(token)
end end
@outputTokens.push(["EndTag", token[:name]])
end
def processStartTag(token) def processComment(token)
@outputTokens.push(["StartTag", token[:name], token[:data]]) @outputTokens.push(["Comment", token[:data]])
end end
def processEmptyTag(token) def processCharacters(token)
if not HTML5lib::VOID_ELEMENTS.include? token[:name] @outputTokens.push(["Character", token[:data]])
@outputTokens.push("ParseError") end
end
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processEndTag(token) alias processSpaceCharacters processCharacters
if token[:data].length > 0
self.processParseError(token)
end
@outputTokens.push(["EndTag", token[:name]])
end
def processComment(token) def processCharacters(token)
@outputTokens.push(["Comment", token[:data]]) @outputTokens.push(["Character", token[:data]])
end end
def processCharacters(token) def processEOF(token)
@outputTokens.push(["Character", token[:data]]) end
end
alias processSpaceCharacters processCharacters def processParseError(token)
@outputTokens.push("ParseError")
def processCharacters(token) end
@outputTokens.push(["Character", token[:data]])
end
def processEOF(token)
end
def processParseError(token)
@outputTokens.push("ParseError")
end
end end