Merged with latest trunk.

This commit is contained in:
Jason Blevins 2007-06-04 22:47:59 -04:00
commit aadfb55342
38 changed files with 4839 additions and 4849 deletions

View file

@ -8,7 +8,19 @@ OPTIONS = {
:ip => "0.0.0.0",
:environment => "production",
:server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"),
:server_type => WEBrick::SimpleServer
:server_type => WEBrick::SimpleServer,
:mime_types => WEBrick::HTTPUtils::DefaultMimeTypes.merge({
'avi' => 'video/x-msvideo',
'gz' => 'application/x-gzip',
'js' => 'application/x-javascript',
'nb' => 'application/mathematica',
'pdf' => 'application/pdf',
'svg' => 'application/svg+xml',
'tar' => 'application/x-tar',
'tex' => 'application/x-tex',
'xml' => 'application/xml',
'xslt' => 'application/xslt+xml'
})
}
ARGV.options do |opts|

View file

@ -2,6 +2,6 @@ require 'rake'
require 'rake/testtask'
Rake::TestTask.new do |task|
task.pattern = 'tests/test_*.rb'
task.verbose = true
task.pattern = 'tests/test_*.rb'
task.verbose = true
end

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class AfterBodyPhase < Phase
handle_end 'html'
def processComment(data)
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
@tree.insertComment(data, @tree.openElements[0])
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after body phase.'))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processStartTag(name, attributes)
end
def endTagHtml(name)
if @parser.innerHTML
@parser.parseError
else
# XXX: This may need to be done, not sure
# Don't set lastPhase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something after </html>.
# Try "<!doctype html>X</html>X" for instance.
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processEndTag(name)
end
end
end

View file

@ -0,0 +1,34 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3
handle_start 'html', 'noframes'
handle_end 'html'
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
end
def startTagNoframes(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
end
def endTagHtml(name)
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
end
end
end

View file

@ -0,0 +1,50 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
def processEOF
anythingElse
@parser.phase.processEOF
end
def processCharacters(data)
anythingElse
@parser.phase.processCharacters(data)
end
def startTagBody(name, attributes)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inBody]
end
def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inFrameset]
end
def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved."))
@parser.phase = @parser.phases[:inHead]
@parser.phase.processStartTag(name, attributes)
end
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
anythingElse
@parser.phase.processEndTag(name)
end
def anythingElse
@tree.insertElement('body', {})
@parser.phase = @parser.phases[:inBody]
end
end
end

View file

@ -0,0 +1,41 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class BeforeHeadPhase < Phase
handle_start 'html', 'head'
handle_end 'html'
def processEOF
startTagHead('head', {})
@parser.phase.processEOF
end
def processCharacters(data)
startTagHead('head', {})
@parser.phase.processCharacters(data)
end
def startTagHead(name, attributes)
@tree.insertElement(name, attributes)
@tree.headPointer = @tree.openElements[-1]
@parser.phase = @parser.phases[:inHead]
end
def startTagOther(name, attributes)
startTagHead('head', {})
@parser.phase.processStartTag(name, attributes)
end
def endTagHtml(name)
startTagHead('head', {})
@parser.phase.processEndTag(name)
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element."))
end
end
end

View file

@ -0,0 +1,548 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
def initialize(parser, tree)
super(parser, tree)
# for special handling of whitespace in <pre>
@processSpaceCharactersPre = false
end
def processSpaceCharactersPre(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersPre = false
if (data.length > 0 and data[0] == ?\n and
@tree.openElements[-1].name == 'pre' and
not @tree.openElements[-1].hasContent)
data = data[1..-1]
end
@tree.insertText(data) if data.length > 0
end
def processSpaceCharacters(data)
if @processSpaceCharactersPre
processSpaceCharactersPre(data)
else
super(data)
end
end
def processCharacters(data)
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
# do it for space characters.
@tree.reconstructActiveFormattingElements
@tree.insertText(data)
end
def startTagScriptStyle(name, attributes)
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagBody(name, attributes)
@parser.parseError(_('Unexpected start tag (body).'))
if (@tree.openElements.length == 1 or
@tree.openElements[1].name != 'body')
assert @parser.innerHTML
else
attributes.each do |attr, value|
unless @tree.openElements[1].attributes.has_key?(attr)
@tree.openElements[1].attributes[attr] = value
end
end
end
end
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@processSpaceCharactersPre = true if name == 'pre'
end
def startTagForm(name, attributes)
if @tree.formPointer
@parser.parseError('Unexpected start tag (form). Ignored.')
else
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.formPointer = @tree.openElements[-1]
end
end
def startTagListItem(name, attributes)
endTagP('p') if in_scope?('p')
stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
stopName = stopNames[name]
@tree.openElements.reverse.each_with_index do |node, i|
if stopName.include?(node.name)
(i + 1).times { @tree.openElements.pop }
break
end
# Phrasing elements are all non special, non scoping, non
# formatting elements
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
not ['address', 'div'].include?(node.name))
end
# Always insert an <li> element.
@tree.insertElement(name, attributes)
end
def startTagPlaintext(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :PLAINTEXT
end
def startTagHeading(name, attributes)
endTagP('p') if in_scope?('p')
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@parser.parseError(_("Unexpected start tag (#{name})."))
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
break
end
end
@tree.insertElement(name, attributes)
end
def startTagA(name, attributes)
if afeAElement = @tree.elementInActiveFormattingElements('a')
@parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
endTagFormatting('a')
@tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement)
@tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
end
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagButton(name, attributes)
if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
processEndTag('button')
@parser.phase.processStartTag(name, attributes)
else
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
end
def startTagMarqueeObject(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
def startTagXmp(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagTable(name, attributes)
processEndTag('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inTable]
end
def startTagVoidFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagHr(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagImage(name, attributes)
# No really...
@parser.parseError(_('Unexpected start tag (image). Treated as img.'))
processStartTag('img', attributes)
end
def startTagInput(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
if @tree.formPointer
# XXX Not exactly sure what to do here
# @tree.openElements[-1].form = @tree.formPointer
end
@tree.openElements.pop
end
def startTagIsindex(name, attributes)
@parser.parseError("Unexpected start tag isindex. Don't use it!")
return if @tree.formPointer
processStartTag('form', {})
processStartTag('hr', {})
processStartTag('p', {})
processStartTag('label', {})
# XXX Localization ...
processCharacters('This is a searchable index. Insert your search keywords here:')
attributes['name'] = 'isindex'
attrs = attributes.to_a
processStartTag('input', attributes)
processEndTag('label')
processEndTag('p')
processStartTag('hr', {})
processEndTag('form')
end
def startTagTextarea(name, attributes)
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
end
# iframe, noembed noframes, noscript(if scripting enabled)
def startTagCdata(name, attributes)
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagSelect(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inSelect]
end
def startTagMisplaced(name, attributes)
# Elements that should be children of other elements that have a
# different insertion mode; here they are ignored
# "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript"
@parser.parseError(_("Unexpected start tag (#{name}). Ignored."))
end
def startTagNew(name, attributes)
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"
sys.stderr.write("Warning: Undefined behaviour for start tag #{name}")
startTagOther(name, attributes)
#raise NotImplementedError
end
def startTagOther(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
end
def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
@tree.openElements.pop while in_scope?('p')
end
def endTagBody(name)
# XXX Need to take open <p> tags into account here. We shouldn't imply
# </p> but we should not throw a parse error either. Specification is
# likely to be updated.
unless @tree.openElements[1].name == 'body'
# innerHTML case
@parser.parseError
return
end
unless @tree.openElements[-1].name == 'body'
@parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
end
@parser.phase = @parser.phases[:afterBody]
end
def endTagHtml(name)
endTagBody(name)
@parser.phase.processEndTag(name) unless @parser.innerHTML
end
def endTagBlock(name)
#Put us back in the right whitespace handling mode
@processSpaceCharactersPre = false if name == 'pre'
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
end
if in_scope?(name)
remove_open_elements_until(name)
end
end
def endTagForm(name)
endTagBlock(name)
@tree.formPointer = nil
end
def endTagListItem(name)
# AT Could merge this with the Block case
if in_scope?(name)
@tree.generateImpliedEndTags(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
end
end
remove_open_elements_until(name) if in_scope?(name)
end
def endTagHeading(name)
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@tree.generateImpliedEndTags
break
end
end
unless @tree.openElements[-1].name == name
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
end
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
break
end
end
end
# The much-feared adoption agency algorithm
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while true
# Step 1 paragraph 1
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
afeIndex = @tree.openElements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
end
end
# Step 3
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
commonAncestor = @tree.openElements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = @tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
# Step 7.4
if lastNode == furthestBlock
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = @tree.activeFormattingElements.index(node) + 1
end
# Step 7.5
cite = node.parent
if node.hasContent
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
node = clone
end
# Step 7.6
# Remove lastNode from its parents, if any
lastNode.parent.removeChild(lastNode) if lastNode.parent
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
end
# Step 8
lastNode.parent.removeChild(lastNode) if lastNode.parent
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
@tree.activeFormattingElements.delete(afeElement)
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
end
end
def endTagButtonMarqueeObject(name)
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
end
if in_scope?(name)
remove_open_elements_until(name)
@tree.clearActiveFormattingElements
end
end
def endTagMisplaced(name)
# This handles elements with end tags in other insertion modes.
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagNone(name)
# This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag"))
end
def endTagCdataTextAreaXmp(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagNew(name)
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"
STDERR.puts "Warning: Undefined behaviour for end tag #{name}"
endTagOther(name)
#raise NotImplementedError
end
def endTagOther(name)
# XXX This logic should be moved into the treebuilder
@tree.openElements.reverse.each do |node|
if node.name == name
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name})."))
end
remove_open_elements_until { |element| element == node }
break
else
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
break
end
end
end
end
protected
def addFormattingElement(name, attributes)
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(@tree.openElements[-1])
end
end
end

View file

@ -0,0 +1,68 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableElement'
handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def ignoreEndTagCaption
not in_scope?('caption', true)
end
def processCharacters(data)
@parser.phases[:inBody].processCharacters(data)
end
def startTagTableElement(name, attributes)
@parser.parseError
#XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def startTagOther(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def endTagCaption(name)
if ignoreEndTagCaption
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
# AT this code is quite similar to endTagTable in "InTable"
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'caption'
@parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
end
remove_open_elements_until('caption')
@tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inTable]
end
end
def endTagTable(name)
@parser.parseError
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@parser.phase.processEndTag(name) unless ignoreEndTag
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)
@parser.phases[:inBody].processEndTag(name)
end
end
end

View file

@ -0,0 +1,78 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'
handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'
handle_end %w( table tbody tfoot thead tr ) => 'Imply'
def processCharacters(data)
@parser.phases[:inBody].processCharacters(data)
end
def startTagTableOther(name, attributes)
if in_scope?('td', true) or in_scope?('th', true)
closeCell
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
end
end
def startTagOther(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def endTagTableCell(name)
if in_scope?(name, true)
@tree.generateImpliedEndTags(name)
if @tree.openElements[-1].name != name
@parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.")
remove_open_elements_until(name)
else
@tree.openElements.pop
end
@tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inRow]
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagImply(name)
if in_scope?(name, true)
closeCell
@parser.phase.processEndTag(name)
else
# sometimes innerHTML case
@parser.parseError
end
end
def endTagOther(name)
@parser.phases[:inBody].processEndTag(name)
end
protected
def closeCell
if in_scope?('td', true)
endTagTableCell('td')
elsif in_scope?('th', true)
endTagTableCell('th')
end
end
end
end

View file

@ -0,0 +1,55 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
handle_start 'html', 'col'
handle_end 'colgroup', 'col'
def ignoreEndTagColgroup
@tree.openElements[-1].name == 'html'
end
def processCharacters(data)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup("colgroup")
@parser.phase.processCharacters(data) unless ignoreEndTag
end
def startTagCol(name, attributes)
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagOther(name, attributes)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup('colgroup')
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def endTagColgroup(name)
if ignoreEndTagColgroup
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
@tree.openElements.pop
@parser.phase = @parser.phases[:inTable]
end
end
def endTagCol(name)
@parser.parseError(_('Unexpected end tag (col). col has no end tag.'))
end
def endTagOther(name)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup('colgroup')
@parser.phase.processEndTag(name) unless ignoreEndTag
end
end
end

View file

@ -0,0 +1,57 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
handle_start 'html', 'frameset', 'frame', 'noframes'
handle_end 'frameset', 'noframes'
def processCharacters(data)
@parser.parseError(_('Unexpected characters in the frameset phase. Characters ignored.'))
end
def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes)
end
def startTagFrame(name, attributes)
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagNoframes(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the frameset phase. Ignored"))
end
def endTagFrameset(name)
if @tree.openElements[-1].name == 'html'
# innerHTML case
@parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
else
@tree.openElements.pop
end
if (not @parser.innerHTML and
@tree.openElements[-1].name != 'frameset')
# If we're not in innerHTML mode and the the current node is not a
# "frameset" element (anymore) then switch.
@parser.phase = @parser.phases[:afterFrameset]
end
end
def endTagNoframes(name)
@parser.phases[:inBody].processEndTag(name)
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the frameset phase. Ignored."))
end
end
end

View file

@ -0,0 +1,120 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InHeadPhase < Phase
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head', 'html', %w( title style script )
def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
@tree.openElements.pop
end
anythingElse
@parser.phase.processEOF
end
def processCharacters(data)
if ['title', 'style', 'script'].include?(@tree.openElements[-1].name)
@tree.insertText(data)
else
anythingElse
@parser.phase.processCharacters(data)
end
end
def startTagHead(name, attributes)
@parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
end
def startTagTitle(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :RCDATA
end
def startTagStyle(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagScript(name, attributes)
#XXX Inner HTML case may be wrong
element = @tree.createElement(name, attributes)
element._flags.push("parser-inserted")
if (@tree.headPointer != nil and
@parser.phase == @parser.phases[:inHead])
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
end
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
def endTagHead(name)
if @tree.openElements[-1].name == 'head'
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (head). Ignored."))
end
@parser.phase = @parser.phases[:afterHead]
end
def endTagHtml(name)
anythingElse
@parser.phase.processEndTag(name)
end
def endTagTitleStyleScript(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def anythingElse
if @tree.openElements[-1].name == 'head'
endTagHead('head')
else
@parser.phase = @parser.phases[:afterHead]
end
end
protected
def appendToHead(element)
if @tree.headPointer.nil?
assert @parser.innerHTML
@tree.openElements[-1].appendChild(element)
else
@tree.headPointer.appendChild(element)
end
end
end
end

View file

@ -0,0 +1,87 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'
handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'
def processCharacters(data)
@parser.phases[:inTable].processCharacters(data)
end
def startTagTableCell(name, attributes)
clearStackToTableRowContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inCell]
@tree.activeFormattingElements.push(Marker)
end
def startTagTableOther(name, attributes)
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# XXX how are we sure it's always ignored in the innerHTML case?
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def startTagOther(name, attributes)
@parser.phases[:inTable].processStartTag(name, attributes)
end
def endTagTr(name)
if ignoreEndTagTr
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
clearStackToTableRowContext
@tree.openElements.pop
@parser.phase = @parser.phases[:inTableBody]
end
end
def endTagTable(name)
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
@parser.phase.processEndTag(name) unless ignoreEndTag
end
def endTagTableRowGroup(name)
if in_scope?(name, true)
endTagTr('tr')
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
end
def endTagOther(name)
@parser.phases[:inTable].processEndTag(name)
end
protected
# XXX unify this with other table helper methods
def clearStackToTableRowContext
until ['tr', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
@tree.openElements.pop
end
end
def ignoreEndTagTr
not in_scope?('tr', :tableVariant => true)
end
end
end

View file

@ -0,0 +1,84 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InSelectPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
handle_start 'html', 'option', 'optgroup', 'select'
handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'
def processCharacters(data)
@tree.insertText(data)
end
def startTagOption(name, attributes)
# We need to imply </option> if <option> is the current node.
@tree.openElements.pop if @tree.openElements[-1].name == 'option'
@tree.insertElement(name, attributes)
end
def startTagOptgroup(name, attributes)
@tree.openElements.pop if @tree.openElements[-1].name == 'option'
@tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
@tree.insertElement(name, attributes)
end
def startTagSelect(name, attributes)
@parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
endTagSelect('select')
end
def startTagOther(name, attributes)
@parser.parseError(_('Unexpected start tag token (#{name}) in the select phase. Ignored.'))
end
def endTagOption(name)
if @tree.openElements[-1].name == 'option'
@tree.openElements.pop
else
@parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
end
end
def endTagOptgroup(name)
# </optgroup> implicitly closes <option>
if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
@tree.openElements.pop
end
# It also closes </optgroup>
if @tree.openElements[-1].name == 'optgroup'
@tree.openElements.pop
# But nothing else
else
@parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
end
end
def endTagSelect(name)
if in_scope?('select', true)
remove_open_elements_until('select')
@parser.resetInsertionMode
else
# innerHTML case
@parser.parseError
end
end
def endTagTableElements(name)
@parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
if in_scope?(name, true)
endTagSelect('select')
@parser.phase.processEndTag(name)
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
end
end
end

View file

@ -0,0 +1,83 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'
handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ingore'
def processCharacters(data)
@parser.phases[:inTable].processCharacters(data)
end
def startTagTr(name, attributes)
clearStackToTableBodyContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inRow]
end
def startTagTableCell(name, attributes)
@parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
startTagTr('tr', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagTableOther(name, attributes)
# XXX AT Any ideas on how to share this with endTagTable?
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
end
end
def startTagOther(name, attributes)
@parser.phases[:inTable].processStartTag(name, attributes)
end
def endTagTableRowGroup(name)
if in_scope?(name, true)
clearStackToTableBodyContext
@tree.openElements.pop
@parser.phase = @parser.phases[:inTable]
else
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
end
def endTagTable(name)
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
def endTagOther(name)
@parser.phases[:inTable].processEndTag(name)
end
protected
def clearStackToTableBodyContext
until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
@tree.openElements.pop
end
end
end
end

View file

@ -0,0 +1,110 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
handle_start 'html', 'caption', 'colgroup', 'col', 'table'
handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'
handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def processCharacters(data)
@parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
# Process the character in the "in body" mode
@parser.phases[:inBody].processCharacters(data)
@tree.insertFromTable = false
end
def startTagCaption(name, attributes)
clearStackToTableContext
@tree.activeFormattingElements.push(Marker)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inCaption]
end
def startTagColgroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inColumnGroup]
end
def startTagCol(name, attributes)
startTagColgroup('colgroup', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagRowGroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inTableBody]
end
def startTagImplyTbody(name, attributes)
startTagRowGroup('tbody', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagTable(name, attributes)
@parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
@parser.phase.processEndTag('table')
@parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
# Process the start tag in the "in body" mode
@parser.phases[:inBody].processStartTag(name, attributes)
@tree.insertFromTable = false
end
def endTagTable(name)
if in_scope?('table', true)
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'table'
@parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
end
remove_open_elements_until('table')
@parser.resetInsertionMode
else
# innerHTML case
assert @parser.innerHTML
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@parser.insertFromTable = true
# Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name)
@parser.insertFromTable = false
end
protected
def clearStackToTableContext
# "clear the stack back to a table context"
until ['table', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
@tree.openElements.pop
end
# When the current node is <html> it's an innerHTML case
end
end
end

View file

@ -0,0 +1,49 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InitialPhase < Phase
# This phase deals with error handling as well which is currently not
# covered in the specification. The error handling is typically known as
# "quirks mode". It is expected that a future version of HTML5 will define this.
def processEOF
@parser.parseError(_('Unexpected End of file. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processDoctype(name, error)
@parser.parseError(_('Erroneous DOCTYPE.')) if error
@tree.insertDoctype(name)
@parser.phase = @parser.phases[:rootElement]
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
@parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEndTag(name)
end
end
end

View file

@ -0,0 +1,156 @@
module HTML5lib
# Base class for helper objects that implement each phase of processing.
#
# Handler methods should be in the following order (they can be omitted):
#
# * EOF
# * Comment
# * Doctype
# * SpaceCharacters
# * Characters
# * StartTag
# - startTag* methods
# * EndTag
# - endTag* methods
#
class Phase
# The following example call:
#
# tag_handlers('startTag', 'html', %( base link meta ), %( li dt dd ) => 'ListItem')
#
# ...would return a hash equal to this:
#
# { 'html' => 'startTagHtml',
# 'base' => 'startTagBaseLinkMeta',
# 'link' => 'startTagBaseLinkMeta',
# 'meta' => 'startTagBaseLinkMeta',
# 'li' => 'startTagListItem',
# 'dt' => 'startTagListItem',
# 'dd' => 'startTagListItem' }
#
def self.tag_handlers(prefix, *tags)
mapping = {}
if tags.last.is_a?(Hash)
tags.pop.each do |names, handler_method_suffix|
handler_method = prefix + handler_method_suffix
Array(names).each { |name| mapping[name] = handler_method }
end
end
tags.each do |names|
names = Array(names)
handler_method = prefix + names.map { |name| name.capitalize }.join
names.each { |name| mapping[name] = handler_method }
end
return mapping
end
def self.start_tag_handlers
@start_tag_handlers ||= Hash.new('startTagOther')
end
# Declare what start tags this Phase handles. Can be called more than once.
#
# Example usage:
#
# handle_start 'html'
# # html start tags will be handled by a method named 'startTagHtml'
#
# handle_start %( base link meta )
# # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
#
# handle_start %( li dt dd ) => 'ListItem'
# # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
#
def self.handle_start(*tags)
start_tag_handlers.update tag_handlers('startTag', *tags)
end
def self.end_tag_handlers
@end_tag_handlers ||= Hash.new('endTagOther')
end
# Declare what end tags this Phase handles. Behaves like handle_start.
#
def self.handle_end(*tags)
end_tag_handlers.update tag_handlers('endTag', *tags)
end
def initialize(parser, tree)
@parser, @tree = parser, tree
end
def processEOF
@tree.generateImpliedEndTags
if @tree.openElements.length > 2
@parser.parseError(_('Unexpected end of file. Missing closing tags.'))
elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
# This happens for framesets or something?
@parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
elsif @parser.innerHTML and @tree.openElements.length > 1
# XXX This is not what the specification says. Not sure what to do here.
@parser.parseError(_('XXX innerHTML EOF'))
end
# Betting ends.
end
def processComment(data)
# For most phases the following is correct. Where it's not it will be
# overridden.
@tree.insertComment(data, @tree.openElements[-1])
end
def processDoctype(name, error)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
end
def processSpaceCharacters(data)
@tree.insertText(data)
end
def processStartTag(name, attributes)
send self.class.start_tag_handlers[name], name, attributes
end
def startTagHtml(name, attributes)
if @parser.firstStartTag == false and name == 'html'
@parser.parseError(_('html needs to be the first start tag.'))
end
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke @parser.parseError.
attributes.each do |attr, value|
unless @tree.openElements[0].attributes.has_key?(attr)
@tree.openElements[0].attributes[attr] = value
end
end
@parser.firstStartTag = false
end
def processEndTag(name)
send self.class.end_tag_handlers[name], name
end
def _(string)
string
end
def assert(value)
throw AssertionError.new unless value
end
def in_scope?(*args)
@tree.elementInScope(*args)
end
def remove_open_elements_until(name=nil)
finished = false
until finished
element = @tree.openElements.pop
finished = name.nil?? yield(element) : element.name == name
end
return element
end
end
end

View file

@ -0,0 +1,43 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class RootElementPhase < Phase
def processEOF
insertHtmlElement
@parser.phase.processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
insertHtmlElement
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.firstStartTag = true if name == 'html'
insertHtmlElement
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
insertHtmlElement
@parser.phase.processEndTag(name)
end
def insertHtmlElement
element = @tree.createElement('html', {})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class TrailingEndPhase < Phase
def processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processSpaceCharacters(data)
@parser.lastPhase.processSpaceCharacters(data)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_('Unexpected start tag (#{name}). Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
@parser.parseError(_('Unexpected end tag (#{name}). Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processEndTag(name)
end
end
end

View file

@ -3,14 +3,14 @@ require 'html5lib/constants'
module HTML5lib
# Provides a unicode stream of characters to the HTMLTokenizer.
# Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream
class HTMLInputStream
attr_accessor :queue, :charEncoding
attr_accessor :queue, :char_encoding
# Initialises the HTMLInputStream.
#
@ -27,523 +27,524 @@ class HTMLInputStream
# parseMeta - Look for a <meta> element containing encoding information
def initialize(source, options = {})
@encoding = nil
@parseMeta = true
@chardet = true
@encoding = nil
@parse_meta = true
@chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) }
options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur
@newLines = []
# List of where new lines occur
@new_lines = []
# Raw Stream
@rawStream = openStream(source)
# Raw Stream
@raw_stream = open_stream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
@NUM_BYTES_META = 512
#Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
@charEncoding = detectEncoding
else
@charEncoding = @encoding
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
@NUM_BYTES_META = 512
#Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
@char_encoding = detect_encoding
else
@char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
unless @char_encoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
rescue
end
end
# Read bytes from stream decoding them into Unicode
uString = @rawStream.read
unless @charEncoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
rescue
end
end
# Normalize newlines and null characters
uString.gsub!(/\r\n?/, "\n")
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Normalize newlines and null characters
uString.gsub!(/\r\n?/, "\n")
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
# Convert the unicode string into a list to be used as the data stream
@dataStream = uString
@queue = []
@queue = []
# Reset position in the list to read from
reset
# Reset position in the list to read from
reset
end
# Produces a file object from source.
#
# source can be either a file object, local filename or a string.
def openStream(source)
# Already an IO like object
if source.respond_to?(:read)
@stream = source
else
# Treat source as a string and wrap in StringIO
@stream = StringIO.new(source)
end
return @stream
def open_stream(source)
# Already an IO like object
if source.respond_to?(:read)
@stream = source
else
# Treat source as a string and wrap in StringIO
@stream = StringIO.new(source)
end
return @stream
end
def detectEncoding
def detect_encoding
#First look for a BOM
#This will also read past the BOM if present
encoding = detectBOM
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parseMeta
encoding = detectEncodingMeta
#First look for a BOM
#This will also read past the BOM if present
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if avaliable
if encoding.nil? and @chardet
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
@raw_stream = open_stream(buffer)
rescue LoadError
end
#Guess with chardet, if avaliable
if encoding.nil? and @chardet
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffer = @rawStream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
@rawStream = openStream(buffer)
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
if encodingSub.has_key?(encoding.downcase)
encoding = encodingSub[encoding.downcase]
end
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
end
return encoding
return encoding
end
# Attempts to detect at BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil
def detectBOM
bomDict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be'
}
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be'
}
# Go to beginning of file and read in 4 bytes
@rawStream.seek(0)
string = @rawStream.read(4)
return nil unless string
# Go to beginning of file and read in 4 bytes
@raw_stream.seek(0)
string = @raw_stream.read(4)
return nil unless string
# Try detecting the BOM using bytes from the string
encoding = bomDict[string[0...3]] # UTF-8
seek = 3
# Try detecting the BOM using bytes from the string
encoding = bom_dict[string[0...3]] # UTF-8
seek = 3
unless encoding
# Need to detect UTF-32 before UTF-16
encoding = bom_dict[string] # UTF-32
seek = 4
unless encoding
# Need to detect UTF-32 before UTF-16
encoding = bomDict[string] # UTF-32
seek = 4
unless encoding
encoding = bomDict[string[0...2]] # UTF-16
seek = 2
end
encoding = bom_dict[string[0...2]] # UTF-16
seek = 2
end
end
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
@rawStream.seek(encoding ? seek : 0)
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
@raw_stream.seek(encoding ? seek : 0)
return encoding
return encoding
end
# Report the encoding declared by the meta element
def detectEncodingMeta
parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
@rawStream.seek(0)
return parser.getEncoding
def detect_encoding_meta
parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
@raw_stream.seek(0)
return parser.get_encoding
end
def determineNewLines
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
@newLines.push(0)
(0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
def determine_new_lines
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
@new_lines.push(0)
(0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
end
# Returns (line, col) of the current position in the stream.
def position
# Generate list of new lines first time around
determineNewLines if @newLines.empty?
line = 0
tell = @tell
@newLines.each do |pos|
break unless pos < tell
line += 1
end
col = tell - @newLines[line-1] - 1
return [line, col]
# Generate list of new lines first time around
determine_new_lines if @new_lines.empty?
line = 0
tell = @tell
@new_lines.each do |pos|
break unless pos < tell
line += 1
end
col = tell - @new_lines[line-1] - 1
return [line, col]
end
# Resets the position in the stream back to the start.
def reset
@tell = 0
@tell = 0
end
# Read one character from the stream or queue if available. Return
# EOF when EOF is reached.
def char
unless @queue.empty?
return @queue.shift
else
begin
@tell += 1
return @dataStream[@tell - 1].chr
rescue
return :EOF
end
unless @queue.empty?
return @queue.shift
else
begin
@tell += 1
return @data_stream[@tell - 1].chr
rescue
return :EOF
end
end
end
# Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be
# any container that supports the in method being called on it.
def charsUntil(characters, opposite = false)
charStack = [char]
def chars_until(characters, opposite=false)
char_stack = [char]
unless charStack[0] == :EOF
while (characters.include? charStack[-1]) == opposite
unless @queue.empty?
# First from the queue
charStack.push(@queue.shift)
break if charStack[-1] == :EOF
else
# Then the rest
begin
charStack.push(@dataStream[@tell].chr)
@tell += 1
rescue
charStack.push(:EOF)
break
end
end
unless char_stack[0] == :EOF
while (characters.include? char_stack[-1]) == opposite
unless @queue.empty?
# First from the queue
char_stack.push(@queue.shift)
break if char_stack[-1] == :EOF
else
# Then the rest
begin
char_stack.push(@data_stream[@tell].chr)
@tell += 1
rescue
char_stack.push(:EOF)
break
end
end
end
end
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, charStack.pop)
return charStack.join('')
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, char_stack.pop)
return char_stack.join('')
end
end
end
# String-like object with an assosiated position and various extra methods
# If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String
# String-like object with an assosiated position and various extra methods
# If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String
attr_accessor :position
def initialize(value)
super(value)
@position = -1
super(value)
@position = -1
end
def each
while @position < length
@position += 1
yield self[@position]
end
while @position < length
@position += 1
yield self[@position]
end
rescue EOF
end
def currentByte
raise EOF if @position >= length
return self[@position].chr
def current_byte
raise EOF if @position >= length
return self[@position].chr
end
# Skip past a list of characters
def skip(chars = SPACE_CHARACTERS)
while chars.include?(currentByte)
@position += 1
end
def skip(chars=SPACE_CHARACTERS)
while chars.include?(current_byte)
@position += 1
end
end
# Look for a sequence of bytes at the start of a string. If the bytes
# are found return true and advance the position to the byte after the
# match. Otherwise return false and leave the position alone
def matchBytes(bytes, lower = false)
data = self[position ... position+bytes.length]
data.downcase! if lower
rv = (data == bytes)
@position += bytes.length if rv == true
return rv
def match_bytes(bytes, lower=false)
data = self[position ... position+bytes.length]
data.downcase! if lower
rv = (data == bytes)
@position += bytes.length if rv == true
return rv
end
# Look for the next sequence of bytes matching a given sequence. If
# a match is found advance the position to the last byte of the match
def jumpTo(bytes)
newPosition = self[position .. -1].index(bytes)
if newPosition
@position += (newPosition + bytes.length-1)
return true
else
raise EOF
end
def jump_to(bytes)
new_position = self[position .. -1].index(bytes)
if new_position
@position += (new_position + bytes.length-1)
return true
else
raise EOF
end
end
# Move the pointer so it points to the next byte in a set of possible
# bytes
def findNext(byteList)
until byteList.include?(currentByte)
@position += 1
end
def find_next(byte_list)
until byte_list.include?(current_byte)
@position += 1
end
end
end
end
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# string - the data to work on for encoding detection
def initialize(data)
@data = EncodingBytes.new(data.to_s)
@encoding = nil
@data = EncodingBytes.new(data.to_s)
@encoding = nil
end
@@method_dispatch = [
['<!--', :handleComment],
['<meta', :handleMeta],
['</', :handlePossibleEndTag],
['<!', :handleOther],
['<?', :handleOther],
['<', :handlePossibleStartTag]
['<!--', :handle_comment],
['<meta', :handle_meta],
['</', :handle_possible_end_tag],
['<!', :handle_other],
['<?', :handle_other],
['<', :handle_possible_start_tag]
]
def getEncoding
@data.each do |byte|
keepParsing = true
@@method_dispatch.each do |(key, method)|
if @data.matchBytes(key, lower = true)
keepParsing = send(method)
break
end
end
break unless keepParsing
def get_encoding
@data.each do |byte|
keep_parsing = true
@@method_dispatch.each do |(key, method)|
if @data.match_bytes(key, lower = true)
keep_parsing = send(method)
break
end
end
@encoding = @encoding.strip unless @encoding.nil?
return @encoding
break unless keep_parsing
end
@encoding = @encoding.strip unless @encoding.nil?
return @encoding
end
# Skip over comments
def handleComment
return @data.jumpTo('-->')
def handle_comment
return @data.jump_to('-->')
end
def handleMeta
# if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.currentByte)
def handle_meta
# if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.current_byte)
#We have a valid meta element we want to search for attributes
while true
#Try to find the next attribute after the current position
attr = getAttribute
#We have a valid meta element we want to search for attributes
while true
#Try to find the next attribute after the current position
attr = get_attribute
return true if attr.nil?
if attr[0] == 'charset'
tentativeEncoding = attr[1]
if HTML5lib.isValidEncoding(tentativeEncoding)
@encoding = tentativeEncoding
return false
end
elsif attr[0] == 'content'
contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentativeEncoding = contentParser.parse
if HTML5lib.isValidEncoding(tentativeEncoding)
@encoding = tentativeEncoding
return false
end
end
end
end
def handlePossibleStartTag
return handlePossibleTag(false)
end
def handlePossibleEndTag
@data.position+=1
return handlePossibleTag(true)
end
def handlePossibleTag(endTag)
unless ASCII_LETTERS.include?(@data.currentByte)
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag
@data.position -= 1
handleOther
end
return true
end
return true if attr.nil?
@data.findNext(SPACE_CHARACTERS + ['<', '>'])
if attr[0] == 'charset'
tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
end
end
end
if @data.currentByte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until getAttribute.nil?
def handle_possible_start_tag
return handle_possible_tag(false)
end
def handle_possible_end_tag
@data.position += 1
return handle_possible_tag(true)
end
def handle_possible_tag(end_tag)
unless ASCII_LETTERS.include?(@data.current_byte)
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if end_tag
@data.position -= 1
handle_other
end
return true
end
@data.find_next(SPACE_CHARACTERS + ['<', '>'])
if @data.current_byte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until get_attribute.nil?
end
return true
end
def handleOther
return @data.jumpTo('>')
def handle_other
return @data.jump_to('>')
end
# Return a name,value pair for the next attribute in the stream,
# Return a name,value pair for the next attribute in the stream,
# if one is found, or nil
def getAttribute
@data.skip(SPACE_CHARACTERS + ['/'])
def get_attribute
@data.skip(SPACE_CHARACTERS + ['/'])
if @data.currentByte == '<'
@data.position -= 1
return nil
elsif @data.currentByte == '>'
return nil
end
if @data.current_byte == '<'
@data.position -= 1
return nil
elsif @data.current_byte == '>'
return nil
end
attrName = []
attrValue = []
spaceFound = false
#Step 5 attribute name
while true
if @data.currentByte == '=' and attrName:
break
elsif SPACE_CHARACTERS.include?(@data.currentByte)
spaceFound = true
break
elsif ['/', '<', '>'].include?(@data.currentByte)
return [attrName.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrName.push(@data.currentByte.downcase)
else
attrName.push(@data.currentByte)
end
#Step 6
@data.position += 1
end
#Step 7
if spaceFound
@data.skip
#Step 8
unless @data.currentByte == '='
@data.position -= 1
return [attrName.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
#Step 9
@data.position += 1
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.currentByte)
#11.1
quoteChar = @data.currentByte
while true
@data.position+=1
#11.3
if @data.currentByte == quoteChar
@data.position += 1
return [attrName.join(''), attrValue.join('')]
#11.4
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
#11.5
else
attrValue.push(@data.currentByte)
end
end
elsif ['>', '<'].include?(@data.currentByte)
return [attrName.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
attr_name = []
attr_value = []
space_found = false
#Step 5 attribute name
while true
if @data.current_byte == '=' and attr_name:
break
elsif SPACE_CHARACTERS.include?(@data.current_byte)
space_found = true
break
elsif ['/', '<', '>'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_name.push(@data.current_byte.downcase)
else
attrValue.push(@data.currentByte)
attr_name.push(@data.current_byte)
end
#Step 6
@data.position += 1
end
#Step 7
if space_found
@data.skip
#Step 8
unless @data.current_byte == '='
@data.position -= 1
return [attr_name.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
#Step 9
@data.position += 1
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.current_byte)
#11.1
quote_char = @data.current_byte
while true
@data.position +=1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
return [attrName.join(''), attrValue.join('')]
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
else
attrValue.push(@data.currentByte)
end
@data.position+=1
#11.3
if @data.current_byte == quote_char
@data.position += 1
return [attr_name.join(''), attr_value.join('')]
#11.4
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
#11.5
else
attr_value.push(@data.current_byte)
end
end
elsif ['>', '<'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
while true
@data.position += 1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
return [attr_name.join(''), attr_value.join('')]
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
end
end
end
end
class ContentAttrParser
class ContentAttrParser
def initialize(data)
@data = data
@data = data
end
def parse
begin
#Skip to the first ";"
@data.position = 0
@data.jumpTo(';')
@data.position += 1
@data.skip
#Check if the attr name is charset
#otherwise return
@data.jumpTo('charset')
@data.position += 1
@data.skip
unless @data.currentByte == '='
#If there is no = sign keep looking for attrs
return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.currentByte)
quoteMark = @data.currentByte
@data.position += 1
oldPosition = @data.position
@data.jumpTo(quoteMark)
return @data[oldPosition ... @data.position]
else
#Unquoted value
oldPosition = @data.position
begin
@data.findNext(SPACE_CHARACTERS)
return @data[oldPosition ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[oldPosition .. -1]
end
end
rescue EOF
return nil
begin
#Skip to the first ";"
@data.position = 0
@data.jump_to(';')
@data.position += 1
@data.skip
#Check if the attr name is charset
#otherwise return
@data.jump_to('charset')
@data.position += 1
@data.skip
unless @data.current_byte == '='
#If there is no = sign keep looking for attrs
return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.current_byte)
quote_mark = @data.current_byte
@data.position += 1
old_position = @data.position
@data.jump_to(quote_mark)
return @data[old_position ... @data.position]
else
#Unquoted value
old_position = @data.position
begin
@data.find_next(SPACE_CHARACTERS)
return @data[old_position ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[old_position .. -1]
end
end
rescue EOF
return nil
end
end
end
end
# Determine if a string is a supported encoding
def self.isValidEncoding(encoding)
# Determine if a string is a supported encoding
def self.is_valid_encoding(encoding)
(not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
end
end
end

View file

@ -16,126 +16,126 @@ require 'html5lib/constants'
module HTML5lib
# liberal XML parser
class XMLParser < HTMLParser
# liberal XML parser
class XMLParser < HTMLParser
def initialize(options={})
super options
@phases[:initial] = XmlRootPhase.new(self, @tree)
def initialize(options = {})
super options
@phases[:initial] = XmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag
# We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
if token[:type] == :StartTag or token[:type] == :EmptyTag
# We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten]
token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag
@phase.processStartTag(token[:name], token[:data])
token[:data] = {}
token[:type] = :EndTag
end
elsif token[:type] == :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
elsif token[:type] == :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
token[:data] = token[:data][7 ... -2]
end
# For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag
@phase.processStartTag(token[:name], token[:data])
token[:data] = {}
token[:type] = :EndTag
end
return token
elsif token[:type] == :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
elsif token[:type] == :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
token[:data] = token[:data][7 ... -2]
end
end
return token
end
end
end
# liberal XMTHML parser
class XHTMLParser < XMLParser
# liberal XMTHML parser
class XHTMLParser < XMLParser
def initialize(options={})
super options
@phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
def initialize(options = {})
super options
@phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
super(token)
super(token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag and \
not VOID_ELEMENTS.include? token[:name] and \
token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag and \
not VOID_ELEMENTS.include? token[:name] and \
token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
return token
return token
end
end
end
class XhmlRootPhase < RootElementPhase
class XhmlRootPhase < RootElementPhase
def insertHtmlElement
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
end
end
class XmlRootPhase < Phase
class XmlRootPhase < Phase
# Prime the Xml parser
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes)
@tree.openElements.push(@tree.document)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree)
@tree.openElements.push(@tree.document)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree)
end
def endTagOther(name)
super
@tree.openElements.pop
super
@tree.openElements.pop
end
end
end
class XmlElementPhase < Phase
class XmlElementPhase < Phase
# Generic handling for all XML elements
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
end
def endTagOther(name)
for node in @tree.openElements.reverse
if node.name == name
{} while @tree.openElements.pop != node
break
else
@parser.parseError
end
for node in @tree.openElements.reverse
if node.name == name
{} while @tree.openElements.pop != node
break
else
@parser.parseError
end
end
end
def processCharacters(data)
@tree.insertText(data)
@tree.insertText(data)
end
end
end
end

View file

@ -6,87 +6,87 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
class HTMLSanitizer < HTMLTokenizer
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
legend li map menu ol optgroup option p pre q s samp select small span
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
ul var]
button caption center cite code col colgroup dd del dfn dir div dl dt
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
legend li map menu ol optgroup option p pre q s samp select small span
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
ul var]
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
msubsup msup mtable mtd mtext mtr munder munderover none]
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
msubsup msup mtable mtd mtext mtr munder munderover none]
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
circle defs desc ellipse font-face font-face-name font-face-src g
glyph hkern image linearGradient line marker metadata missing-glyph
mpath path polygon polyline radialGradient rect set stop svg switch
text title tspan use]
circle defs desc ellipse font-face font-face-name font-face-src g
glyph hkern image linearGradient line marker metadata missing-glyph
mpath path polygon polyline radialGradient rect set stop svg switch
text title tspan use]
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
align alt axis border cellpadding cellspacing char charoff charset
checked cite class clear cols colspan color compact coords datetime
dir disabled enctype for frame headers height href hreflang hspace id
ismap label lang longdesc maxlength media method multiple name nohref
noshade nowrap prompt readonly rel rev rows rowspan rules scope
selected shape size span src start style summary tabindex target title
type usemap valign value vspace width xml:lang]
align alt axis border cellpadding cellspacing char charoff charset
checked cite class clear cols colspan color compact coords datetime
dir disabled enctype for frame headers height href hreflang hspace id
ismap label lang longdesc maxlength media method multiple name nohref
noshade nowrap prompt readonly rel rev rows rowspan rules scope
selected shape size span src start style summary tabindex target title
type usemap valign value vspace width xml:lang]
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
columnalign columnlines columnspacing columnspan depth display
displaystyle equalcolumns equalrows fence fontstyle fontweight frame
height linethickness lspace mathbackground mathcolor mathvariant
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
rowspacing rowspan rspace scriptlevel selection separator stretchy
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
columnalign columnlines columnspacing columnspan depth display
displaystyle equalcolumns equalrows fence fontstyle fontweight frame
height linethickness lspace mathbackground mathcolor mathvariant
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
rowspacing rowspan rspace scriptlevel selection separator stretchy
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
arabic-form ascent attributeName attributeType baseProfile bbox begin
by calcMode cap-height class color color-rendering content cx cy d dx
dy descent display dur end fill fill-rule font-family font-size
font-stretch font-style font-variant font-weight from fx fy g1 g2
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
ideographic k keyPoints keySplines keyTimes lang marker-end
marker-mid marker-start markerHeight markerUnits markerWidth
mathematical max min name offset opacity orient origin
overline-position overline-thickness panose-1 path pathLength points
preserveAspectRatio r refX refY repeatCount repeatDur
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
stemv stop-color stop-opacity strikethrough-position
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
stroke-width systemLanguage target text-anchor to transform type u1
u2 underline-position underline-thickness unicode unicode-range
units-per-em values version viewBox visibility width widths x
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan]
arabic-form ascent attributeName attributeType baseProfile bbox begin
by calcMode cap-height class color color-rendering content cx cy d dx
dy descent display dur end fill fill-rule font-family font-size
font-stretch font-style font-variant font-weight from fx fy g1 g2
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
ideographic k keyPoints keySplines keyTimes lang marker-end
marker-mid marker-start markerHeight markerUnits markerWidth
mathematical max min name offset opacity orient origin
overline-position overline-thickness panose-1 path pathLength points
preserveAspectRatio r refX refY repeatCount repeatDur
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
stemv stop-color stop-opacity strikethrough-position
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
stroke-width systemLanguage target text-anchor to transform type u1
u2 underline-position underline-thickness unicode unicode-range
units-per-em values version viewBox visibility width widths x
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
border-bottom-color border-collapse border-color border-left-color
border-right-color border-top-color clear color cursor direction
display elevation float font font-family font-size font-style
font-variant font-weight height letter-spacing line-height overflow
pause pause-after pause-before pitch pitch-range richness speak
speak-header speak-numeral speak-punctuation speech-rate stress
text-align text-decoration text-indent unicode-bidi vertical-align
voice-family volume white-space width]
border-bottom-color border-collapse border-color border-left-color
border-right-color border-top-color clear color cursor direction
display elevation float font font-family font-size font-style
font-variant font-weight height letter-spacing line-height overflow
pause pause-after pause-before pitch pitch-range richness speak
speak-header speak-numeral speak-punctuation speech-rate stress
text-align text-decoration text-indent unicode-bidi vertical-align
voice-family volume white-space width]
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
brown center collapse dashed dotted fuchsia gray green !important
italic left lime maroon medium none navy normal nowrap olive pointer
purple red right solid silver teal top transparent underline white
yellow]
brown center collapse dashed dotted fuchsia gray green !important
italic left lime maroon medium none navy normal nowrap olive pointer
purple red right solid silver teal top transparent underline white
yellow]
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
# subclasses may define their own versions of these constants
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
@ -104,75 +104,74 @@ class HTMLSanitizer < HTMLTokenizer
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
# => <a>Click here for $100</a>
def each
super do |token|
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
if token.has_key? :data
attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr
end
end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
super do |token|
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
if token.has_key? :data
attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr
end
else
yield token
end
end
end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
elsif %w[background border margin padding].include?(prop.split('-')[0])
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
end
style = clean.join(' ')
else
yield token
end
end
end
end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
elsif %w[background border margin padding].include?(prop.split('-')[0])
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
end
end
style = clean.join(' ')
end
end
end

File diff suppressed because it is too large Load diff

View file

@ -1,21 +1,21 @@
module HTML5lib
module TreeBuilders
module TreeBuilders
def self.getTreeBuilder(name)
case name.to_s.downcase
def self.getTreeBuilder(name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder
require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder
require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder
require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
raise "Unknown TreeBuilder #{name}"
end
end
end
end
end
end

View file

@ -4,166 +4,166 @@ require 'html5lib/constants'
module HTML5lib
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
module TreeBuilders
module Base
module TreeBuilders
module Base
class Node
# The parent of the current node (or nil for the document node)
attr_accessor :parent
class Node
# The parent of the current node (or nil for the document node)
attr_accessor :parent
# a list of child nodes of the current node. This must
# include all elements but not necessarily other node types
attr_accessor :childNodes
# a list of child nodes of the current node. This must
# include all elements but not necessarily other node types
attr_accessor :childNodes
# A list of miscellaneous flags that can be set on the node
attr_accessor :_flags
# A list of miscellaneous flags that can be set on the node
attr_accessor :_flags
def initialize(name)
@parent = nil
@childNodes = []
@_flags = []
end
# Insert node as a child of the current node
def appendChild(node)
raise NotImplementedError
end
# Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore = nil)
raise NotImplementedError
end
# Insert node as a child of the current node, before refNode in the
# list of child nodes. Raises ValueError if refNode is not a child of
# the current node
def insertBefore(node, refNode)
raise NotImplementedError
end
# Remove node from the children of the current node
def removeChild(node)
raise NotImplementedError
end
# Move all the children of the current node to newParent.
# This is needed so that trees that don't store text as nodes move the
# text in the correct way
def reparentChildren(newParent)
#XXX - should this method be made more general?
@childNodes.each { |child| newParent.appendChild(child) }
@childNodes = []
end
# Return a shallow copy of the current node i.e. a node with the same
# name and attributes but with no parent or child nodes
def cloneNode
raise NotImplementedError
end
# Return true if the node has children or text, false otherwise
def hasContent
raise NotImplementedError
end
end
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :formPointer
# Class to use for document root
documentClass = nil
# Class to use for HTML elements
elementClass = nil
# Class to use for comments
commentClass = nil
# Class to use for doctypes
doctypeClass = nil
# Fragment class
fragmentClass = nil
def initialize
reset
end
def reset
@openElements = []
@activeFormattingElements = []
#XXX - rename these to headElement, formElement
@headPointer = nil
@formPointer = nil
self.insertFromTable = false
@document = @documentClass.new
end
def elementInScope(target, tableVariant = false)
# Exit early when possible.
return true if @openElements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to
# [-2] at the end...
@openElements.reverse.each do |element|
if element.name == target
return true
elsif element.name == 'table'
return false
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
return false
elsif element.name == 'html'
return false
end
def initialize(name)
@parent = nil
@childNodes = []
@_flags = []
end
assert false # We should never reach this point
end
def reconstructActiveFormattingElements
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Insert node as a child of the current node
def appendChild(node)
raise NotImplementedError
end
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
# Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore=nil)
raise NotImplementedError
end
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry)
# Insert node as a child of the current node, before refNode in the
# list of child nodes. Raises ValueError if refNode is not a child of
# the current node
def insertBefore(node, refNode)
raise NotImplementedError
end
# Step 6
until entry == Marker or @openElements.include?(entry)
# Remove node from the children of the current node
def removeChild(node)
raise NotImplementedError
end
# Move all the children of the current node to newParent.
# This is needed so that trees that don't store text as nodes move the
# text in the correct way
def reparentChildren(newParent)
#XXX - should this method be made more general?
@childNodes.each { |child| newParent.appendChild(child) }
@childNodes = []
end
# Return a shallow copy of the current node i.e. a node with the same
# name and attributes but with no parent or child nodes
def cloneNode
raise NotImplementedError
end
# Return true if the node has children or text, false otherwise
def hasContent
raise NotImplementedError
end
end
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :formPointer
# Class to use for document root
documentClass = nil
# Class to use for HTML elements
elementClass = nil
# Class to use for comments
commentClass = nil
# Class to use for doctypes
doctypeClass = nil
# Fragment class
fragmentClass = nil
def initialize
reset
end
def reset
@openElements = []
@activeFormattingElements = []
#XXX - rename these to headElement, formElement
@headPointer = nil
@formPointer = nil
self.insertFromTable = false
@document = @documentClass.new
end
def elementInScope(target, tableVariant=false)
# Exit early when possible.
return true if @openElements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to
# [-2] at the end...
@openElements.reverse.each do |element|
if element.name == target
return true
elsif element.name == 'table'
return false
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
return false
elsif element.name == 'html'
return false
end
end
assert false # We should never reach this point
end
def reconstructActiveFormattingElements
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry)
# Step 6
until entry == Marker or @openElements.include?(entry)
# Step 5: let entry be one earlier in the list.
i -= 1
begin
entry = @activeFormattingElements[i]
entry = @activeFormattingElements[i]
rescue
# Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that.
break
# Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that.
break
end
end
while true
end
while true
# Step 7
i += 1
@ -178,153 +178,153 @@ class TreeBuilder
# Step 11
break if element == @activeFormattingElements[-1]
end
end
end
def clearActiveFormattingElements
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
end
def clearActiveFormattingElements
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
end
# Check if an element exists between the end of the active
# formatting elements and the last marker. If it does, return it, else
# return false
def elementInActiveFormattingElements(name)
@activeFormattingElements.reverse.each do |element|
# Check if an element exists between the end of the active
# formatting elements and the last marker. If it does, return it, else
# return false
def elementInActiveFormattingElements(name)
@activeFormattingElements.reverse.each do |element|
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
break if element == Marker
return element if element.name == name
end
return false
end
return false
end
def insertDoctype(name)
@document.appendChild(@doctypeClass.new(name))
end
def insertDoctype(name)
@document.appendChild(@doctypeClass.new(name))
end
def insertComment(data, parent = nil)
parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data))
end
# Create an element but don't insert it anywhere
def createElement(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
return element
end
def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data))
end
# Create an element but don't insert it anywhere
def createElement(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
return element
end
# Switch the function used to insert an element from the
# normal one to the misnested table one and back again
def insertFromTable=(value)
@insertFromTable = value
@insertElement = value ? :insertElementTable : :insertElementNormal
end
# Switch the function used to insert an element from the
# normal one to the misnested table one and back again
def insertFromTable=(value)
@insertFromTable = value
@insertElement = value ? :insertElementTable : :insertElementNormal
end
def insertElement(name, attributes)
send(@insertElement, name, attributes)
end
def insertElement(name, attributes)
send(@insertElement, name, attributes)
end
def insertElementNormal(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
@openElements[-1].appendChild(element)
@openElements.push(element)
return element
end
def insertElementNormal(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
@openElements[-1].appendChild(element)
@openElements.push(element)
return element
end
# Create an element and insert it into the tree
def insertElementTable(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
# Create an element and insert it into the tree
def insertElementTable(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition
if insertBefore.nil?
parent.appendChild(element)
parent.appendChild(element)
else
parent.insertBefore(element, insertBefore)
parent.insertBefore(element, insertBefore)
end
@openElements.push(element)
else
else
return insertElementNormal(name, attributes)
end
return element
end
return element
end
def insertText(data, parent = nil)
parent = @openElements[-1] if parent.nil?
def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
parent.insertText(data)
else
else
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition
parent.insertText(data, insertBefore)
end
end
end
# Get the foster parent element, and sibling to insert before
# (or nil) when inserting a misnested table node
def getTableMisnestedNodePosition
#The foster parent element is the one which comes before the most
#recently opened table element
#XXX - this is really inelegant
lastTable = nil
fosterParent = nil
insertBefore = nil
@openElements.reverse.each do |element|
# Get the foster parent element, and sibling to insert before
# (or nil) when inserting a misnested table node
def getTableMisnestedNodePosition
#The foster parent element is the one which comes before the most
#recently opened table element
#XXX - this is really inelegant
lastTable = nil
fosterParent = nil
insertBefore = nil
@openElements.reverse.each do |element|
if element.name == "table"
lastTable = element
break
lastTable = element
break
end
end
if lastTable
end
if lastTable
#XXX - we should really check that this parent is actually a
#node here
if lastTable.parent
fosterParent = lastTable.parent
insertBefore = lastTable
fosterParent = lastTable.parent
insertBefore = lastTable
else
fosterParent = @openElements[@openElements.index(lastTable) - 1]
fosterParent = @openElements[@openElements.index(lastTable) - 1]
end
else
else
fosterParent = @openElements[0]
end
return fosterParent, insertBefore
end
return fosterParent, insertBefore
end
def generateImpliedEndTags(exclude = nil)
name = @openElements[-1].name
def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@openElements.pop
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
generateImpliedEndTags(exclude)
end
end
end
def getDocument
@document
end
def getFragment
#assert @innerHTML
fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment)
return fragment
end
def getDocument
@document
end
def getFragment
#assert @innerHTML
fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment)
return fragment
end
# Serialize the subtree of node in the format required by unit tests
# node - the node from which to start serializing
def testSerializer(node)
raise NotImplementedError
end
# Serialize the subtree of node in the format required by unit tests
# node - the node from which to start serializing
def testSerializer(node)
raise NotImplementedError
end
end
end
end
end
end
end
end

View file

@ -3,209 +3,212 @@ require 'hpricot'
require 'forwardable'
module HTML5lib
module TreeBuilders
module Hpricot
module TreeBuilders
module Hpricot
class Node < Base::Node
class Node < Base::Node
extend Forwardable
extend Forwardable
def_delegators :@hpricot, :name
def_delegators :@hpricot, :name
attr_accessor :hpricot
attr_accessor :hpricot
def initialize(name)
super(name)
@hpricot = self.class.hpricot_class.new name
end
def initialize(name)
super(name)
@hpricot = self.class.hpricot_class.new name
end
def appendChild(node)
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
def appendChild(node)
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
else
else
childNodes << node
hpricot.children << node.hpricot
end
node.parent = self
end
node.parent = self
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.parent = nil
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.parent = nil
end
def insertText(data, before = nil)
if before
def insertText(data, before=nil)
if before
insertBefore(TextNode.new(data), before)
else
else
appendChild(TextNode.new(data))
end
end
end
def insertBefore(node, refNode)
index = childNodes.index(refNode)
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
def insertBefore(node, refNode)
index = childNodes.index(refNode)
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else
else
childNodes.insert(index, node)
end
end
end
def hasContent
childNodes.any?
end
end
def hasContent
childNodes.any?
end
end
class Element < Node
def self.hpricot_class
::Hpricot::Elem
end
class Element < Node
def self.hpricot_class
::Hpricot::Elem
end
def initialize(name)
super(name)
def initialize(name)
super(name)
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
end
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
end
def name
@hpricot.stag.name
end
def name
@hpricot.stag.name
end
def cloneNode
attributes.inject(self.class.new(name)) do |node, (name, value)|
def cloneNode
attributes.inject(self.class.new(name)) do |node, (name, value)|
node.hpricot[name] = value
node
end
end
end
# A call to Hpricot::Elem#raw_attributes is built dynamically,
# so alterations to the returned value (a hash) will be lost.
#
# AttributeProxy works around this by forwarding :[]= calls
# to the raw_attributes accessor on the element start tag.
#
class AttributeProxy
def initialize(hpricot)
# A call to Hpricot::Elem#raw_attributes is built dynamically,
# so alterations to the returned value (a hash) will be lost.
#
# AttributeProxy works around this by forwarding :[]= calls
# to the raw_attributes accessor on the element start tag.
#
class AttributeProxy
def initialize(hpricot)
@hpricot = hpricot
end
def []=(k, v)
end
def []=(k, v)
@hpricot.stag.send(stag_attributes_method)[k] = v
end
def stag_attributes_method
end
def stag_attributes_method
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
end
def method_missing(*a, &b)
end
def method_missing(*a, &b)
@hpricot.attributes.send(*a, &b)
end
end
end
def attributes
AttributeProxy.new(@hpricot)
end
def attributes
AttributeProxy.new(@hpricot)
end
def attributes=(attrs)
attrs.each { |name, value| @hpricot[name] = value }
end
def attributes=(attrs)
attrs.each { |name, value| @hpricot[name] = value }
end
def printTree(indent = 0)
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
attributes.each do |name, value|
def printTree(indent=0)
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
attributes.each do |name, value|
next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end
end
end
class Document < Node
def self.hpricot_class
::Hpricot::Doc
end
class Document < Node
def self.hpricot_class
::Hpricot::Doc
end
def initialize
super(nil)
end
def initialize
super(nil)
end
def printTree(indent = 0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end
end
def printTree(indent=0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end
end
class DocumentType < Node
def self.hpricot_class
::Hpricot::DocType
end
class DocumentType < Node
def self.hpricot_class
::Hpricot::DocType
end
def initialize(name)
begin
def initialize(name)
begin
super(name)
rescue ArgumentError # needs 3...
rescue ArgumentError # needs 3...
end
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end
def printTree(indent=0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
class DocumentFragment < Element
def initialize
super('')
end
def printTree(indent=0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end
end
class TextNode < Node
def initialize(data)
@hpricot = ::Hpricot::Text.new(data)
end
def printTree(indent=0)
"\n|#{' ' * indent}\"#{hpricot.content}\""
end
end
class CommentNode < Node
def self.hpricot_class
::Hpricot::Comment
end
def printTree(indent=0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer(node)
node.printTree
end
def getDocument
@document.hpricot
end
def getFragment
@document = super
return @document.hpricot.children
end
end
def printTree(indent = 0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
class DocumentFragment < Element
def initialize
super('')
end
def printTree(indent = 0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end
end
class TextNode < Node
def initialize(data)
@hpricot = ::Hpricot::Text.new(data)
end
def printTree(indent = 0)
"\n|#{' ' * indent}\"#{hpricot.content}\""
end
end
class CommentNode < Node
def self.hpricot_class
::Hpricot::Comment
end
def printTree(indent = 0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer(node)
node.printTree
end
def getDocument
@document.hpricot
end
def getFragment
@document = super
return @document.hpricot.children
end
end
end
end
end
end

View file

@ -3,189 +3,189 @@ require 'rexml/document'
require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
module TreeBuilders
module REXMLTree
class Node < Base::Node
extend Forwardable
def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj
class Node < Base::Node
extend Forwardable
def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj
def initialize name
super name
@rxobj = self.class.rxclass.new name
end
def initialize name
super name
@rxobj = self.class.rxclass.new name
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].rxobj.value =
childNodes[-1].rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.raw = true
else
else
childNodes.push node
rxobj.add node.rxobj
end
node.parent = self
end
node.parent = self
end
def removeChild node
childNodes.delete node
rxobj.delete node.rxobj
node.parent = nil
end
def removeChild node
childNodes.delete node
rxobj.delete node.rxobj
node.parent = nil
end
def insertText data, before=nil
if before
def insertText data, before=nil
if before
insertBefore TextNode.new(data), before
else
else
appendChild TextNode.new(data)
end
end
end
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
childNodes[index-1].rxobj.value =
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.raw = true
else
else
childNodes.insert index, node
end
end
end
def hasContent
return (childNodes.length > 0)
end
end
def hasContent
return (childNodes.length > 0)
end
end
class Element < Node
def self.rxclass
REXML::Element
end
class Element < Node
def self.rxclass
REXML::Element
end
def initialize name
super name
end
def initialize name
super name
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode
end
def attributes= value
value.each {|name,value| rxobj.attributes[name]=value}
end
def attributes= value
value.each {|name, value| rxobj.attributes[name]=value}
end
def printTree indent=0
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
for name, value in attributes
def printTree indent=0
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
for name, value in attributes
next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end
for child in childNodes
end
for child in childNodes
tree += child.printTree(indent)
end
return tree
end
return tree
end
end
end
class Document < Node
def self.rxclass
REXML::Document
end
class Document < Node
def self.rxclass
REXML::Document
end
def initialize
super nil
end
def initialize
super nil
end
def appendChild node
if node.kind_of? Element and node.name == 'html'
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
end
super node
end
def appendChild node
if node.kind_of? Element and node.name == 'html'
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
end
super node
end
def printTree indent=0
tree = "#document"
for child in childNodes
def printTree indent=0
tree = "#document"
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
return tree
end
end
end
class DocumentType < Node
def self.rxclass
REXML::DocType
end
class DocumentType < Node
def self.rxclass
REXML::DocType
end
def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
end
end
def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
end
end
class DocumentFragment < Element
def initialize
super nil
end
class DocumentFragment < Element
def initialize
super nil
end
def printTree indent=0
tree = ""
for child in childNodes
def printTree indent=0
tree = ""
for child in childNodes
tree += child.printTree(indent+2)
end
return tree
end
return tree
end
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\""
end
end
class CommentNode < Node
def self.rxclass
REXML::Comment
end
def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getDocument
@document.rxobj
end
def getFragment
@document = super
return @document.rxobj.children
end
end
end
end
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\""
end
end
class CommentNode < Node
def self.rxclass
REXML::Comment
end
def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getDocument
@document.rxobj
end
def getFragment
@document = super
return @document.rxobj.children
end
end
end
end
end
end

View file

@ -1,178 +1,178 @@
require 'html5lib/treebuilders/base'
module HTML5lib
module TreeBuilders
module SimpleTree
module TreeBuilders
module SimpleTree
class Node < Base::Node
# Node representing an item in the tree.
# name - The tag name associated with the node
attr_accessor :name
class Node < Base::Node
# Node representing an item in the tree.
# name - The tag name associated with the node
attr_accessor :name
# The value of the current node (applies to text nodes and
# comments
attr_accessor :value
# The value of the current node (applies to text nodes and
# comments
attr_accessor :value
# a dict holding name, value pairs for attributes of the node
attr_accessor :attributes
# a dict holding name, value pairs for attributes of the node
attr_accessor :attributes
def initialize name
super
@name = name
@value = nil
@attributes = {}
end
def initialize name
super
@name = name
@value = nil
@attributes = {}
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].value += node.value
else
else
childNodes.push node
end
node.parent = self
end
node.parent = self
end
def removeChild node
childNodes.delete node
node.parent = nil
end
def removeChild node
childNodes.delete node
node.parent = nil
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode.value = value
newNode
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode.value = value
newNode
end
def insertText data, before=nil
if before
def insertText data, before=nil
if before
insertBefore TextNode.new(data), before
else
else
appendChild TextNode.new(data)
end
end
end
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
childNodes[index-1].value += node.value
else
else
childNodes.insert index, node
end
end
end
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
for child in childNodes
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
return tree
end
def hasContent
return (childNodes.length > 0)
end
end
def hasContent
return (childNodes.length > 0)
end
end
class Element < Node
def to_s
"<%s>" % name
end
class Element < Node
def to_s
"<%s>" % name
end
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
indent += 2
for name, value in attributes
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
indent += 2
for name, value in attributes
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
end
for child in childNodes
end
for child in childNodes
tree += child.printTree(indent)
end
return tree
end
return tree
end
end
end
class Document < Node
def to_s
"#document"
end
class Document < Node
def to_s
"#document"
end
def initialize
super nil
end
def initialize
super nil
end
def printTree indent=0
tree = to_s
for child in childNodes
def printTree indent=0
tree = to_s
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
return tree
end
end
end
class DocumentType < Node
def to_s
"<!DOCTYPE %s>" % name
end
end
class DocumentType < Node
def to_s
"<!DOCTYPE %s>" % name
end
end
class DocumentFragment < Element
def initialize
super nil
end
class DocumentFragment < Element
def initialize
super nil
end
def printTree indent=0
tree = ""
for child in childNodes
def printTree indent=0
tree = ""
for child in childNodes
tree += child.printTree(indent+2)
end
return tree
end
return tree
end
class TextNode < Node
def initialize value
super nil
@value = value
end
def to_s
'"%s"' % value
end
end
class CommentNode < Node
def initialize value
super nil
@value = value
end
def to_s
"<!-- %s -->" % value
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getFragment
@document = super
return @document.childNodes
end
end
end
end
class TextNode < Node
def initialize value
super nil
@value = value
end
def to_s
'"%s"' % value
end
end
class CommentNode < Node
def initialize value
super nil
@value = value
end
def to_s
"<!-- %s -->" % value
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getFragment
@document = super
return @document.childNodes
end
end
end
end
end
end

View file

@ -7,5 +7,17 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory)
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
end
begin
require 'jsonx'
rescue LoadError
class JSON
def self.parse json
json.gsub! /"\s*:/, '"=>'
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
eval json
end
end
end

View file

@ -11,7 +11,7 @@ begin
def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.charEncoding.downcase
assert_equal 'big5', stream.char_encoding.downcase
end
end
rescue LoadError
@ -28,7 +28,7 @@ end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.charEncoding.downcase, input
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
end

View file

@ -6,19 +6,19 @@ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
document = parser.parse(input.chomp).root
if not expected
expected = input.chomp.gsub(XMLELEM,SORTATTRS)
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
assert_equal(expected, output)
else
assert_equal(expected, document.to_s.gsub(/'/,'"'))
end
document = parser.parse(input.chomp).root
if not expected
expected = input.chomp.gsub(XMLELEM,SORTATTRS)
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
assert_equal(expected, output)
else
assert_equal(expected, document.to_s.gsub(/'/,'"'))
end
end
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
assert_xml_equal(input, expected, parser)
assert_xml_equal(input, expected, parser)
end
class BasicXhtml5Test < Test::Unit::TestCase
@ -27,8 +27,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
assert_xhtml_equal(
'<title>Xhtml</title><b><i>content</b></i>',
'<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>Xhtml</title></head>' +
'<body><b><i>content</i></b></body>' +
'<head><title>Xhtml</title></head>' +
'<body><b><i>content</i></b></body>' +
'</html>')
end
@ -36,8 +36,8 @@ class BasicXhtml5Test < Test::Unit::TestCase
assert_xhtml_equal(
'<title>mdash</title>A &mdash B',
'<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>mdash</title></head>' +
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
'<head><title>mdash</title></head>' +
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
'</html>')
end
end
@ -70,24 +70,24 @@ class OpmlTest < Test::Unit::TestCase
def test_mixedCaseElement
assert_xml_equal(
'<opml version="1.0">' +
'<head><ownerName>Dave Winer</ownerName></head>' +
'<head><ownerName>Dave Winer</ownerName></head>' +
'</opml>')
end
def test_mixedCaseAttribute
assert_xml_equal(
'<opml version="1.0">' +
'<body><outline isComment="true"/></body>' +
'<body><outline isComment="true"/></body>' +
'</opml>')
end
def test_malformed
assert_xml_equal(
'<opml version="1.0">' +
'<body><outline text="Odds & Ends"/></body>' +
'<body><outline text="Odds & Ends"/></body>' +
'</opml>',
'<opml version="1.0">' +
'<body><outline text="Odds &amp; Ends"/></body>' +
'<body><outline text="Odds &amp; Ends"/></body>' +
'</opml>')
end
end
@ -100,45 +100,45 @@ class XhtmlTest < Test::Unit::TestCase
<head><title>MathML</title></head>
<body>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mrow>
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mi>x</mi>
<mo>=</mo>
<mrow>
<mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mfrac>
<mrow>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>-</mo>
<mrow>
<mrow>
<mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mrow>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>-</mo>
<mrow>
<mn>4</mn>
<mo>&#8290;</mo>
<mi>a</mi>
<mo>&#8290;</mo>
<mi>c</mi>
</mrow>
</mrow>
</msqrt>
<mn>4</mn>
<mo>&#8290;</mo>
<mi>a</mi>
<mo>&#8290;</mo>
<mi>c</mi>
</mrow>
<mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
</math>
</body></html>
EOX
@ -150,11 +150,11 @@ EOX
<head><title>SVG</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
</path>
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
</circle>
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
</path>
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
</circle>
</svg>
</body></html>
@ -167,24 +167,24 @@ EOX
<head><title>XLINK</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<defs xmlns:l="http://www.w3.org/1999/xlink">
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
<stop stop-color="#FE8"/>
<stop stop-color="#D70" offset="1"/>
</radialGradient>
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
</defs>
<g stroke="#940">
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
<defs xmlns:l="http://www.w3.org/1999/xlink">
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
<stop stop-color="#FE8"/>
<stop stop-color="#D70" offset="1"/>
</radialGradient>
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
</defs>
<g stroke="#940">
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
</g>
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
</g>
</svg>
</body></html>
EOX

View file

@ -7,8 +7,8 @@ require 'html5lib/html5parser'
$tree_types_to_test = ['simpletree', 'rexml']
begin
require 'hpricot'
$tree_types_to_test.push('hpricot')
require 'hpricot'
$tree_types_to_test.push('hpricot')
rescue LoadError
end
@ -19,90 +19,90 @@ puts 'Testing: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input,
'Expected:', expected_output,
'Recieved:', actual_output
].join("\n")
if $CHECK_PARSER_ERRORS
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal parser.errors.length, expected_errors.length, [
'Expected errors:', expected_errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")
end
end
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input,
'Expected:', expected_output,
'Recieved:', actual_output
].join("\n")
if $CHECK_PARSER_ERRORS
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal parser.errors.length, expected_errors.length, [
'Expected errors:', expected_errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")
end
end
end
end
end
end

View file

@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
def test_should_handle_astral_plane_characters
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
end
end

View file

@ -4,75 +4,63 @@ require 'html5lib/tokenizer'
require 'tokenizer_test_parser'
begin
require 'jsonx'
rescue LoadError
class JSON
def self.parse json
json.gsub! /"\s*:/, '"=>'
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
eval json
end
end
end
class Html5TokenizerTestCase < Test::Unit::TestCase
def type_of?(token_name, token)
token != 'ParseError' and token_name == token.first
def type_of?(token_name, token)
token != 'ParseError' and token_name == token.first
end
def convert_attribute_arrays_to_hashes(tokens)
tokens.inject([]) do |tokens, token|
token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
tokens << token
end
def convert_attribute_arrays_to_hashes(tokens)
tokens.inject([]) do |tokens, token|
token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
tokens << token
end
end
def concatenate_consecutive_characters(tokens)
tokens.inject([]) do |tokens, token|
if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
tokens.last[1] = tokens.last[1] + token[1]
next tokens
end
tokens << token
end
def concatenate_consecutive_characters(tokens)
tokens.inject([]) do |tokens, token|
if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
tokens.last[1] = tokens.last[1] + token[1]
next tokens
end
tokens << token
end
end
def tokenizer_test(data)
(data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
message = [
'Description:', data['description'],
'Input:', data['input'],
'Content Model Flag:', content_model_flag ] * "\n"
assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym
tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
tokens = TokenizerTestParser.new(tokenizer).parse
actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
expected = concatenate_consecutive_characters(data['output'])
assert_equal expected, actual, message
end
end
end
html5lib_test_files('tokenizer').each do |test_file|
test_name = File.basename(test_file).sub('.test', '')
tests = JSON.parse(File.read(test_file))['tests']
tests.each_with_index do |data, index|
define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
end
end
def tokenizer_test(data)
(data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
message = [
'Description:', data['description'],
'Input:', data['input'],
'Content Model Flag:', content_model_flag ] * "\n"
assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym
tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
tokens = TokenizerTestParser.new(tokenizer).parse
actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
expected = concatenate_consecutive_characters(data['output'])
assert_equal expected, actual, message
end
end
end
html5lib_test_files('tokenizer').each do |test_file|
test_name = File.basename(test_file).sub('.test', '')
tests = JSON.parse(File.read(test_file))['tests']
tests.each_with_index do |data, index|
define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
end
end
end

View file

@ -1,62 +1,62 @@
require 'html5lib/constants'
class TokenizerTestParser
def initialize(tokenizer)
@tokenizer = tokenizer
def initialize(tokenizer)
@tokenizer = tokenizer
end
def parse
@outputTokens = []
debug = nil
for token in @tokenizer
debug = token.inspect if token[:type] == :ParseError
send ('process' + token[:type].to_s), token
end
def parse
@outputTokens = []
return @outputTokens
end
debug = nil
for token in @tokenizer
debug = token.inspect if token[:type] == :ParseError
send ('process' + token[:type].to_s), token
end
def processDoctype(token)
@outputTokens.push(["DOCTYPE", token[:name], token[:data]])
end
return @outputTokens
def processStartTag(token)
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processEmptyTag(token)
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
@outputTokens.push("ParseError")
end
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processDoctype(token)
@outputTokens.push(["DOCTYPE", token[:name], token[:data]])
def processEndTag(token)
if token[:data].length > 0
self.processParseError(token)
end
@outputTokens.push(["EndTag", token[:name]])
end
def processStartTag(token)
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processComment(token)
@outputTokens.push(["Comment", token[:data]])
end
def processEmptyTag(token)
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
@outputTokens.push("ParseError")
end
@outputTokens.push(["StartTag", token[:name], token[:data]])
end
def processCharacters(token)
@outputTokens.push(["Character", token[:data]])
end
def processEndTag(token)
if token[:data].length > 0
self.processParseError(token)
end
@outputTokens.push(["EndTag", token[:name]])
end
alias processSpaceCharacters processCharacters
def processComment(token)
@outputTokens.push(["Comment", token[:data]])
end
def processCharacters(token)
@outputTokens.push(["Character", token[:data]])
end
def processCharacters(token)
@outputTokens.push(["Character", token[:data]])
end
def processEOF(token)
end
alias processSpaceCharacters processCharacters
def processCharacters(token)
@outputTokens.push(["Character", token[:data]])
end
def processEOF(token)
end
def processParseError(token)
@outputTokens.push("ParseError")
end
def processParseError(token)
@outputTokens.push("ParseError")
end
end