Merged with latest trunk.

This commit is contained in:
Jason Blevins 2007-06-04 22:47:59 -04:00
commit aadfb55342
38 changed files with 4839 additions and 4849 deletions

View file

@ -8,7 +8,19 @@ OPTIONS = {
:ip => "0.0.0.0",
:environment => "production",
:server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"),
:server_type => WEBrick::SimpleServer
:server_type => WEBrick::SimpleServer,
:mime_types => WEBrick::HTTPUtils::DefaultMimeTypes.merge({
'avi' => 'video/x-msvideo',
'gz' => 'application/x-gzip',
'js' => 'application/x-javascript',
'nb' => 'application/mathematica',
'pdf' => 'application/pdf',
'svg' => 'application/svg+xml',
'tar' => 'application/x-tar',
'tex' => 'application/x-tex',
'xml' => 'application/xml',
'xslt' => 'application/xslt+xml'
})
}
ARGV.options do |opts|

View file

@ -1,15 +1,15 @@
module HTML5lib
class EOF < Exception; end
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
]
SCOPING_ELEMENTS = %w[
SCOPING_ELEMENTS = %w[
button
caption
html
@ -18,9 +18,9 @@ SCOPING_ELEMENTS = %w[
table
td
th
]
]
FORMATTING_ELEMENTS = %w[
FORMATTING_ELEMENTS = %w[
a
b
big
@ -34,9 +34,9 @@ FORMATTING_ELEMENTS = %w[
strong
tt
u
]
]
SPECIAL_ELEMENTS = %w[
SPECIAL_ELEMENTS = %w[
address
area
base
@ -98,43 +98,43 @@ SPECIAL_ELEMENTS = %w[
tr
ul
wbr
]
]
SPACE_CHARACTERS = %W[
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
]
TABLE_INSERT_MODE_ELEMENTS = %w[
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
@ -146,10 +146,10 @@ VOID_ELEMENTS = %w[
area
col
input
]
]
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
@ -182,17 +182,17 @@ ENTITIES_WINDOWS1252 = [
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
]
private
private
def self.U n
[n].pack('U')
end
public
public
ENTITIES = {
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
@ -452,9 +452,9 @@ ENTITIES = {
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
}
ENCODINGS = %w[
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
@ -671,6 +671,6 @@ ENCODINGS = %w[
windows-1258
tis-620
hz-gb-2312
]
]
end

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase that is active once the body has been closed.
  #
  # Comments are attached to the first element on the open-elements stack.
  # Non-space characters, start tags and end tags other than </html> are
  # reported as parse errors and then reprocessed by the inBody phase.
  class AfterBodyPhase < Phase

    handle_end 'html'

    # Inserts the comment under the first entry of the open-elements stack
    # (the <html> element) instead of under whatever is currently open.
    def processComment(data)
      html_element = @tree.openElements[0]
      @tree.insertComment(data, html_element)
    end

    # Reports a parse error, switches to the inBody phase and lets that phase
    # handle the characters.
    def processCharacters(data)
      message = _('Unexpected non-space characters in the after body phase.')
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processCharacters(data)
    end

    # Reports a parse error, switches to the inBody phase and lets that phase
    # handle the start tag.
    def processStartTag(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the after body phase.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processStartTag(name, attributes)
    end

    # Handles </html>: a parse error when parsing innerHTML, otherwise the
    # current phase is remembered as lastPhase and the parser moves on to the
    # trailingEnd phase.
    def endTagHtml(name)
      return @parser.parseError if @parser.innerHTML

      # XXX: It may be preferable to record the inBody phase as lastPhase
      # rather than the current phase, so that content following </html> does
      # not trigger additional parse errors (try "<!doctype html>X</html>X").
      # For now the current phase is what gets recorded.
      @parser.lastPhase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Reports a parse error, switches to the inBody phase and lets that phase
    # handle the end tag.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the after body phase.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -0,0 +1,34 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase that is active after the outermost frameset has been closed.
  # Almost everything is a parse error here and is ignored.
  class AfterFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    handle_start 'html', 'noframes'

    handle_end 'html'

    # Characters are reported as a parse error and dropped.
    def processCharacters(data)
      message = _('Unexpected non-space characters in the after frameset phase. Ignored.')
      @parser.parseError(message)
    end

    # <noframes> is delegated to the inBody phase without changing phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported as a parse error and dropped.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag (#{name}) in the after frameset phase. Ignored.")
      @parser.parseError(message)
    end

    # </html> records the current phase as lastPhase and moves the parser to
    # the trailingEnd phase.
    def endTagHtml(name)
      @parser.lastPhase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Any other end tag is reported as a parse error and dropped.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}) in the after frameset phase. Ignored.")
      @parser.parseError(message)
    end

  end
end

View file

@ -0,0 +1,50 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase between the end of the head and the start of the body or
  # frameset. Anything that is not <body>, <frameset> or a head-only element
  # implies a <body> and is reprocessed by the phase that follows.
  class AfterHeadPhase < Phase

    handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

    # End of file implies <body>; the new phase then sees the EOF.
    def processEOF
      anythingElse
      @parser.phase.processEOF
    end

    # Characters imply <body>; the new phase then receives them.
    def processCharacters(data)
      anythingElse
      @parser.phase.processCharacters(data)
    end

    # Inserts the body element and moves to the inBody phase.
    def startTagBody(name, attributes)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inBody]
    end

    # Inserts the frameset element and moves to the inFrameset phase.
    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inFrameset]
    end

    # Head-only elements found here are a parse error; the parser switches
    # back to the inHead phase and reprocesses the tag there.
    def startTagFromHead(name, attributes)
      message = _("Unexpected start tag (#{name}) that can be in head. Moved.")
      @parser.parseError(message)
      @parser.phase = @parser.phases[:inHead]
      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag implies <body> and is reprocessed.
    def startTagOther(name, attributes)
      anythingElse
      @parser.phase.processStartTag(name, attributes)
    end

    # Every end tag implies <body> and is reprocessed.
    def processEndTag(name)
      anythingElse
      @parser.phase.processEndTag(name)
    end

    # Inserts an attribute-less body element and moves to the inBody phase.
    def anythingElse
      @tree.insertElement('body', {})
      @parser.phase = @parser.phases[:inBody]
    end

  end
end

View file

@ -0,0 +1,41 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase that runs before a head element exists. Every token other
  # than <head> itself (and ignorable end tags) implies an empty <head> and is
  # then reprocessed by whichever phase startTagHead switched to.
  class BeforeHeadPhase < Phase

    handle_start 'html', 'head'

    handle_end 'html'

    # End of file implies <head>; the new phase then sees the EOF.
    def processEOF
      startTagHead('head', {})
      @parser.phase.processEOF
    end

    # Characters imply <head>; the new phase then receives them.
    def processCharacters(data)
      startTagHead('head', {})
      @parser.phase.processCharacters(data)
    end

    # Inserts the head element, records it as the tree's head pointer and
    # moves to the inHead phase.
    def startTagHead(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.headPointer = @tree.openElements[-1]
      @parser.phase = @parser.phases[:inHead]
    end

    # Any other start tag implies <head> and is reprocessed.
    def startTagOther(name, attributes)
      startTagHead('head', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # </html> implies <head> and is reprocessed.
    def endTagHtml(name)
      startTagHead('head', {})
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is a parse error and is dropped.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}) after the (implied) root element.")
      @parser.parseError(message)
    end

  end
end

View file

@ -0,0 +1,548 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body

# Start tag dispatch table. handle_start is defined outside this file;
# judging by the methods defined in this class, a bare name appears to map to
# startTag<Name>, an array to the method named after its joined members, and
# a hash entry to startTag<Value> (confirm against Phase).
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'

# End tag dispatch table, same conventions as above.
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
# Heading end tags need their own handler: endTagHeading closes whichever
# heading is open, which endTagOther does not do.
handle_end HEADING_ELEMENTS => 'Heading'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
# Sets up the phase and clears the flag that requests special whitespace
# handling at the start of a <pre> element.
def initialize(parser, tree)
  super
  @processSpaceCharactersPre = false
end
# Whitespace handler used right after a <pre> start tag. Clears the pending
# flag, drops one leading newline when the current node is a <pre> without
# content, and inserts whatever text is left.
def processSpaceCharactersPre(data)
  @processSpaceCharactersPre = false
  if data.length > 0 && data[0] == ?\n
    current = @tree.openElements[-1]
    data = data[1..-1] if current.name == 'pre' && !current.hasContent
  end
  @tree.insertText(data) unless data.length == 0
end
# Routes whitespace to the <pre>-aware handler while the pending flag is set,
# and to the superclass implementation otherwise.
def processSpaceCharacters(data)
  return processSpaceCharactersPre(data) if @processSpaceCharactersPre
  super(data)
end
# Reconstructs the active formatting elements and inserts the text.
#
# XXX The specification currently asks for the reconstruction on every
# character, but that does not match real-world behaviour, so it is not done
# for space characters (those go through processSpaceCharacters).
def processCharacters(data)
  @tree.reconstructActiveFormattingElements
  @tree.insertText(data)
end
# <script> and <style> are handled by the inHead phase; the current phase is
# left unchanged.
def startTagScriptStyle(name, attributes)
  in_head = @parser.phases[:inHead]
  in_head.processStartTag(name, attributes)
end
# Head-only elements found in the body are a parse error; the tag is then
# handed to the inHead phase without changing the current phase.
def startTagFromHead(name, attributes)
  message = _("Unexpected start tag (#{name}) that belongs in the head. Moved.")
  @parser.parseError(message)
  @parser.phases[:inHead].processStartTag(name, attributes)
end
# A second <body> is always a parse error. When the second open element is
# the body, attributes it does not already carry are copied onto it;
# otherwise the parser is expected to be in innerHTML mode.
def startTagBody(name, attributes)
  @parser.parseError(_('Unexpected start tag (body).'))
  if @tree.openElements.length != 1 && @tree.openElements[1].name == 'body'
    attributes.each do |key, val|
      next if @tree.openElements[1].attributes.has_key?(key)
      @tree.openElements[1].attributes[key] = val
    end
  else
    assert @parser.innerHTML
  end
end
# Block-level start tags: close an open <p> that is in scope, insert the
# element, and arm the <pre> whitespace handling when the element is a <pre>.
def startTagCloseP(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  @processSpaceCharactersPre = true if name == 'pre'
end
# Handles <form>. A form start tag while the tree already has a form pointer
# is a parse error and is ignored. Otherwise an open <p> in scope is closed,
# the element is inserted and it becomes the tree's form pointer.
def startTagForm(name, attributes)
  if @tree.formPointer
    # Wrapped in _() like the other parse-error messages in this class.
    @parser.parseError(_('Unexpected start tag (form). Ignored.'))
  else
    endTagP('p') if in_scope?('p')
    @tree.insertElement(name, attributes)
    @tree.formPointer = @tree.openElements[-1]
  end
end
# Handles <li>, <dd> and <dt>. After closing an open <p> in scope, the stack
# of open elements is walked from the top: if a matching list item is found
# before a special/scoping element (other than address or div), everything
# down to and including that item is popped. The new element is always
# inserted afterwards.
def startTagListItem(name, attributes)
  endTagP('p') if in_scope?('p')

  closers = case name
            when 'li' then ['li']
            when 'dd', 'dt' then ['dd', 'dt']
            end
  barriers = SPECIAL_ELEMENTS + SCOPING_ELEMENTS

  @tree.openElements.reverse.each_with_index do |node, depth|
    if closers.include?(node.name)
      (depth + 1).times { @tree.openElements.pop }
      break
    end

    # Phrasing elements are all non special, non scoping, non formatting
    # elements; anything else (except address and div) stops the search.
    break if barriers.include?(node.name) && !['address', 'div'].include?(node.name)
  end

  @tree.insertElement(name, attributes)
end
# Handles <plaintext>: close an open <p> in scope, insert the element and
# switch the tokenizer's content model flag to :PLAINTEXT.
def startTagPlaintext(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  @parser.tokenizer.contentModelFlag = :PLAINTEXT
end
# Handles <h1>..<h6>. An open <p> in scope is closed first. If any heading is
# already in scope this is a parse error and open elements are removed up to
# a heading element. The new heading is always inserted.
def startTagHeading(name, attributes)
  endTagP('p') if in_scope?('p')

  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    @parser.parseError(_("Unexpected start tag (#{name})."))
    remove_open_elements_until { |node| HEADING_ELEMENTS.include?(node.name) }
  end

  @tree.insertElement(name, attributes)
end
# Handles <a>. If an <a> is already among the active formatting elements this
# is a parse error: an </a> is processed, and the stale element is removed
# from both the open-elements stack and the active formatting list if it is
# still present. The new <a> is then added as a formatting element.
def startTagA(name, attributes)
  afeAElement = @tree.elementInActiveFormattingElements('a')
  if afeAElement
    @parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
    endTagFormatting('a')
    if @tree.openElements.include?(afeAElement)
      @tree.openElements.delete(afeAElement)
    end
    if @tree.activeFormattingElements.include?(afeAElement)
      @tree.activeFormattingElements.delete(afeAElement)
    end
  end
  @tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
# Formatting start tags (b, i, em, ...): reconstruct the active formatting
# elements, then insert the element and record it as a formatting element.
def startTagFormatting(name, attributes)
  @tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
# Handles <button>. A button already in scope is a parse error: it is closed
# via processEndTag and the start tag is reprocessed by the current phase.
# Otherwise the element is inserted and a Marker is pushed onto the active
# formatting elements.
def startTagButton(name, attributes)
  if in_scope?('button')
    @parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
    processEndTag('button')
    return @parser.phase.processStartTag(name, attributes)
  end

  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)
  @tree.activeFormattingElements.push(Marker)
end
# Handles <marquee> and <object>: reconstruct the active formatting elements,
# insert the element and push a Marker onto the active formatting list.
def startTagMarqueeObject(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)
  formatting = @tree.activeFormattingElements
  formatting.push(Marker)
end
# Handles <xmp>: reconstruct the active formatting elements, insert the
# element and switch the tokenizer's content model flag to :CDATA.
def startTagXmp(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)
  tokenizer = @parser.tokenizer
  tokenizer.contentModelFlag = :CDATA
end
# Handles <table>: process a </p> if a <p> is in scope, insert the element
# and move to the inTable phase.
def startTagTable(name, attributes)
  if in_scope?('p')
    processEndTag('p')
  end
  @tree.insertElement(name, attributes)
  @parser.phase = @parser.phases[:inTable]
end
# Void elements that take part in formatting reconstruction (area, br, img,
# ...): reconstruct the active formatting elements, insert the element and
# immediately pop it off the open-elements stack again.
def startTagVoidFormatting(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)
  open_elements = @tree.openElements
  open_elements.pop
end
# Handles <hr>: close an open <p> in scope, insert the element and pop it
# straight off the open-elements stack.
def startTagHr(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  open_elements = @tree.openElements
  open_elements.pop
end
# <image> is not a real element: report a parse error and process the tag as
# if it had been <img>.
def startTagImage(name, attributes)
  message = _('Unexpected start tag (image). Treated as img.')
  @parser.parseError(message)
  processStartTag('img', attributes)
end
# Handles <input>: reconstruct the active formatting elements, insert the
# element and pop it off the open-elements stack. Association with the
# current form is not implemented yet.
def startTagInput(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)

  if @tree.formPointer
    # XXX Not exactly sure what to do here
    # @tree.openElements[-1].form = @tree.formPointer
  end

  @tree.openElements.pop
end
# Handles the obsolete <isindex> element. It is always a parse error. When no
# form is currently pointed to by the tree, it is expanded into the
# equivalent of
#
#   <form><hr><p><label>prompt text<input name="isindex" ...></label></p><hr></form>
#
# by feeding synthetic tokens back through this phase.
#
# Note that the 'name' key of the +attributes+ argument is overwritten in
# place before the attributes are passed to the synthetic <input>.
def startTagIsindex(name, attributes)
  # Wrapped in _() like the other parse-error messages in this class.
  @parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
  return if @tree.formPointer

  processStartTag('form', {})
  processStartTag('hr', {})
  processStartTag('p', {})
  processStartTag('label', {})
  # XXX Localization ...
  processCharacters('This is a searchable index. Insert your search keywords here:')
  attributes['name'] = 'isindex'
  processStartTag('input', attributes)
  processEndTag('label')
  processEndTag('p')
  processStartTag('hr', {})
  processEndTag('form')
end
# Handles <textarea>: insert the element and switch the tokenizer's content
# model flag to :RCDATA.
# XXX Form element pointer checking is still missing here as well.
def startTagTextarea(name, attributes)
  @tree.insertElement(name, attributes)
  tokenizer = @parser.tokenizer
  tokenizer.contentModelFlag = :RCDATA
end
# Handles iframe, noembed, noframes and noscript (the latter if scripting is
# enabled): insert the element and switch the tokenizer's content model flag
# to :CDATA.
def startTagCdata(name, attributes)
  @tree.insertElement(name, attributes)
  tokenizer = @parser.tokenizer
  tokenizer.contentModelFlag = :CDATA
end
# Handles <select>: reconstruct the active formatting elements, insert the
# element and move to the inSelect phase.
def startTagSelect(name, attributes)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, attributes)
  in_select = @parser.phases[:inSelect]
  @parser.phase = in_select
end
# Start tags that only make sense as children of other elements handled in a
# different insertion mode ("caption", "col", "colgroup", "frame",
# "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th",
# "thead", "tr", "noscript"). Here they are a parse error and are ignored.
def startTagMisplaced(name, attributes)
  message = _("Unexpected start tag (#{name}). Ignored.")
  @parser.parseError(message)
end
# Handles the new HTML5 elements ("event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"). Their
# behaviour is not defined yet, so a warning is written to standard error and
# the tag is treated like any other start tag.
def startTagNew(name, attributes)
  # The warning is emitted the same way endTagNew does it; the previous
  # sys.stderr.write call was a Python leftover that raised NameError.
  STDERR.puts "Warning: Undefined behaviour for start tag #{name}"
  startTagOther(name, attributes)
  #raise NotImplementedError
end
# Default start tag handling: reconstruct the active formatting elements and
# insert the element.
def startTagOther(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
end
# Handles </p>. Implied end tags (other than p) are generated when a <p> is
# in scope; a parse error is reported when the current node is not a <p>; and
# open elements are popped for as long as a <p> remains in scope.
def endTagP(name)
  @tree.generateImpliedEndTags('p') if in_scope?('p')
  unless @tree.openElements[-1].name == 'p'
    # Wrapped in _() like the other parse-error messages in this class.
    @parser.parseError(_('Unexpected end tag (p).'))
  end
  @tree.openElements.pop while in_scope?('p')
end
# Handles </body>. When the second open element is not a body element (the
# innerHTML case, including a stack that holds only the root element) this is
# a parse error and nothing else happens. Otherwise a parse error is reported
# if the current node is not the body, and the parser moves to the afterBody
# phase.
#
# XXX Need to take open <p> tags into account here. We shouldn't imply </p>
# but we should not throw a parse error either. Specification is likely to be
# updated.
def endTagBody(name)
  body = @tree.openElements[1]
  # Guard against a missing second element before asking for its name.
  unless body && body.name == 'body'
    # innerHTML case
    @parser.parseError
    return
  end
  unless @tree.openElements[-1].name == 'body'
    @parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
  end
  @parser.phase = @parser.phases[:afterBody]
end
# Handles </html> by first acting as if </body> had been seen and then,
# unless parsing innerHTML, reprocessing the end tag in the resulting phase.
def endTagHtml(name)
  endTagBody(name)
  unless @parser.innerHTML
    @parser.phase.processEndTag(name)
  end
end
# Handles end tags of block-level elements. The <pre> whitespace flag is
# cleared for </pre>. When the element is in scope, implied end tags are
# generated first and, after the check below, open elements are removed up to
# it. A parse error is reported whenever the current node is not the element
# being closed.
def endTagBlock(name)
  # Put us back in the right whitespace handling mode
  @processSpaceCharactersPre = false if name == 'pre'

  @tree.generateImpliedEndTags if in_scope?(name)
  unless @tree.openElements[-1].name == name
    # Wrapped in _() like the other parse-error messages in this class.
    @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
  end
  remove_open_elements_until(name) if in_scope?(name)
end
# Handles </form>: close it like any other block element, then clear the
# tree's form pointer.
def endTagForm(name)
  endTagBlock(name)
  @tree.formPointer = nil
end
# Handles </li>, </dd> and </dt>. Only acts when the element is in scope:
# implied end tags (other than the element itself) are generated, a parse
# error is reported if the current node is then not the element, and open
# elements are removed up to it.
# AT Could merge this with the Block case
def endTagListItem(name)
  if in_scope?(name)
    @tree.generateImpliedEndTags(name)
    unless @tree.openElements[-1].name == name
      # Wrapped in _() like the other parse-error messages in this class.
      @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
    end
  end
  remove_open_elements_until(name) if in_scope?(name)
end
# Handles </h1>..</h6>. If any heading element is in scope, implied end tags
# are generated. A parse error is reported when the current node is not the
# heading being closed. Finally, if a heading is (still) in scope, open
# elements are removed up to a heading element.
def endTagHeading(name)
  @tree.generateImpliedEndTags if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }

  unless @tree.openElements[-1].name == name
    # Wrapped in _() like the other parse-error messages in this class.
    @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
  end

  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    # The block parameter gets its own name so it cannot clobber or shadow
    # the heading name being iterated.
    remove_open_elements_until { |node| HEADING_ELEMENTS.include?(node.name) }
  end
end
# The much-feared adoption agency algorithm
#
# Handles end tags of formatting elements by repeatedly looking up the
# element named +name+ in the tree's active formatting elements and
# re-parenting nodes around the "furthest block" found above it on the stack
# of open elements. The method only ever leaves through one of the +return+
# statements in step 1 or step 3; otherwise the outer loop runs again.
#
# name:: tag name of the formatting element being closed.
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while true
# Step 1 paragraph 1
# No such active formatting element, or it is on the stack of open elements
# but not in scope: parse error, nothing else to do.
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
# The element is no longer on the stack of open elements: parse error, and
# it is dropped from the active formatting elements.
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
# The element is open but is not the current node: parse error, but the
# algorithm carries on.
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
# The furthest block is the first special or scoping element found when
# scanning the open elements from the formatting element upwards.
afeIndex = @tree.openElements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
end
end
# Step 3
# Without a furthest block the formatting element is simply closed: open
# elements are removed up to it and it leaves the active formatting list.
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
# The common ancestor is the open element just before the formatting element.
# NOTE(review): if afeIndex were 0 this would index from the end of the
# stack; presumably the root element always sits at index 0 - confirm.
commonAncestor = @tree.openElements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = @tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
# Open elements that are not active formatting elements are removed from
# the stack while walking further down.
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
# Step 7.4
if lastNode == furthestBlock
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = @tree.activeFormattingElements.index(node) + 1
end
# Step 7.5
# NOTE(review): cite is assigned here but never read in this method.
cite = node.parent
if node.hasContent
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
node = clone
end
# Step 7.6
# Remove lastNode from its parents, if any
lastNode.parent.removeChild(lastNode) if lastNode.parent
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
end
# Step 8
lastNode.parent.removeChild(lastNode) if lastNode.parent
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
# The clone takes the formatting element's place at the bookmark, clamped to
# the length of the list after the deletion.
@tree.activeFormattingElements.delete(afeElement)
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
# On the stack of open elements the clone goes right after the furthest block.
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
end
end
# Handles </button>, </marquee> and </object>. Implied end tags are generated
# when the element is in scope; a parse error is reported when the current
# node is not the element; and, if the element is in scope, open elements are
# removed up to it and the active formatting elements are cleared.
def endTagButtonMarqueeObject(name)
  @tree.generateImpliedEndTags if in_scope?(name)

  if @tree.openElements[-1].name != name
    @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
  end

  return unless in_scope?(name)
  remove_open_elements_until(name)
  @tree.clearActiveFormattingElements
end
# End tags of elements whose end tags are handled in other insertion modes:
# a parse error, and the tag is ignored.
def endTagMisplaced(name)
  message = _("Unexpected end tag (#{name}). Ignored.")
  @parser.parseError(message)
end
# End tags of elements that have no end tag at all: a parse error, and the
# tag is ignored.
def endTagNone(name)
  message = _("This tag (#{name}) has no end tag")
  @parser.parseError(message)
end
# End tags of the raw-text style elements (noframes, noscript, noembed,
# textarea, xmp, iframe): pop the current node when it matches, otherwise
# report a parse error and ignore the tag.
def endTagCdataTextAreaXmp(name)
  if @tree.openElements[-1].name != name
    @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
  else
    @tree.openElements.pop
  end
end
# End tags of the new HTML5 elements ("event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"). Their
# behaviour is not defined yet: a warning goes to standard error and the tag
# is handled like any other end tag.
def endTagNew(name)
  warning = "Warning: Undefined behaviour for end tag #{name}"
  STDERR.puts warning
  endTagOther(name)
  #raise NotImplementedError
end
# Default end tag handling. The open elements are scanned from the top of the
# stack: on the first element with a matching name, implied end tags are
# generated, a parse error is reported if the current node does not match,
# and open elements are removed up to that element. Hitting a special or
# scoping element first is a parse error and ends the scan.
# XXX This logic should be moved into the treebuilder
def endTagOther(name)
  @tree.openElements.reverse.each do |node|
    if node.name == name
      @tree.generateImpliedEndTags
      if @tree.openElements[-1].name != name
        @parser.parseError(_("Unexpected end tag (#{name})."))
      end
      remove_open_elements_until { |candidate| candidate == node }
      break
    elsif (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
      break
    end
  end
end
protected
# Inserts the element and appends the resulting current node to the list of
# active formatting elements.
def addFormattingElement(name, attributes)
  @tree.insertElement(name, attributes)
  inserted = @tree.openElements[-1]
  @tree.activeFormattingElements.push(inserted)
end
end
end

View file

@ -0,0 +1,68 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while a table <caption> is open. Content is mostly
  # delegated to the inBody phase; table-structure tags close the caption
  # first and are then reprocessed.
  class InCaptionPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

    handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableElement'

    handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # True when a </caption> would be ignored, i.e. no caption is in scope
    # according to in_scope? called with its second argument set to true.
    def ignoreEndTagCaption
      !in_scope?('caption', true)
    end

    # Characters are handled by the inBody phase.
    def processCharacters(data)
      in_body = @parser.phases[:inBody]
      in_body.processCharacters(data)
    end

    # Table-structure start tags are a parse error: the caption is closed and
    # the tag is reprocessed unless the implied </caption> was ignored.
    def startTagTableElement(name, attributes)
      @parser.parseError
      #XXX Have to duplicate logic here to find out if the tag is ignored
      caption_ignored = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      @parser.phase.processStartTag(name, attributes) unless caption_ignored
    end

    # Any other start tag is handled by the inBody phase.
    def startTagOther(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Handles </caption>. In the ignored (innerHTML) case only a parse error
    # is reported. Otherwise implied end tags are generated, a parse error is
    # reported if the current node is not the caption, open elements are
    # removed up to the caption, the active formatting elements are cleared
    # and the parser returns to the inTable phase.
    def endTagCaption(name)
      if ignoreEndTagCaption
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      # AT this code is quite similar to endTagTable in "InTable"
      @tree.generateImpliedEndTags
      if @tree.openElements[-1].name != 'caption'
        @parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
      end
      remove_open_elements_until('caption')
      @tree.clearActiveFormattingElements
      @parser.phase = @parser.phases[:inTable]
    end

    # </table> is a parse error: the caption is closed and the end tag is
    # reprocessed unless the implied </caption> was ignored.
    def endTagTable(name)
      @parser.parseError
      caption_ignored = ignoreEndTagCaption
      @parser.phase.processEndTag('caption')
      @parser.phase.processEndTag(name) unless caption_ignored
    end

    # End tags that cannot close anything here are a parse error and dropped.
    def endTagIgnore(name)
      message = _("Unexpected end tag (#{name}). Ignored.")
      @parser.parseError(message)
    end

    # Any other end tag is handled by the inBody phase.
    def endTagOther(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

  end
end

View file

@ -0,0 +1,78 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while a table cell (<td> or <th>) is open. Cell content
  # is delegated to the inBody phase; table-structure tags close the cell
  # first and are then reprocessed.
  class InCellPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

    handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'

    handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'
    handle_end %w( table tbody tfoot thead tr ) => 'Imply'

    # Characters are handled by the inBody phase.
    def processCharacters(data)
      @parser.phases[:inBody].processCharacters(data)
    end

    # Table-structure start tags close the open cell and are reprocessed by
    # the resulting phase. With no cell in scope (innerHTML case) they are a
    # parse error.
    def startTagTableOther(name, attributes)
      if in_scope?('td', true) or in_scope?('th', true)
        closeCell
        @parser.phase.processStartTag(name, attributes)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Any other start tag is handled by the inBody phase.
    def startTagOther(name, attributes)
      @parser.phases[:inBody].processStartTag(name, attributes)
    end

    # Handles </td> and </th>. When the cell is in scope, implied end tags
    # (other than the cell itself) are generated; if the current node is then
    # not the cell this is a parse error and open elements are removed up to
    # it, otherwise the cell is popped. The active formatting elements are
    # cleared and the parser returns to the inRow phase. When the cell is not
    # in scope the tag is a parse error and is ignored.
    def endTagTableCell(name)
      if in_scope?(name, true)
        @tree.generateImpliedEndTags(name)
        if @tree.openElements[-1].name != name
          # Wrapped in _() like every other parse-error message in this class.
          @parser.parseError(_("Got table cell end tag (#{name}) while required end tags are missing."))
          remove_open_elements_until(name)
        else
          @tree.openElements.pop
        end
        @tree.clearActiveFormattingElements
        @parser.phase = @parser.phases[:inRow]
      else
        @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
      end
    end

    # End tags that cannot close anything here are a parse error and dropped.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # End tags of enclosing table structure close the open cell and are then
    # reprocessed, provided the named element is in scope.
    def endTagImply(name)
      if in_scope?(name, true)
        closeCell
        @parser.phase.processEndTag(name)
      else
        # sometimes innerHTML case
        @parser.parseError
      end
    end

    # Any other end tag is handled by the inBody phase.
    def endTagOther(name)
      @parser.phases[:inBody].processEndTag(name)
    end

    protected

    # Closes whichever cell type is in scope, preferring td over th.
    def closeCell
      if in_scope?('td', true)
        endTagTableCell('td')
      elsif in_scope?('th', true)
        endTagTableCell('th')
      end
    end

  end
end

View file

@ -0,0 +1,55 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while a <colgroup> is open. Only <col> is expected;
  # anything else closes the colgroup and is reprocessed.
  class InColumnGroupPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    handle_start 'html', 'col'

    handle_end 'colgroup', 'col'

    # True when a </colgroup> would be ignored: the current node is the html
    # element.
    def ignoreEndTagColgroup
      current = @tree.openElements[-1]
      current.name == 'html'
    end

    # Characters close the colgroup and are reprocessed unless the implied
    # </colgroup> was ignored.
    def processCharacters(data)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup("colgroup")
      @parser.phase.processCharacters(data) unless colgroup_ignored
    end

    # <col> is inserted and immediately popped again.
    def startTagCol(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # Any other start tag closes the colgroup and is reprocessed unless the
    # implied </colgroup> was ignored.
    def startTagOther(name, attributes)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      @parser.phase.processStartTag(name, attributes) unless colgroup_ignored
    end

    # Handles </colgroup>: a parse error in the ignored (innerHTML) case,
    # otherwise the colgroup is popped and the parser returns to inTable.
    def endTagColgroup(name)
      if ignoreEndTagColgroup
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      @tree.openElements.pop
      @parser.phase = @parser.phases[:inTable]
    end

    # </col> is always a parse error.
    def endTagCol(name)
      message = _('Unexpected end tag (col). col has no end tag.')
      @parser.parseError(message)
    end

    # Any other end tag closes the colgroup and is reprocessed unless the
    # implied </colgroup> was ignored.
    def endTagOther(name)
      colgroup_ignored = ignoreEndTagColgroup
      endTagColgroup('colgroup')
      @parser.phase.processEndTag(name) unless colgroup_ignored
    end

  end
end

View file

@ -0,0 +1,57 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while a <frameset> is open.
  class InFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    handle_start 'html', 'frameset', 'frame', 'noframes'

    handle_end 'frameset', 'noframes'

    # Characters are a parse error and are dropped.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      @parser.parseError(message)
    end

    # Nested <frameset> elements are simply inserted.
    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
    end

    # <frame> is inserted and immediately popped again.
    def startTagFrame(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # <noframes> is handled by the inBody phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is a parse error and is dropped.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
      @parser.parseError(message)
    end

    # Handles </frameset>. With the html element as current node (innerHTML
    # case) this is a parse error; otherwise the current node is popped.
    # Afterwards, when not parsing innerHTML and the current node is no
    # longer a frameset, the parser moves to the afterFrameset phase.
    def endTagFrameset(name)
      if @tree.openElements[-1].name == 'html'
        # innerHTML case
        @parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
      else
        @tree.openElements.pop
      end

      if !@parser.innerHTML && @tree.openElements[-1].name != 'frameset'
        # Not in innerHTML mode and the current node is not a "frameset"
        # element (anymore), so switch.
        @parser.phase = @parser.phases[:afterFrameset]
      end
    end

    # </noframes> is handled by the inBody phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Any other end tag is a parse error and is dropped.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
      @parser.parseError(message)
    end

  end
end

View file

@ -0,0 +1,120 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while the <head> is being built.
  class InHeadPhase < Phase

    handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )

    handle_end 'head', 'html', %w( title style script )

    # End of file: an open title/style/script is a parse error and is popped;
    # the head is then left and the following phase sees the EOF.
    def processEOF
      name = @tree.openElements[-1].name
      if %w( title style script ).include?(name)
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
        @tree.openElements.pop
      end
      anythingElse
      @parser.phase.processEOF
    end

    # Characters inside title/style/script are inserted as text; anywhere
    # else they end the head and are reprocessed.
    def processCharacters(data)
      current = @tree.openElements[-1].name
      unless %w( title style script ).include?(current)
        anythingElse
        return @parser.phase.processCharacters(data)
      end
      @tree.insertText(data)
    end

    # A second <head> is a parse error and is ignored.
    def startTagHead(name, attributes)
      message = _('Unexpected start tag head in existing head. Ignored')
      @parser.parseError(message)
    end

    # <title>: created, appended to the head, pushed onto the open elements,
    # and the tokenizer's content model flag is switched to :RCDATA.
    def startTagTitle(name, attributes)
      title = @tree.createElement(name, attributes)
      appendToHead(title)
      @tree.openElements.push(title)
      @parser.tokenizer.contentModelFlag = :RCDATA
    end

    # <style>: appended to the head when a head pointer exists and this phase
    # is the current one, otherwise to the current node. The element is
    # pushed onto the open elements and the tokenizer switches to :CDATA.
    def startTagStyle(name, attributes)
      style = @tree.createElement(name, attributes)
      goes_in_head = @tree.headPointer != nil && @parser.phase == @parser.phases[:inHead]
      if goes_in_head
        appendToHead(style)
      else
        @tree.openElements[-1].appendChild(style)
      end
      @tree.openElements.push(style)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    # <script>: like <style>, with the element additionally flagged as
    # "parser-inserted" right after creation.
    def startTagScript(name, attributes)
      #XXX Inner HTML case may be wrong
      script = @tree.createElement(name, attributes)
      script._flags.push("parser-inserted")
      goes_in_head = @tree.headPointer != nil && @parser.phase == @parser.phases[:inHead]
      if goes_in_head
        appendToHead(script)
      else
        @tree.openElements[-1].appendChild(script)
      end
      @tree.openElements.push(script)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    # <base>, <link> and <meta> are created and appended to the head without
    # being pushed onto the open elements.
    def startTagBaseLinkMeta(name, attributes)
      appendToHead(@tree.createElement(name, attributes))
    end

    # Any other start tag ends the head and is reprocessed.
    def startTagOther(name, attributes)
      anythingElse
      @parser.phase.processStartTag(name, attributes)
    end

    # </head>: pops the head when it is the current node, otherwise reports a
    # parse error. Either way the parser moves to the afterHead phase.
    def endTagHead(name)
      if @tree.openElements[-1].name != 'head'
        @parser.parseError(_("Unexpected end tag (head). Ignored."))
      else
        @tree.openElements.pop
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    # </html> ends the head and is reprocessed.
    def endTagHtml(name)
      anythingElse
      @parser.phase.processEndTag(name)
    end

    # </title>, </style>, </script>: pops the current node when it matches,
    # otherwise a parse error.
    def endTagTitleStyleScript(name)
      if @tree.openElements[-1].name != name
        @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
      else
        @tree.openElements.pop
      end
    end

    # Any other end tag is a parse error and is dropped.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}). Ignored.")
      @parser.parseError(message)
    end

    # Leaves the head: acts as </head> when the head is the current node,
    # otherwise just switches to the afterHead phase.
    def anythingElse
      if @tree.openElements[-1].name != 'head'
        @parser.phase = @parser.phases[:afterHead]
      else
        endTagHead('head')
      end
    end

    protected

    # Appends the element to the head pointer, or to the current node when no
    # head pointer exists (expected only when parsing innerHTML).
    def appendToHead(element)
      unless @tree.headPointer.nil?
        return @tree.headPointer.appendChild(element)
      end
      assert @parser.innerHTML
      @tree.openElements[-1].appendChild(element)
    end

  end
end

View file

@ -0,0 +1,87 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase used while a table row (<tr>) is open.
  class InRowPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'

    handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'

    # Characters are handled by the inTable phase.
    def processCharacters(data)
      in_table = @parser.phases[:inTable]
      in_table.processCharacters(data)
    end

    # <td>/<th>: the stack is cleared back to the row, the cell is inserted,
    # the parser moves to the inCell phase and a Marker is pushed onto the
    # active formatting elements.
    def startTagTableCell(name, attributes)
      clearStackToTableRowContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCell]
      @tree.activeFormattingElements.push(Marker)
    end

    # Other table-structure start tags close the row and are reprocessed
    # unless the implied </tr> was ignored.
    def startTagTableOther(name, attributes)
      row_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the innerHTML case?
      @parser.phase.processStartTag(name, attributes) unless row_ignored
    end

    # Any other start tag is handled by the inTable phase.
    def startTagOther(name, attributes)
      in_table = @parser.phases[:inTable]
      in_table.processStartTag(name, attributes)
    end

    # Handles </tr>: a parse error in the ignored (innerHTML) case, otherwise
    # the stack is cleared back to the row, the row is popped and the parser
    # returns to the inTableBody phase.
    def endTagTr(name)
      if ignoreEndTagTr
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      clearStackToTableRowContext
      @tree.openElements.pop
      @parser.phase = @parser.phases[:inTableBody]
    end

    # </table> closes the row and is reprocessed if the implied </tr> was
    # not ignored.
    def endTagTable(name)
      row_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the innerHTML case?
      @parser.phase.processEndTag(name) unless row_ignored
    end

    # </tbody>, </tfoot>, </thead>: when the named element is in scope the
    # row is closed and the end tag is reprocessed; otherwise a parse error.
    def endTagTableRowGroup(name)
      unless in_scope?(name, true)
        # innerHTML case
        return @parser.parseError
      end

      endTagTr('tr')
      @parser.phase.processEndTag(name)
    end

    # End tags that cannot close anything here are a parse error and dropped.
    def endTagIgnore(name)
      message = _("Unexpected end tag (#{name}) in the row phase. Ignored.")
      @parser.parseError(message)
    end

    # Any other end tag is handled by the inTable phase.
    def endTagOther(name)
      in_table = @parser.phases[:inTable]
      in_table.processEndTag(name)
    end

    protected

    # Pops open elements, reporting a parse error for each, until the current
    # node is a tr or the html element.
    # XXX unify this with other table helper methods
    def clearStackToTableRowContext
      name = @tree.openElements[-1].name
      until ['tr', 'html'].include?(name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
        @tree.openElements.pop
        name = @tree.openElements[-1].name
      end
    end

    # True when a </tr> would be ignored because no tr is in scope.
    # NOTE(review): other phases in this file pass a bare +true+ as the
    # second argument to in_scope?; confirm that the hash form is accepted.
    def ignoreEndTagTr
      !in_scope?('tr', :tableVariant => true)
    end

  end
end

View file

@ -0,0 +1,84 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase for tokens seen inside a select element. Only option,
  # optgroup and select tags are acted on; other tags are reported as parse
  # errors.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text at the current node.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # <option>: inserts the element, first closing an open option.
    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.insertElement(name, attributes)
    end

    # <optgroup>: inserts the element, first closing an open option and then
    # an open optgroup.
    def startTagOptgroup(name, attributes)
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
      @tree.insertElement(name, attributes)
    end

    # A nested <select> is a parse error and is treated as </select>.
    def startTagSelect(name, attributes)
      @parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and dropped.
    def startTagOther(name, attributes)
      # Double quotes are required here so that the tag name is interpolated
      # into the message instead of the literal text "#{name}".
      @parser.parseError(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # </option>: pops the current node if it is an option, otherwise a parse
    # error.
    def endTagOption(name)
      if @tree.openElements[-1].name == 'option'
        @tree.openElements.pop
      else
        @parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    # </optgroup>: closes an option directly inside an optgroup, then the
    # optgroup itself; anything else is a parse error.
    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
        @tree.openElements.pop
      end
      # It also closes </optgroup>
      if @tree.openElements[-1].name == 'optgroup'
        @tree.openElements.pop
      # But nothing else
      else
        @parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # </select>: pops up to and including the select element and resets the
    # insertion mode, provided a select is in table scope.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.resetInsertionMode
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Table-related end tags are always a parse error; when the named
    # element is in table scope the select is closed and the tag is
    # reprocessed by the resulting phase.
    def endTagTableElements(name)
      @parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))

      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and dropped.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end

  end
end

View file

@ -0,0 +1,83 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase for tokens seen inside a table row group (tbody, tfoot or
  # thead). Rows are opened here; tokens without a dedicated handler are
  # forwarded to the :inTable phase.
  class InTableBodyPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'

    # The suffix must be 'Ignore' so these tags dispatch to endTagIgnore
    # below; a misspelled suffix would name a method that does not exist.
    handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'

    # Character data is handled exactly as in the :inTable phase.
    def processCharacters(data)
      @parser.phases[:inTable].processCharacters(data)
    end

    # <tr>: clear back to the row group, insert the row, switch to :inRow.
    def startTagTr(name, attributes)
      clearStackToTableBodyContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inRow]
    end

    # <td>/<th> directly in a row group is a parse error: a tr is implied
    # and the cell is reprocessed by the resulting phase.
    def startTagTableCell(name, attributes)
      @parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
      startTagTr('tr', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # Start tags that end the row group: close the current row group and
    # reprocess the tag, provided a row group is in table scope.
    def startTagTableOther(name, attributes)
      # XXX AT Any ideas on how to share this with endTagTable?
      if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processStartTag(name, attributes)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Everything else is handled by the :inTable phase.
    def startTagOther(name, attributes)
      @parser.phases[:inTable].processStartTag(name, attributes)
    end

    # </tbody>, </tfoot>, </thead>: pop the row group and return to
    # :inTable, or report a parse error when it is not in table scope.
    def endTagTableRowGroup(name)
      if in_scope?(name, true)
        clearStackToTableBodyContext
        @tree.openElements.pop
        @parser.phase = @parser.phases[:inTable]
      else
        @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
      end
    end

    # </table>: close the current row group and reprocess the end tag,
    # provided a row group is in table scope.
    def endTagTable(name)
      if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processEndTag(name)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # End tags that have no effect inside a row group.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
    end

    # Everything else is handled by the :inTable phase.
    def endTagOther(name)
      @parser.phases[:inTable].processEndTag(name)
    end

    protected

    # Pops open elements, reporting each as a parse error, until the current
    # node is a tbody, tfoot, thead or html element.
    def clearStackToTableBodyContext
      until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
        @tree.openElements.pop
      end
    end

  end
end

View file

@ -0,0 +1,110 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase for tokens seen directly inside a table element. Table
  # structure tags are handled here; anything else is processed by the
  # :inBody phase with the tree's insertFromTable flag switched on.
  class InTablePhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table

    handle_start 'html', 'caption', 'colgroup', 'col', 'table'

    handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'

    handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # Non-space characters in a table are a parse error and are processed by
    # the :inBody phase while insertFromTable is set.
    def processCharacters(data)
      @parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the character in the "in body" mode
      @parser.phases[:inBody].processCharacters(data)
      @tree.insertFromTable = false
    end

    # <caption>: clear back to the table, push a Marker, insert the element
    # and switch to :inCaption.
    def startTagCaption(name, attributes)
      clearStackToTableContext
      @tree.activeFormattingElements.push(Marker)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCaption]
    end

    # <colgroup>: clear back to the table, insert the element and switch to
    # :inColumnGroup.
    def startTagColgroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inColumnGroup]
    end

    # <col>: implies a colgroup, then the tag is reprocessed.
    def startTagCol(name, attributes)
      startTagColgroup('colgroup', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # <tbody>, <tfoot>, <thead>: clear back to the table, insert the element
    # and switch to :inTableBody.
    def startTagRowGroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inTableBody]
    end

    # <td>, <th>, <tr>: implies a tbody, then the tag is reprocessed.
    def startTagImplyTbody(name, attributes)
      startTagRowGroup('tbody', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # A nested <table> is a parse error that implies </table>; the start tag
    # is then reprocessed unless the parser is in innerHTML mode.
    def startTagTable(name, attributes)
      @parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
      @parser.phase.processEndTag('table')
      @parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
    end

    # Any other start tag is a parse error and is processed by the :inBody
    # phase while insertFromTable is set.
    def startTagOther(name, attributes)
      @parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the start tag in the "in body" mode
      @parser.phases[:inBody].processStartTag(name, attributes)
      @tree.insertFromTable = false
    end

    # </table>: generate implied end tags, pop up to and including the table
    # and reset the insertion mode, provided a table is in table scope.
    def endTagTable(name)
      if in_scope?('table', true)
        @tree.generateImpliedEndTags

        unless @tree.openElements[-1].name == 'table'
          @parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
        end

        remove_open_elements_until('table')

        @parser.resetInsertionMode
      else
        # innerHTML case
        assert @parser.innerHTML
        @parser.parseError
      end
    end

    # End tags that have no effect inside a table.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Any other end tag is a parse error and is processed by the :inBody
    # phase while insertFromTable is set.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in.
      # The flag belongs to the tree builder, as in processCharacters and
      # startTagOther above, so it is set on @tree rather than @parser.
      @tree.insertFromTable = true
      # Process the end tag in the "in body" mode
      @parser.phases[:inBody].processEndTag(name)
      @tree.insertFromTable = false
    end

    protected

    # Pops open elements, reporting each as a parse error, until the current
    # node is a table or html element.
    def clearStackToTableContext
      # "clear the stack back to a table context"
      until ['table', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
        @tree.openElements.pop
      end
      # When the current node is <html> it's an innerHTML case
    end

  end
end

View file

@ -0,0 +1,49 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # First phase of parsing, in which a DOCTYPE is expected.
  class InitialPhase < Phase

    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define this.

    # End of input before a DOCTYPE: report, then let :rootElement handle it.
    def processEOF
      fallBackToRootElement(_('Unexpected End of file. Expected DOCTYPE.')).processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Inserts the DOCTYPE (reporting it first when the tokenizer flagged it
    # as erroneous) and moves on to the :rootElement phase.
    def processDoctype(name, error)
      if error
        @parser.parseError(_('Erroneous DOCTYPE.'))
      end
      @tree.insertDoctype(name)
      @parser.phase = @parser.phases[:rootElement]
    end

    # Whitespace is inserted as text on the document itself.
    def processSpaceCharacters(data)
      @tree.insertText(data, @tree.document)
    end

    # Text before a DOCTYPE: report, then let :rootElement handle it.
    def processCharacters(data)
      fallBackToRootElement(_('Unexpected non-space characters. Expected DOCTYPE.')).processCharacters(data)
    end

    # Start tag before a DOCTYPE: report, then let :rootElement handle it.
    def processStartTag(name, attributes)
      fallBackToRootElement(_("Unexpected start tag (#{name}). Expected DOCTYPE.")).processStartTag(name, attributes)
    end

    # End tag before a DOCTYPE: report, then let :rootElement handle it.
    def processEndTag(name)
      fallBackToRootElement(_("Unexpected end tag (#{name}). Expected DOCTYPE.")).processEndTag(name)
    end

    private

    # Reports +message+ as a parse error, switches the parser to the
    # :rootElement phase and returns the parser's now-current phase so the
    # caller can replay the token on it.
    def fallBackToRootElement(message)
      @parser.parseError(message)
      @parser.phase = @parser.phases[:rootElement]
      @parser.phase
    end

  end
end

View file

@ -0,0 +1,156 @@
module HTML5lib
  # Base class for helper objects that implement each phase of processing.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase

    # Builds a hash that maps tag names to handler method names.
    #
    # Plain arguments (a tag name or an array of tag names) get a method name
    # made of +prefix+ plus the capitalized tag names joined together. A
    # trailing hash maps tag name(s) to an explicit method-name suffix.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    def self.tag_handlers(prefix, *tags)
      mapping = {}

      # Explicit suffixes are applied first, so a tag that is also listed as
      # a plain argument ends up with its derived method name.
      explicit = tags.last.is_a?(Hash) ? tags.pop : {}
      explicit.each do |names, handler_method_suffix|
        handler_method = prefix + handler_method_suffix
        Array(names).each { |name| mapping[name] = handler_method }
      end

      tags.each do |names|
        names = Array(names)
        handler_method = prefix + names.map { |name| name.capitalize }.join
        names.each { |name| mapping[name] = handler_method }
      end

      mapping
    end

    # Per-class table of start tag handlers; unknown tags resolve to
    # 'startTagOther'.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # Per-class table of end tag handlers; unknown tags resolve to
    # 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    # parser - the parser driving this phase (provides phases, parseError, ...)
    # tree   - the tree builder the phase manipulates
    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Default end-of-file handling: generate implied end tags and report a
    # parse error when elements are still unexpectedly open.
    def processEOF
      @tree.generateImpliedEndTags

      if @tree.openElements.length > 2
        @parser.parseError(_('Unexpected end of file. Missing closing tags.'))
      elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
        # This happens for framesets or something?
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
      elsif @parser.innerHTML and @tree.openElements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        @parser.parseError(_('XXX innerHTML EOF'))
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insertComment(data, @tree.openElements[-1])
    end

    # A DOCTYPE anywhere but the initial phase is reported and dropped.
    def processDoctype(name, error)
      @parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
    end

    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches to the handler registered for +name+ via handle_start.
    def processStartTag(name, attributes)
      send self.class.start_tag_handlers[name], name, attributes
    end

    # Shared <html> handling: copies attributes that the root element does
    # not have yet onto it.
    def startTagHtml(name, attributes)
      if @parser.firstStartTag == false and name == 'html'
        @parser.parseError(_('html needs to be the first start tag.'))
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke @parser.parseError.
      attributes.each do |attr, value|
        unless @tree.openElements[0].attributes.has_key?(attr)
          @tree.openElements[0].attributes[attr] = value
        end
      end
      @parser.firstStartTag = false
    end

    # Dispatches to the handler registered for +name+ via handle_end.
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Wrapper applied to every parse error message; currently returns the
    # string unchanged.
    def _(string)
      string
    end

    # Raises AssertionError unless +value+ is truthy. `raise` is used because
    # `throw` is part of catch/throw control flow and does not raise the
    # given exception.
    def assert(value)
      raise AssertionError.new unless value
    end

    # Forwards to the tree builder's elementInScope.
    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one named +name+ has been popped or, when no
    # name is given, until the block returns a true value for the popped
    # element. Returns the last element popped.
    def remove_open_elements_until(name=nil)
      finished = false
      until finished
        element = @tree.openElements.pop
        finished = name.nil? ? yield(element) : element.name == name
      end
      return element
    end

  end
end

View file

@ -0,0 +1,43 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase that creates the html root element. Every token except comments
  # and whitespace causes the element to be inserted, after which the token
  # is replayed on the phase the parser has switched to.
  class RootElementPhase < Phase

    def processEOF
      insertHtmlElement
      following = @parser.phase
      following.processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      document = @tree.document
      @tree.insertComment(data, document)
    end

    # Whitespace is inserted as text on the document itself.
    def processSpaceCharacters(data)
      document = @tree.document
      @tree.insertText(data, document)
    end

    def processCharacters(data)
      insertHtmlElement
      following = @parser.phase
      following.processCharacters(data)
    end

    # Records on the parser that the first start tag was <html> before the
    # root element is inserted and the tag is replayed.
    def processStartTag(name, attributes)
      if name == 'html'
        @parser.firstStartTag = true
      end
      insertHtmlElement
      following = @parser.phase
      following.processStartTag(name, attributes)
    end

    def processEndTag(name)
      insertHtmlElement
      following = @parser.phase
      following.processEndTag(name)
    end

    # Creates an attribute-less html element, pushes it onto the stack of
    # open elements, appends it to the document and switches the parser to
    # the :beforeHead phase.
    def insertHtmlElement
      html = @tree.createElement('html', {})
      @tree.openElements.push(html)
      @tree.document.appendChild(html)
      @parser.phase = @parser.phases[:beforeHead]
    end

  end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase entered when only the end of the file is expected. Anything other
  # than comments and whitespace is a parse error and sends the parser back
  # to @parser.lastPhase, which then processes the token.
  class TrailingEndPhase < Phase

    # Nothing left to do at end of file.
    def processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Whitespace is handled by the previous phase without switching to it.
    def processSpaceCharacters(data)
      @parser.lastPhase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.lastPhase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double quotes are required so the tag name is interpolated into the
      # message instead of the literal text "#{name}".
      @parser.parseError(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double quotes are required so the tag name is interpolated into the
      # message instead of the literal text "#{name}".
      @parser.parseError(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -3,14 +3,14 @@ require 'html5lib/constants'
module HTML5lib
# Provides a unicode stream of characters to the HTMLTokenizer.
# Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream
class HTMLInputStream
attr_accessor :queue, :charEncoding
attr_accessor :queue, :char_encoding
# Initialises the HTMLInputStream.
#
@ -28,16 +28,16 @@ class HTMLInputStream
def initialize(source, options = {})
@encoding = nil
@parseMeta = true
@parse_meta = true
@chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur
@newLines = []
@new_lines = []
# Raw Stream
@rawStream = openStream(source)
@raw_stream = open_stream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
@ -47,15 +47,15 @@ class HTMLInputStream
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
@charEncoding = detectEncoding
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
@char_encoding = detect_encoding
else
@charEncoding = @encoding
@char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
uString = @rawStream.read
unless @charEncoding == 'utf-8'
uString = @raw_stream.read
unless @char_encoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
@ -68,7 +68,7 @@ class HTMLInputStream
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream
@dataStream = uString
@data_stream = uString
@queue = []
@ -79,7 +79,7 @@ class HTMLInputStream
# Produces a file object from source.
#
# source can be either a file object, local filename or a string.
def openStream(source)
def open_stream(source)
# Already an IO like object
if source.respond_to?(:read)
@stream = source
@ -90,24 +90,24 @@ class HTMLInputStream
return @stream
end
def detectEncoding
def detect_encoding
#First look for a BOM
#This will also read past the BOM if present
encoding = detectBOM
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parseMeta
encoding = detectEncodingMeta
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if available
if encoding.nil? and @chardet
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffer = @rawStream.read
buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
@rawStream = openStream(buffer)
@raw_stream = open_stream(buffer)
rescue LoadError
end
end
@ -117,10 +117,10 @@ class HTMLInputStream
end
#Substitute for equivalent encodings:
encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
if encodingSub.has_key?(encoding.downcase)
encoding = encodingSub[encoding.downcase]
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
end
return encoding
@ -129,8 +129,8 @@ class HTMLInputStream
# Attempts to detect a BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil
def detectBOM
bomDict = {
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
@ -139,19 +139,19 @@ class HTMLInputStream
}
# Go to beginning of file and read in 4 bytes
@rawStream.seek(0)
string = @rawStream.read(4)
@raw_stream.seek(0)
string = @raw_stream.read(4)
return nil unless string
# Try detecting the BOM using bytes from the string
encoding = bomDict[string[0...3]] # UTF-8
encoding = bom_dict[string[0...3]] # UTF-8
seek = 3
unless encoding
# Need to detect UTF-32 before UTF-16
encoding = bomDict[string] # UTF-32
encoding = bom_dict[string] # UTF-32
seek = 4
unless encoding
encoding = bomDict[string[0...2]] # UTF-16
encoding = bom_dict[string[0...2]] # UTF-16
seek = 2
end
end
@ -159,36 +159,36 @@ class HTMLInputStream
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
@rawStream.seek(encoding ? seek : 0)
@raw_stream.seek(encoding ? seek : 0)
return encoding
end
# Report the encoding declared by the meta element
def detectEncodingMeta
parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
@rawStream.seek(0)
return parser.getEncoding
def detect_encoding_meta
parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
@raw_stream.seek(0)
return parser.get_encoding
end
def determineNewLines
def determine_new_lines
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
@newLines.push(0)
(0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
@new_lines.push(0)
(0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
end
# Returns (line, col) of the current position in the stream.
def position
# Generate list of new lines first time around
determineNewLines if @newLines.empty?
determine_new_lines if @new_lines.empty?
line = 0
tell = @tell
@newLines.each do |pos|
@new_lines.each do |pos|
break unless pos < tell
line += 1
end
col = tell - @newLines[line-1] - 1
col = tell - @new_lines[line-1] - 1
return [line, col]
end
@ -205,7 +205,7 @@ class HTMLInputStream
else
begin
@tell += 1
return @dataStream[@tell - 1].chr
return @data_stream[@tell - 1].chr
rescue
return :EOF
end
@ -215,22 +215,22 @@ class HTMLInputStream
# Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be
# any container that supports the in method being called on it.
def charsUntil(characters, opposite = false)
charStack = [char]
def chars_until(characters, opposite=false)
char_stack = [char]
unless charStack[0] == :EOF
while (characters.include? charStack[-1]) == opposite
unless char_stack[0] == :EOF
while (characters.include? char_stack[-1]) == opposite
unless @queue.empty?
# First from the queue
charStack.push(@queue.shift)
break if charStack[-1] == :EOF
char_stack.push(@queue.shift)
break if char_stack[-1] == :EOF
else
# Then the rest
begin
charStack.push(@dataStream[@tell].chr)
char_stack.push(@data_stream[@tell].chr)
@tell += 1
rescue
charStack.push(:EOF)
char_stack.push(:EOF)
break
end
end
@ -239,14 +239,14 @@ class HTMLInputStream
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, charStack.pop)
return charStack.join('')
@queue.insert(0, char_stack.pop)
return char_stack.join('')
end
end
end
# String-like object with an associated position and various extra methods
# If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String
# String-like object with an associated position and various extra methods
# If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String
attr_accessor :position
@ -263,14 +263,14 @@ class EncodingBytes < String
rescue EOF
end
def currentByte
def current_byte
raise EOF if @position >= length
return self[@position].chr
end
# Skip past a list of characters
def skip(chars = SPACE_CHARACTERS)
while chars.include?(currentByte)
def skip(chars=SPACE_CHARACTERS)
while chars.include?(current_byte)
@position += 1
end
end
@ -278,7 +278,7 @@ class EncodingBytes < String
# Look for a sequence of bytes at the start of a string. If the bytes
# are found return true and advance the position to the byte after the
# match. Otherwise return false and leave the position alone
def matchBytes(bytes, lower = false)
def match_bytes(bytes, lower=false)
data = self[position ... position+bytes.length]
data.downcase! if lower
rv = (data == bytes)
@ -288,10 +288,10 @@ class EncodingBytes < String
# Look for the next sequence of bytes matching a given sequence. If
# a match is found advance the position to the last byte of the match
def jumpTo(bytes)
newPosition = self[position .. -1].index(bytes)
if newPosition
@position += (newPosition + bytes.length-1)
def jump_to(bytes)
new_position = self[position .. -1].index(bytes)
if new_position
@position += (new_position + bytes.length-1)
return true
else
raise EOF
@ -300,15 +300,15 @@ class EncodingBytes < String
# Move the pointer so it points to the next byte in a set of possible
# bytes
def findNext(byteList)
until byteList.include?(currentByte)
def find_next(byte_list)
until byte_list.include?(current_byte)
@position += 1
end
end
end
end
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# string - the data to work on for encoding detection
def initialize(data)
@ -317,139 +317,139 @@ class EncodingParser
end
@@method_dispatch = [
['<!--', :handleComment],
['<meta', :handleMeta],
['</', :handlePossibleEndTag],
['<!', :handleOther],
['<?', :handleOther],
['<', :handlePossibleStartTag]
['<!--', :handle_comment],
['<meta', :handle_meta],
['</', :handle_possible_end_tag],
['<!', :handle_other],
['<?', :handle_other],
['<', :handle_possible_start_tag]
]
def getEncoding
def get_encoding
@data.each do |byte|
keepParsing = true
keep_parsing = true
@@method_dispatch.each do |(key, method)|
if @data.matchBytes(key, lower = true)
keepParsing = send(method)
if @data.match_bytes(key, lower = true)
keep_parsing = send(method)
break
end
end
break unless keepParsing
break unless keep_parsing
end
@encoding = @encoding.strip unless @encoding.nil?
return @encoding
end
# Skip over comments
def handleComment
return @data.jumpTo('-->')
def handle_comment
return @data.jump_to('-->')
end
def handleMeta
def handle_meta
# if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.currentByte)
return true unless SPACE_CHARACTERS.include?(@data.current_byte)
#We have a valid meta element we want to search for attributes
while true
#Try to find the next attribute after the current position
attr = getAttribute
attr = get_attribute
return true if attr.nil?
if attr[0] == 'charset'
tentativeEncoding = attr[1]
if HTML5lib.isValidEncoding(tentativeEncoding)
@encoding = tentativeEncoding
tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
elsif attr[0] == 'content'
contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentativeEncoding = contentParser.parse
if HTML5lib.isValidEncoding(tentativeEncoding)
@encoding = tentativeEncoding
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
end
end
end
def handlePossibleStartTag
return handlePossibleTag(false)
def handle_possible_start_tag
return handle_possible_tag(false)
end
def handlePossibleEndTag
@data.position+=1
return handlePossibleTag(true)
def handle_possible_end_tag
@data.position += 1
return handle_possible_tag(true)
end
def handlePossibleTag(endTag)
unless ASCII_LETTERS.include?(@data.currentByte)
def handle_possible_tag(end_tag)
unless ASCII_LETTERS.include?(@data.current_byte)
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag
if end_tag
@data.position -= 1
handleOther
handle_other
end
return true
end
@data.findNext(SPACE_CHARACTERS + ['<', '>'])
@data.find_next(SPACE_CHARACTERS + ['<', '>'])
if @data.currentByte == '<'
if @data.current_byte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until getAttribute.nil?
{} until get_attribute.nil?
end
return true
end
def handleOther
return @data.jumpTo('>')
def handle_other
return @data.jump_to('>')
end
# Return a name,value pair for the next attribute in the stream,
# if one is found, or nil
def getAttribute
def get_attribute
@data.skip(SPACE_CHARACTERS + ['/'])
if @data.currentByte == '<'
if @data.current_byte == '<'
@data.position -= 1
return nil
elsif @data.currentByte == '>'
elsif @data.current_byte == '>'
return nil
end
attrName = []
attrValue = []
spaceFound = false
attr_name = []
attr_value = []
space_found = false
#Step 5 attribute name
while true
if @data.currentByte == '=' and attrName:
if @data.current_byte == '=' and attr_name:
break
elsif SPACE_CHARACTERS.include?(@data.currentByte)
spaceFound = true
elsif SPACE_CHARACTERS.include?(@data.current_byte)
space_found = true
break
elsif ['/', '<', '>'].include?(@data.currentByte)
return [attrName.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrName.push(@data.currentByte.downcase)
elsif ['/', '<', '>'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_name.push(@data.current_byte.downcase)
else
attrName.push(@data.currentByte)
attr_name.push(@data.current_byte)
end
#Step 6
@data.position += 1
end
#Step 7
if spaceFound
if space_found
@data.skip
#Step 8
unless @data.currentByte == '='
unless @data.current_byte == '='
@data.position -= 1
return [attrName.join(''), '']
return [attr_name.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
@ -458,92 +458,93 @@ class EncodingParser
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.currentByte)
if ["'", '"'].include?(@data.current_byte)
#11.1
quoteChar = @data.currentByte
quote_char = @data.current_byte
while true
@data.position+=1
#11.3
if @data.currentByte == quoteChar
if @data.current_byte == quote_char
@data.position += 1
return [attrName.join(''), attrValue.join('')]
return [attr_name.join(''), attr_value.join('')]
#11.4
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
#11.5
else
attrValue.push(@data.currentByte)
attr_value.push(@data.current_byte)
end
end
elsif ['>', '<'].include?(@data.currentByte)
return [attrName.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
elsif ['>', '<'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attrValue.push(@data.currentByte)
attr_value.push(@data.current_byte)
end
while true
@data.position +=1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
return [attrName.join(''), attrValue.join('')]
elsif ASCII_UPPERCASE.include?(@data.currentByte)
attrValue.push(@data.currentByte.downcase)
@data.position += 1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
return [attr_name.join(''), attr_value.join('')]
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attrValue.push(@data.currentByte)
attr_value.push(@data.current_byte)
end
end
end
end
end
class ContentAttrParser
class ContentAttrParser
def initialize(data)
@data = data
end
def parse
begin
#Skip to the first ";"
@data.position = 0
@data.jumpTo(';')
@data.jump_to(';')
@data.position += 1
@data.skip
#Check if the attr name is charset
#otherwise return
@data.jumpTo('charset')
@data.jump_to('charset')
@data.position += 1
@data.skip
unless @data.currentByte == '='
unless @data.current_byte == '='
#If there is no = sign keep looking for attrs
return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.currentByte)
quoteMark = @data.currentByte
if ['"', "'"].include?(@data.current_byte)
quote_mark = @data.current_byte
@data.position += 1
oldPosition = @data.position
@data.jumpTo(quoteMark)
return @data[oldPosition ... @data.position]
old_position = @data.position
@data.jump_to(quote_mark)
return @data[old_position ... @data.position]
else
#Unquoted value
oldPosition = @data.position
old_position = @data.position
begin
@data.findNext(SPACE_CHARACTERS)
return @data[oldPosition ... @data.position]
@data.find_next(SPACE_CHARACTERS)
return @data[old_position ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[oldPosition .. -1]
return @data[old_position .. -1]
end
end
rescue EOF
return nil
end
end
end
end
# Determine if a string is a supported encoding
def self.isValidEncoding(encoding)
# Determine if a string is a supported encoding.
# Only String values can qualify; the name is lower-cased and stripped of
# surrounding whitespace before it is looked up in ENCODINGS.
def self.is_valid_encoding(encoding)
  return false unless encoding.kind_of?(String)

  ENCODINGS.include?(encoding.downcase.strip)
end
end
end

View file

@ -16,10 +16,10 @@ require 'html5lib/constants'
module HTML5lib
# liberal XML parser
class XMLParser < HTMLParser
# liberal XML parser
class XMLParser < HTMLParser
def initialize(options={})
def initialize(options = {})
super options
@phases[:initial] = XmlRootPhase.new(self, @tree)
end
@ -53,12 +53,12 @@ class XMLParser < HTMLParser
return token
end
end
end
# liberal XHTML parser
class XHTMLParser < XMLParser
# liberal XHTML parser
class XHTMLParser < XMLParser
def initialize(options={})
def initialize(options = {})
super options
@phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
@ -82,18 +82,18 @@ class XHTMLParser < XMLParser
return token
end
end
end
class XhmlRootPhase < RootElementPhase
class XhmlRootPhase < RootElementPhase
# Create the root "html" element carrying the XHTML namespace, push it onto
# the tree's open-element stack, append it to the document, and switch the
# parser to its :beforeHead phase.
def insertHtmlElement
  xhtml_attributes = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
  root = @tree.createElement("html", xhtml_attributes)
  @tree.openElements.push(root)
  @tree.document.appendChild(root)
  @parser.phase = @parser.phases[:beforeHead]
end
end
end
class XmlRootPhase < Phase
class XmlRootPhase < Phase
# Prime the Xml parser
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
@ -108,9 +108,9 @@ class XmlRootPhase < Phase
super
@tree.openElements.pop
end
end
end
class XmlElementPhase < Phase
class XmlElementPhase < Phase
# Generic handling for all XML elements
@start_tag_handlers = Hash.new(:startTagOther)
@ -136,6 +136,6 @@ class XmlElementPhase < Phase
def processCharacters(data)
@tree.insertText(data)
end
end
end
end

View file

@ -6,7 +6,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
class HTMLSanitizer < HTMLTokenizer
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
@ -144,7 +144,6 @@ class HTMLSanitizer < HTMLTokenizer
else
yield token
end
end
end
@ -157,7 +156,7 @@ class HTMLSanitizer < HTMLTokenizer
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
@ -174,5 +173,5 @@ class HTMLSanitizer < HTMLTokenizer
style = clean.join(' ')
end
end
end
end

View file

@ -3,27 +3,27 @@ require 'html5lib/inputstream'
module HTML5lib
# This class takes care of tokenizing HTML.
#
# * @currentToken
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
# This class takes care of tokenizing HTML.
#
# * @currentToken
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
class HTMLTokenizer
attr_accessor :contentModelFlag, :currentToken
attr_reader :stream
# XXX need to fix documentation
def initialize(stream, options={})
def initialize(stream, options = {})
@stream = HTMLInputStream.new(stream, options)
@states = {
@ -147,7 +147,7 @@ class HTMLTokenizer
charAsInt = 65533
end
if charAsInt <= 0x10FFF
if charAsInt <= 0x10FFFF
char = [charAsInt].pack('U')
else
@tokenQueue.push({:type => :ParseError, :data =>
@ -261,13 +261,11 @@ class HTMLTokenizer
@state = @states[:data]
end
# Below are the various tokenizer states worked out.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
def dataState
data = @stream.char
if data == "&" and (@contentModelFlag == :PCDATA or
@ -285,10 +283,10 @@ class HTMLTokenizer
# XXX need to check if we don't need a special "spaces" flag on
# characters.
@tokenQueue.push({:type => :SpaceCharacters, :data =>
data + @stream.charsUntil(SPACE_CHARACTERS, true)})
data + @stream.chars_until(SPACE_CHARACTERS, true)})
else
@tokenQueue.push({:type => :Characters, :data =>
data + @stream.charsUntil(["&", "<"])})
data + @stream.chars_until(["&", "<"])})
end
return true
end
@ -430,7 +428,7 @@ class HTMLTokenizer
emitCurrentToken
elsif ASCII_LETTERS.include? data
@currentToken[:name] += data +\
@stream.charsUntil(ASCII_LETTERS, true)
@stream.chars_until(ASCII_LETTERS, true)
elsif data == ">"
emitCurrentToken
elsif data == "<"
@ -450,7 +448,7 @@ class HTMLTokenizer
def beforeAttributeNameState
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true)
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected attribute name instead.")})
@ -486,7 +484,7 @@ class HTMLTokenizer
leavingThisState = false
elsif ASCII_LETTERS.include? data
@currentToken[:data][-1][0] += data +\
@stream.charsUntil(ASCII_LETTERS, true)
@stream.chars_until(ASCII_LETTERS, true)
leavingThisState = false
elsif data == ">"
# XXX If we emit here the attributes are converted to a dict
@ -529,7 +527,7 @@ class HTMLTokenizer
def afterAttributeNameState
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true)
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "="
@state = @states[:beforeAttributeValue]
elsif data == ">"
@ -559,7 +557,7 @@ class HTMLTokenizer
def beforeAttributeValueState
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true)
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "\""
@state = @states[:attributeValueDoubleQuoted]
elsif data == "&"
@ -597,7 +595,7 @@ class HTMLTokenizer
emitCurrentToken
else
@currentToken[:data][-1][1] += data +\
@stream.charsUntil(["\"", "&"])
@stream.chars_until(["\"", "&"])
end
return true
end
@ -614,7 +612,7 @@ class HTMLTokenizer
emitCurrentToken
else
@currentToken[:data][-1][1] += data +\
@stream.charsUntil(["'", "&"])
@stream.chars_until(["'", "&"])
end
return true
end
@ -638,17 +636,17 @@ class HTMLTokenizer
emitCurrentToken
else
@currentToken[:data][-1][1] += data +
@stream.charsUntil(["&", ">","<"] + SPACE_CHARACTERS)
@stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
end
return true
end
def bogusCommentState
# Make a new comment token and give it as value all the characters
# until the first > or :EOF (charsUntil checks for :EOF automatically)
# until the first > or :EOF (chars_until checks for :EOF automatically)
# and emit it.
@tokenQueue.push(
{:type => :Comment, :data => @stream.charsUntil((">"))})
{:type => :Comment, :data => @stream.chars_until((">"))})
# Eat the character directly after the bogus comment which is either a
# ">" or an :EOF.
@ -690,7 +688,7 @@ class HTMLTokenizer
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:data] += data + @stream.charsUntil("-")
@currentToken[:data] += data + @stream.chars_until("-")
end
return true
end
@ -706,7 +704,7 @@ class HTMLTokenizer
@state = @states[:data]
else
@currentToken[:data] += "-" + data +\
@stream.charsUntil("-")
@stream.chars_until("-")
# Consume the next character which is either a "-" or an :EOF as
# well so if there's a "-" directly after the "-" we go nicely to
# the "comment end state" without emitting a ParseError there.
@ -849,6 +847,6 @@ class HTMLTokenizer
end
def _(string); string; end
end
end
end

View file

@ -1,5 +1,5 @@
module HTML5lib
module TreeBuilders
module TreeBuilders
def self.getTreeBuilder(name)
case name.to_s.downcase
@ -17,5 +17,5 @@ module TreeBuilders
end
end
end
end
end

View file

@ -4,15 +4,15 @@ require 'html5lib/constants'
module HTML5lib
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
module TreeBuilders
module Base
module TreeBuilders
module Base
class Node
class Node
# The parent of the current node (or nil for the document node)
attr_accessor :parent
@ -36,7 +36,7 @@ class Node
# Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore = nil)
def insertText(data, insertBefore=nil)
raise NotImplementedError
end
@ -71,10 +71,10 @@ class Node
def hasContent
raise NotImplementedError
end
end
end
# Base treebuilder implementation
class TreeBuilder
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
@ -118,7 +118,7 @@ class TreeBuilder
@document = @documentClass.new
end
def elementInScope(target, tableVariant = false)
def elementInScope(target, tableVariant=false)
# Exit early when possible.
return true if @openElements[-1].name == target
@ -202,7 +202,7 @@ class TreeBuilder
@document.appendChild(@doctypeClass.new(name))
end
def insertComment(data, parent = nil)
def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data))
end
@ -253,7 +253,7 @@ class TreeBuilder
return element
end
def insertText(data, parent = nil)
def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
@ -296,7 +296,7 @@ class TreeBuilder
return fosterParent, insertBefore
end
def generateImpliedEndTags(exclude = nil)
def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@ -324,7 +324,7 @@ class TreeBuilder
raise NotImplementedError
end
end
end
end
end
end
end
end

View file

@ -3,10 +3,10 @@ require 'hpricot'
require 'forwardable'
module HTML5lib
module TreeBuilders
module Hpricot
module TreeBuilders
module Hpricot
class Node < Base::Node
class Node < Base::Node
extend Forwardable
@ -35,7 +35,7 @@ class Node < Base::Node
node.parent = nil
end
def insertText(data, before = nil)
def insertText(data, before=nil)
if before
insertBefore(TextNode.new(data), before)
else
@ -55,9 +55,9 @@ class Node < Base::Node
def hasContent
childNodes.any?
end
end
end
class Element < Node
class Element < Node
def self.hpricot_class
::Hpricot::Elem
end
@ -89,13 +89,16 @@ class Element < Node
def initialize(hpricot)
@hpricot = hpricot
end
# Store value +v+ under key +k+ on the wrapped element's start tag, going
# through whichever accessor stag_attributes_method selects.
def []=(k, v)
  accessor = stag_attributes_method
  @hpricot.stag.send(accessor)[k] = v
end
# Pick the name of the STag accessor to use for attributes.
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5, so the
# wrapped element's start tag is probed and the name it answers to is returned.
def stag_attributes_method
  stag = @hpricot.stag
  if stag.respond_to?(:raw_attributes)
    :raw_attributes
  else
    :attributes
  end
end
# Forward any message this object does not define itself to the wrapped
# element's +attributes+, passing along every argument and the block.
def method_missing(*a, &b)
@hpricot.attributes.send(*a, &b)
end
@ -109,7 +112,7 @@ class Element < Node
attrs.each { |name, value| @hpricot[name] = value }
end
def printTree(indent = 0)
def printTree(indent=0)
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
attributes.each do |name, value|
@ -118,9 +121,9 @@ class Element < Node
end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end
end
end
class Document < Node
class Document < Node
def self.hpricot_class
::Hpricot::Doc
end
@ -129,12 +132,12 @@ class Document < Node
super(nil)
end
def printTree(indent = 0)
def printTree(indent=0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end
end
end
class DocumentType < Node
class DocumentType < Node
def self.hpricot_class
::Hpricot::DocType
end
@ -148,42 +151,42 @@ class DocumentType < Node
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end
def printTree(indent = 0)
def printTree(indent=0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
end
class DocumentFragment < Element
class DocumentFragment < Element
def initialize
super('')
end
def printTree(indent = 0)
def printTree(indent=0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end
end
end
class TextNode < Node
class TextNode < Node
def initialize(data)
@hpricot = ::Hpricot::Text.new(data)
end
def printTree(indent = 0)
def printTree(indent=0)
"\n|#{' ' * indent}\"#{hpricot.content}\""
end
end
end
class CommentNode < Node
class CommentNode < Node
def self.hpricot_class
::Hpricot::Comment
end
def printTree(indent = 0)
def printTree(indent=0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end
end
end
class TreeBuilder < Base::TreeBuilder
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@ -204,8 +207,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super
return @document.hpricot.children
end
end
end
end
end
end
end
end

View file

@ -3,10 +3,10 @@ require 'rexml/document'
require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
module TreeBuilders
module REXMLTree
class Node < Base::Node
class Node < Base::Node
extend Forwardable
def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj
@ -58,9 +58,9 @@ class Node < Base::Node
def hasContent
return (childNodes.length > 0)
end
end
end
class Element < Node
class Element < Node
def self.rxclass
REXML::Element
end
@ -76,7 +76,7 @@ class Element < Node
end
def attributes= value
value.each {|name,value| rxobj.attributes[name]=value}
value.each {|name, value| rxobj.attributes[name]=value}
end
def printTree indent=0
@ -91,9 +91,9 @@ class Element < Node
end
return tree
end
end
end
class Document < Node
class Document < Node
def self.rxclass
REXML::Document
end
@ -116,9 +116,9 @@ class Document < Node
end
return tree
end
end
end
class DocumentType < Node
class DocumentType < Node
def self.rxclass
REXML::DocType
end
@ -126,9 +126,9 @@ class DocumentType < Node
def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
end
end
end
class DocumentFragment < Element
class DocumentFragment < Element
def initialize
super nil
end
@ -140,9 +140,9 @@ class DocumentFragment < Element
end
return tree
end
end
end
class TextNode < Node
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
@ -151,9 +151,9 @@ class TextNode < Node
def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\""
end
end
end
class CommentNode < Node
class CommentNode < Node
def self.rxclass
REXML::Comment
end
@ -161,9 +161,9 @@ class CommentNode < Node
def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end
end
end
class TreeBuilder < Base::TreeBuilder
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@ -184,8 +184,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super
return @document.rxobj.children
end
end
end
end
end
end
end
end

View file

@ -1,10 +1,10 @@
require 'html5lib/treebuilders/base'
module HTML5lib
module TreeBuilders
module SimpleTree
module TreeBuilders
module SimpleTree
class Node < Base::Node
class Node < Base::Node
# Node representing an item in the tree.
# name - The tag name associated with the node
attr_accessor :name
@ -74,9 +74,9 @@ class Node < Base::Node
def hasContent
return (childNodes.length > 0)
end
end
end
class Element < Node
class Element < Node
def to_s
"<%s>" % name
end
@ -92,9 +92,9 @@ class Element < Node
end
return tree
end
end
end
class Document < Node
class Document < Node
def to_s
"#document"
end
@ -110,15 +110,15 @@ class Document < Node
end
return tree
end
end
end
class DocumentType < Node
class DocumentType < Node
def to_s
"<!DOCTYPE %s>" % name
end
end
end
class DocumentFragment < Element
class DocumentFragment < Element
def initialize
super nil
end
@ -130,9 +130,9 @@ class DocumentFragment < Element
end
return tree
end
end
end
class TextNode < Node
class TextNode < Node
def initialize value
super nil
@value = value
@ -141,9 +141,9 @@ class TextNode < Node
def to_s
'"%s"' % value
end
end
end
class CommentNode < Node
class CommentNode < Node
def initialize value
super nil
@value = value
@ -152,9 +152,9 @@ class CommentNode < Node
def to_s
"<!-- %s -->" % value
end
end
end
class TreeBuilder < Base::TreeBuilder
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@ -171,8 +171,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super
return @document.childNodes
end
end
end
end
end
end
end
end

View file

@ -9,3 +9,15 @@ $:.unshift File.dirname(__FILE__)
# List the paths matching "*.*" inside HTML5LIB_BASE/tests/<subdirectory>.
def html5lib_test_files(subdirectory)
  pattern = File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')
  Dir.glob(pattern)
end
begin
  require 'jsonx'
rescue LoadError
  # Minimal stand-in defined only when `jsonx` cannot be loaded.
  class JSON
    # Parse a JSON document by rewriting it into a Ruby literal and
    # evaluating the result.
    #
    # The rewriting happens on a copy, so the caller's string is left
    # untouched and frozen strings are accepted.
    #
    # SECURITY: the rewritten text is handed to +eval+; feed this only
    # trusted input (such as bundled test fixtures), never external data.
    def self.parse(json)
      # "key": value  ->  "key"=> value
      ruby_source = json.gsub(/"\s*:/, '"=>')
      # Replace \uXXXX escapes with the corresponding UTF-8 character.
      ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) do |escape|
        [escape[2..-1].to_i(16)].pack('U')
      end
      eval ruby_source
    end
  end
end

View file

@ -11,7 +11,7 @@ begin
def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.charEncoding.downcase
assert_equal 'big5', stream.char_encoding.downcase
end
end
rescue LoadError
@ -28,7 +28,7 @@ end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.charEncoding.downcase, input
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
end

View file

@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
# Numeric character references above U+FFFF must come out of the sanitizer
# as the corresponding UTF-8 encoded characters.
def test_should_handle_astral_plane_characters
  markup = "<p>&#x1d4b5; &#x1d538;</p>"
  expected = "<p>\360\235\222\265 \360\235\224\270</p>"
  assert_equal expected, sanitize_html(markup)
end
end

View file

@ -4,18 +4,6 @@ require 'html5lib/tokenizer'
require 'tokenizer_test_parser'
begin
  require 'jsonx'
rescue LoadError
  # Minimal stand-in defined only when `jsonx` cannot be loaded.
  class JSON
    # Parse a JSON document by rewriting it into a Ruby literal and
    # evaluating the result.
    #
    # The rewriting happens on a copy, so the caller's string is left
    # untouched and frozen strings are accepted.
    #
    # SECURITY: the rewritten text is handed to +eval+; feed this only
    # trusted input (such as bundled test fixtures), never external data.
    def self.parse(json)
      # "key": value  ->  "key"=> value
      ruby_source = json.gsub(/"\s*:/, '"=>')
      # Replace \uXXXX escapes with the corresponding UTF-8 character.
      ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) do |escape|
        [escape[2..-1].to_i(16)].pack('U')
      end
      eval ruby_source
    end
  end
end
class Html5TokenizerTestCase < Test::Unit::TestCase
def type_of?(token_name, token)