Merged with latest trunk.

This commit is contained in:
Jason Blevins 2007-06-04 22:47:59 -04:00
commit aadfb55342
38 changed files with 4839 additions and 4849 deletions

View file

@ -8,7 +8,19 @@ OPTIONS = {
:ip => "0.0.0.0", :ip => "0.0.0.0",
:environment => "production", :environment => "production",
:server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"), :server_root => File.expand_path(File.dirname(__FILE__) + "/../public/"),
:server_type => WEBrick::SimpleServer :server_type => WEBrick::SimpleServer,
:mime_types => WEBrick::HTTPUtils::DefaultMimeTypes.merge({
'avi' => 'video/x-msvideo',
'gz' => 'application/x-gzip',
'js' => 'application/x-javascript',
'nb' => 'application/mathematica',
'pdf' => 'application/pdf',
'svg' => 'application/svg+xml',
'tar' => 'application/x-tar',
'tex' => 'application/x-tex',
'xml' => 'application/xml',
'xslt' => 'application/xslt+xml'
})
} }
ARGV.options do |opts| ARGV.options do |opts|

View file

@ -1,15 +1,15 @@
module HTML5lib module HTML5lib
class EOF < Exception; end class EOF < Exception; end
CONTENT_MODEL_FLAGS = [ CONTENT_MODEL_FLAGS = [
:PCDATA, :PCDATA,
:RCDATA, :RCDATA,
:CDATA, :CDATA,
:PLAINTEXT :PLAINTEXT
] ]
SCOPING_ELEMENTS = %w[ SCOPING_ELEMENTS = %w[
button button
caption caption
html html
@ -18,9 +18,9 @@ SCOPING_ELEMENTS = %w[
table table
td td
th th
] ]
FORMATTING_ELEMENTS = %w[ FORMATTING_ELEMENTS = %w[
a a
b b
big big
@ -34,9 +34,9 @@ FORMATTING_ELEMENTS = %w[
strong strong
tt tt
u u
] ]
SPECIAL_ELEMENTS = %w[ SPECIAL_ELEMENTS = %w[
address address
area area
base base
@ -98,43 +98,43 @@ SPECIAL_ELEMENTS = %w[
tr tr
ul ul
wbr wbr
] ]
SPACE_CHARACTERS = %W[ SPACE_CHARACTERS = %W[
\t \t
\n \n
\x0B \x0B
\x0C \x0C
\x20 \x20
\r \r
] ]
TABLE_INSERT_MODE_ELEMENTS = %w[ TABLE_INSERT_MODE_ELEMENTS = %w[
table table
tbody tbody
tfoot tfoot
thead thead
tr tr
] ]
ASCII_LOWERCASE = ('a'..'z').to_a.join('') ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('') ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9' DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered # Heading elements need to be ordered
HEADING_ELEMENTS = %w[ HEADING_ELEMENTS = %w[
h1 h1
h2 h2
h3 h3
h4 h4
h5 h5
h6 h6
] ]
# XXX What about event-source and command? # XXX What about event-source and command?
VOID_ELEMENTS = %w[ VOID_ELEMENTS = %w[
base base
link link
meta meta
@ -146,10 +146,10 @@ VOID_ELEMENTS = %w[
area area
col col
input input
] ]
# entitiesWindows1252 has to be _ordered_ and needs to have an index. # entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [ ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN 8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED 65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
@ -182,17 +182,17 @@ ENTITIES_WINDOWS1252 = [
65533, # 0x9D UNDEFINED 65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
] ]
private private
def self.U n def self.U n
[n].pack('U') [n].pack('U')
end end
public public
ENTITIES = { ENTITIES = {
"AElig" => U(0xC6), "AElig" => U(0xC6),
"Aacute" => U(0xC1), "Aacute" => U(0xC1),
"Acirc" => U(0xC2), "Acirc" => U(0xC2),
@ -452,9 +452,9 @@ ENTITIES = {
"zeta" => U(0x03B6), "zeta" => U(0x03B6),
"zwj" => U(0x200D), "zwj" => U(0x200D),
"zwnj" => U(0x200C) "zwnj" => U(0x200C)
} }
ENCODINGS = %w[ ENCODINGS = %w[
ansi_x3.4-1968 ansi_x3.4-1968
iso-ir-6 iso-ir-6
ansi_x3.4-1986 ansi_x3.4-1986
@ -671,6 +671,6 @@ ENCODINGS = %w[
windows-1258 windows-1258
tis-620 tis-620
hz-gb-2312 hz-gb-2312
] ]
end end

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase the parser is in once the body has been closed. Comments are attached
# to the first open element; character data and stray tags are parse errors
# that switch back to the inBody phase, which then reprocesses the token.
class AfterBodyPhase < Phase

  handle_end 'html'

  # Comments seen here are appended to the first element on the open-element
  # stack (the <html> element) and not to whatever is currently open.
  def processComment(data)
    root = @tree.openElements[0]
    @tree.insertComment(data, root)
  end

  # Report the characters as a parse error, then reprocess them in inBody.
  def processCharacters(data)
    message = _('Unexpected non-space characters in the after body phase.')
    @parser.parseError(message)
    @parser.phase = @parser.phases[:inBody]
    @parser.phase.processCharacters(data)
  end

  # Report the start tag as a parse error, then reprocess it in inBody.
  def processStartTag(name, attributes)
    message = _("Unexpected start tag token (#{name}) in the after body phase.")
    @parser.parseError(message)
    @parser.phase = @parser.phases[:inBody]
    @parser.phase.processStartTag(name, attributes)
  end

  # </html>: a parse error when parsing innerHTML, otherwise remember the
  # current phase and move on to the trailingEnd phase.
  def endTagHtml(name)
    return @parser.parseError if @parser.innerHTML

    # XXX: This may need to be done, not sure
    # Don't set lastPhase to the current phase but to the inBody phase
    # instead. No need for extra parse errors if there's something after </html>.
    # Try "<!doctype html>X</html>X" for instance.
    @parser.lastPhase = @parser.phase
    @parser.phase = @parser.phases[:trailingEnd]
  end

  # Report any other end tag as a parse error, then reprocess it in inBody.
  def endTagOther(name)
    message = _("Unexpected end tag token (#{name}) in the after body phase.")
    @parser.parseError(message)
    @parser.phase = @parser.phases[:inBody]
    @parser.phase.processEndTag(name)
  end

end
end

View file

@ -0,0 +1,34 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase used after the outermost frameset has been closed. Apart from
# <noframes> and </html>, every token is reported as a parse error and ignored.
class AfterFramesetPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#after3

  handle_start 'html', 'noframes'

  handle_end 'html'

  # Character data is not allowed here: report it and drop it.
  def processCharacters(data)
    message = _('Unexpected non-space characters in the after frameset phase. Ignored.')
    @parser.parseError(message)
  end

  # <noframes> is processed by the inBody phase; the current phase is kept.
  def startTagNoframes(name, attributes)
    body_phase = @parser.phases[:inBody]
    body_phase.processStartTag(name, attributes)
  end

  # Any other start tag is reported and ignored.
  def startTagOther(name, attributes)
    message = _("Unexpected start tag (#{name}) in the after frameset phase. Ignored.")
    @parser.parseError(message)
  end

  # </html>: remember the current phase and switch to trailingEnd.
  def endTagHtml(name)
    @parser.lastPhase = @parser.phase
    @parser.phase = @parser.phases[:trailingEnd]
  end

  # Any other end tag is reported and ignored.
  def endTagOther(name)
    message = _("Unexpected end tag (#{name}) in the after frameset phase. Ignored.")
    @parser.parseError(message)
  end

end
end

View file

@ -0,0 +1,50 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase between the end of the head and the start of the body or frameset.
# <body> and <frameset> open their element explicitly; head-only elements are
# sent back to the inHead phase; everything else implies a <body> element.
class AfterHeadPhase < Phase

  handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

  # End of file: imply <body>, then let the new phase handle EOF.
  def processEOF
    anythingElse
    @parser.phase.processEOF
  end

  # Character data: imply <body>, then reprocess the data in the new phase.
  def processCharacters(data)
    anythingElse
    @parser.phase.processCharacters(data)
  end

  # <body>: insert the element and continue in the inBody phase.
  def startTagBody(name, attributes)
    @tree.insertElement(name, attributes)
    @parser.phase = @parser.phases[:inBody]
  end

  # <frameset>: insert the element and continue in the inFrameset phase.
  def startTagFrameset(name, attributes)
    @tree.insertElement(name, attributes)
    @parser.phase = @parser.phases[:inFrameset]
  end

  # A tag that belongs in the head: report it, switch back to inHead and
  # reprocess the tag there.
  def startTagFromHead(name, attributes)
    message = _("Unexpected start tag (#{name}) that can be in head. Moved.")
    @parser.parseError(message)
    @parser.phase = @parser.phases[:inHead]
    @parser.phase.processStartTag(name, attributes)
  end

  # Any other start tag: imply <body>, then reprocess the tag.
  def startTagOther(name, attributes)
    anythingElse
    @parser.phase.processStartTag(name, attributes)
  end

  # Any end tag: imply <body>, then reprocess the tag.
  def processEndTag(name)
    anythingElse
    @parser.phase.processEndTag(name)
  end

  # Insert an attribute-less <body> element and switch to the inBody phase.
  def anythingElse
    @tree.insertElement('body', {})
    @parser.phase = @parser.phases[:inBody]
  end

end
end

View file

@ -0,0 +1,41 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase before the <head> element exists. Anything other than an explicit
# <head> start tag implies one, after which the token is reprocessed in the
# phase that startTagHead selected.
class BeforeHeadPhase < Phase

  handle_start 'html', 'head'

  handle_end 'html'

  # End of file: imply <head>, then pass EOF on.
  def processEOF
    startTagHead('head', {})
    @parser.phase.processEOF
  end

  # Character data: imply <head>, then reprocess the data.
  def processCharacters(data)
    startTagHead('head', {})
    @parser.phase.processCharacters(data)
  end

  # Insert the head element, record it as the tree's head pointer and switch
  # to the inHead phase.
  def startTagHead(name, attributes)
    @tree.insertElement(name, attributes)
    # The element just inserted is the last entry on the open-element stack.
    @tree.headPointer = @tree.openElements[-1]
    @parser.phase = @parser.phases[:inHead]
  end

  # Any other start tag: imply <head>, then reprocess the tag.
  def startTagOther(name, attributes)
    startTagHead('head', {})
    @parser.phase.processStartTag(name, attributes)
  end

  # </html>: imply <head>, then reprocess the end tag.
  def endTagHtml(name)
    startTagHead('head', {})
    @parser.phase.processEndTag(name)
  end

  # Any other end tag is reported as a parse error and otherwise ignored.
  def endTagOther(name)
    message = _("Unexpected end tag (#{name}) after the (implied) root element.")
    @parser.parseError(message)
  end

end
end

View file

@ -0,0 +1,548 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
# Tag-name dispatch tables for the inBody phase.
# NOTE(review): handle_start / handle_end are defined outside this file.
# Judging by the method names below, a bare name appears to route to
# startTag<Name>/endTag<Name>, a %w() group to a handler named after the joined
# names, and `names => 'Suffix'` to startTag<Suffix>/endTag<Suffix> -- confirm
# against Phase.
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
# endTagHeading is defined in this class but was never registered, so heading
# end tags could not reach it. Register it the same way the start tags are.
handle_end HEADING_ELEMENTS => 'Heading'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
# Set up the phase with the <pre> leading-newline handling switched off.
def initialize(parser, tree)
  super
  # Flag read by processSpaceCharacters: when true, the next run of space
  # characters goes through processSpaceCharactersPre.
  @processSpaceCharactersPre = false
end
# Space-character handler used for the first run of whitespace after a <pre>
# start tag. Drops one leading newline when the current node is a <pre>
# without content, then inserts whatever text remains.
def processSpaceCharactersPre(data)
  # One-shot handler: go back to the default whitespace handling.
  @processSpaceCharactersPre = false
  starts_with_newline = data.length > 0 && data[0] == ?\n
  if starts_with_newline &&
     @tree.openElements[-1].name == 'pre' &&
     !@tree.openElements[-1].hasContent
    data = data[1..-1]
  end
  @tree.insertText(data) unless data.length == 0
end
# Route space characters to the <pre>-aware handler while its flag is set,
# otherwise fall back to the inherited implementation.
def processSpaceCharacters(data)
  return processSpaceCharactersPre(data) if @processSpaceCharactersPre

  super(data)
end
# Insert character data after reconstructing the active formatting elements.
def processCharacters(data)
  # XXX The specification says to do this for every character at the
  # moment, but apparently that doesn't match the real world so we don't
  # do it for space characters.
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertText(data)
end
# <script> and <style> are processed by the inHead phase object; the parser's
# current phase is not changed here.
def startTagScriptStyle(name, attributes)
  head_phase = @parser.phases[:inHead]
  head_phase.processStartTag(name, attributes)
end
# A head-only element seen in the body: report it, then let the inHead phase
# object process the start tag.
def startTagFromHead(name, attributes)
  message = _("Unexpected start tag (#{name}) that belongs in the head. Moved.")
  @parser.parseError(message)
  head_phase = @parser.phases[:inHead]
  head_phase.processStartTag(name, attributes)
end
# A second <body> start tag is always a parse error. When the second entry on
# the open-element stack is the body element, attributes it does not already
# have are copied onto it; otherwise only the innerHTML assertion runs.
def startTagBody(name, attributes)
  @parser.parseError(_('Unexpected start tag (body).'))
  if @tree.openElements.length != 1 && @tree.openElements[1].name == 'body'
    attributes.each do |key, value|
      # Attributes already present on the body element win.
      next if @tree.openElements[1].attributes.has_key?(key)
      @tree.openElements[1].attributes[key] = value
    end
  else
    assert @parser.innerHTML
  end
end
# Block-level start tags: close a <p> that is in scope, insert the element
# and arm the leading-newline handling when the element is <pre>.
def startTagCloseP(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  if name == 'pre'
    @processSpaceCharactersPre = true
  end
end
# <form>: ignored with a parse error while a form pointer is already set;
# otherwise close an open <p>, insert the form and record it as the form
# pointer.
def startTagForm(name, attributes)
  return @parser.parseError('Unexpected start tag (form). Ignored.') if @tree.formPointer

  endTagP('p') if in_scope?('p')
  @tree.insertElement(name, attributes)
  # The element just inserted is the last entry on the open-element stack.
  @tree.formPointer = @tree.openElements[-1]
end
# <li>, <dd>, <dt>: close an open <p>, then walk the open-element stack from
# the top. If an element with a matching stop name is found, everything down
# to and including it is popped. The walk stops early at any special or
# scoping element other than address/div. The new element is then inserted.
def startTagListItem(name, attributes)
  endTagP('p') if in_scope?('p')
  stop_names = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}[name]
  @tree.openElements.reverse.each_with_index do |open_node, depth|
    if stop_names.include?(open_node.name)
      1.upto(depth + 1) { @tree.openElements.pop }
      break
    end
    # Phrasing elements are all non special, non scoping, non
    # formatting elements
    next if ['address', 'div'].include?(open_node.name)
    break if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(open_node.name)
  end
  # Always insert the new list-item element.
  @tree.insertElement(name, attributes)
end
# <plaintext>: close an open <p>, insert the element and switch the
# tokenizer's content model flag to :PLAINTEXT.
def startTagPlaintext(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  @parser.tokenizer.contentModelFlag = :PLAINTEXT
end
# <h1>..<h6>: close an open <p>. If any heading element is in scope, that is a
# parse error and open elements are removed via remove_open_elements_until
# with a block that matches heading elements. The new heading is then inserted.
def startTagHeading(name, attributes)
  endTagP('p') if in_scope?('p')
  heading_in_scope = HEADING_ELEMENTS.find { |heading| in_scope?(heading) }
  if heading_in_scope
    @parser.parseError(_("Unexpected start tag (#{name})."))
    remove_open_elements_until { |open_node| HEADING_ELEMENTS.include?(open_node.name) }
  end
  @tree.insertElement(name, attributes)
end
# <a>: if an 'a' element is already among the active formatting elements,
# report it, run the end-tag handling for 'a' and make sure the old element is
# gone from both lists. Then reconstruct formatting and add the new element.
def startTagA(name, attributes)
  afeAElement = @tree.elementInActiveFormattingElements('a')
  if afeAElement
    @parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
    endTagFormatting('a')
    # endTagFormatting may already have removed it, hence the guards.
    if @tree.openElements.include?(afeAElement)
      @tree.openElements.delete(afeAElement)
    end
    if @tree.activeFormattingElements.include?(afeAElement)
      @tree.activeFormattingElements.delete(afeAElement)
    end
  end
  @tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
# Formatting start tags (b, i, em, ...): reconstruct the active formatting
# elements, then insert the element and record it as a formatting element.
def startTagFormatting(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  addFormattingElement(name, attributes)
end
# <button>: when a button is already in scope, report it, process an implied
# </button> and reprocess this start tag in the parser's current phase.
# Otherwise insert the element and push a Marker onto the active formatting
# elements.
def startTagButton(name, attributes)
  if in_scope?('button')
    message = _('Unexpected start tag (button) implied end tag (button).')
    @parser.parseError(message)
    processEndTag('button')
    @parser.phase.processStartTag(name, attributes)
  else
    @tree.reconstructActiveFormattingElements
    @tree.insertElement(name, attributes)
    @tree.activeFormattingElements.push(Marker)
  end
end
# <marquee>, <object>: reconstruct formatting, insert the element and push a
# Marker onto the active formatting elements.
def startTagMarqueeObject(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
  tree.activeFormattingElements.push(Marker)
end
# <xmp>: reconstruct formatting, insert the element and switch the tokenizer's
# content model flag to :CDATA.
def startTagXmp(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
  @parser.tokenizer.contentModelFlag = :CDATA
end
# <table>: process an implied </p> when a <p> is in scope, insert the table
# and switch to the inTable phase.
def startTagTable(name, attributes)
  if in_scope?('p')
    processEndTag('p')
  end
  @tree.insertElement(name, attributes)
  @parser.phase = @parser.phases[:inTable]
end
# Void elements such as <br> and <img>: reconstruct formatting, insert the
# element and immediately pop it off the open-element stack again.
def startTagVoidFormatting(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
  tree.openElements.pop
end
# <hr>: close an open <p>, insert the element and pop it straight away.
def startTagHr(name, attributes)
  if in_scope?('p')
    endTagP('p')
  end
  @tree.insertElement(name, attributes)
  @tree.openElements.pop
end
# <image> is a parse error and is reprocessed as an <img> start tag with the
# same attributes.
def startTagImage(name, attributes)
  # No really...
  message = _('Unexpected start tag (image). Treated as img.')
  @parser.parseError(message)
  processStartTag('img', attributes)
end
# <input>: reconstruct formatting, insert the element and pop it again.
# The form pointer is consulted but nothing is done with it yet.
def startTagInput(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
  if tree.formPointer
    # XXX Not exactly sure what to do here
    # @tree.openElements[-1].form = @tree.formPointer
  end
  tree.openElements.pop
end
# <isindex>: always a parse error. Unless a form pointer is already set, the
# tag is expanded into a synthetic form/hr/p/label/input/hr sequence. The
# input receives the original attributes plus name="isindex".
#
# The caller's attributes hash is no longer modified; the input's attributes
# are built as a merged copy. The previously unused local `attrs` is gone.
def startTagIsindex(name, attributes)
  @parser.parseError("Unexpected start tag isindex. Don't use it!")
  return if @tree.formPointer
  processStartTag('form', {})
  processStartTag('hr', {})
  processStartTag('p', {})
  processStartTag('label', {})
  # XXX Localization ...
  processCharacters('This is a searchable index. Insert your search keywords here:')
  # Copy rather than mutate: the hash belongs to the caller.
  input_attributes = attributes.merge('name' => 'isindex')
  processStartTag('input', input_attributes)
  processEndTag('label')
  processEndTag('p')
  processStartTag('hr', {})
  processEndTag('form')
end
# <textarea>: insert the element and switch the tokenizer's content model
# flag to :RCDATA.
def startTagTextarea(name, attributes)
  # XXX Form element pointer checking here as well...
  @tree.insertElement(name, attributes)
  tokenizer = @parser.tokenizer
  tokenizer.contentModelFlag = :RCDATA
end
# iframe, noembed noframes, noscript(if scripting enabled):
# insert the element and switch the tokenizer's content model flag to :CDATA.
def startTagCdata(name, attributes)
  @tree.insertElement(name, attributes)
  tokenizer = @parser.tokenizer
  tokenizer.contentModelFlag = :CDATA
end
# <select>: reconstruct formatting, insert the element and switch to the
# inSelect phase.
def startTagSelect(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
  @parser.phase = @parser.phases[:inSelect]
end
# Start tags for elements that should be children of other elements that have
# a different insertion mode; here they are reported and ignored:
# "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript"
def startTagMisplaced(name, attributes)
  message = _("Unexpected start tag (#{name}). Ignored.")
  @parser.parseError(message)
end
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command".
# Their behaviour is not defined yet, so a warning is written to standard
# error and the tag is handled like any other start tag.
#
# The warning previously went through `sys.stderr.write`, a Python leftover
# that raises NameError in Ruby; it now uses STDERR.puts like endTagNew.
def startTagNew(name, attributes)
  STDERR.puts "Warning: Undefined behaviour for start tag #{name}"
  startTagOther(name, attributes)
  #raise NotImplementedError
end
# Any other start tag: reconstruct the active formatting elements and insert
# the element.
def startTagOther(name, attributes)
  tree = @tree
  tree.reconstructActiveFormattingElements
  tree.insertElement(name, attributes)
end
# </p>: generate implied end tags (except for p) when a <p> is in scope,
# report a parse error if the current node is not a <p>, then pop elements
# for as long as a <p> remains in scope.
def endTagP(name)
  if in_scope?('p')
    @tree.generateImpliedEndTags('p')
  end
  if @tree.openElements[-1].name != 'p'
    @parser.parseError('Unexpected end tag (p).')
  end
  while in_scope?('p')
    @tree.openElements.pop
  end
end
# </body>: switch to the afterBody phase. If the second entry on the
# open-element stack is not the body element, the tag is a parse error and is
# ignored. If the current node is not the body, a parse error naming the
# unclosed element is reported before switching.
#
# A one-element stack is now handled explicitly. Previously
# `@tree.openElements[1]` was nil there and `.name` raised NoMethodError;
# startTagBody already guards the same access with a length check.
def endTagBody(name)
  # XXX Need to take open <p> tags into account here. We shouldn't imply
  # </p> but we should not throw a parse error either. Specification is
  # likely to be updated.
  if @tree.openElements.length < 2 || @tree.openElements[1].name != 'body'
    # innerHTML case
    @parser.parseError
    return
  end
  unless @tree.openElements[-1].name == 'body'
    @parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
  end
  @parser.phase = @parser.phases[:afterBody]
end
# </html>: handle it as </body> first, then (unless parsing innerHTML) let the
# parser's current phase process the same end tag.
def endTagHtml(name)
  endTagBody(name)
  return if @parser.innerHTML

  @parser.phase.processEndTag(name)
end
# End tags of block elements (div, ul, pre, ...). Generates implied end tags
# when the element is in scope, reports a parse error when the current node
# is a different element, and removes open elements up to the named one when
# it is in scope.
def endTagBlock(name)
  #Put us back in the right whitespace handling mode
  if name == 'pre'
    @processSpaceCharactersPre = false
  end
  if in_scope?(name)
    @tree.generateImpliedEndTags
  end
  if @tree.openElements[-1].name != name
    @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
  end
  remove_open_elements_until(name) if in_scope?(name)
end
# </form>: treated like a block end tag, after which the tree's form pointer
# is cleared.
def endTagForm(name)
  endTagBlock(name)
  tree = @tree
  tree.formPointer = nil
end
# </li>, </dd>, </dt>: when the element is in scope, generate implied end
# tags (excluding this name) and report a parse error if the current node is
# a different element; then remove open elements up to the named one if it is
# (still) in scope.
def endTagListItem(name)
  # AT Could merge this with the Block case
  if in_scope?(name)
    @tree.generateImpliedEndTags(name)
    if @tree.openElements[-1].name != name
      @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
    end
  end
  if in_scope?(name)
    remove_open_elements_until(name)
  end
end
# </h1>..</h6>: if any heading is in scope, generate implied end tags. A
# current node that is not the named heading is a parse error. Finally, if a
# heading is in scope, open elements are removed via
# remove_open_elements_until with a block that matches heading elements.
def endTagHeading(name)
  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    @tree.generateImpliedEndTags
  end
  if @tree.openElements[-1].name != name
    @parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
  end
  if HEADING_ELEMENTS.any? { |heading| in_scope?(heading) }
    remove_open_elements_until { |open_node| HEADING_ELEMENTS.include?(open_node.name) }
  end
end
# The much-feared adoption agency algorithm
#
# Handles the end tag of a formatting element (name is the tag name). The
# outer loop repeats until one of the `return` statements in steps 1 or 3 is
# reached. Each pass either reports a parse error and stops, simply closes the
# formatting element (step 3), or re-parents the "furthest block" and clones
# the formatting element around its children (steps 5-13).
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while true
# Step 1 paragraph 1
# Stop if there is no such active formatting element, or if it is on the
# open-element stack but an element of that name is not in scope.
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
# The element is active but no longer open: just forget it.
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
# Not the current node: a parse error, but processing continues.
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
# furthestBlock is the first special or scoping element found when scanning
# the open-element stack from afeElement towards the top.
afeIndex = @tree.openElements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
end
end
# Step 3
# No furthest block: remove open elements until afeElement is reached and
# drop the returned element from the active formatting elements.
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
# (No "Step 4" label in the original.) The common ancestor is the element
# immediately before afeElement on the open-element stack.
# NOTE(review): if afeIndex were 0 this would index [-1], i.e. the last open
# element -- confirm that cannot happen.
commonAncestor = @tree.openElements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = @tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
# Skip, and remove from the open-element stack, every element that is not
# an active formatting element.
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
# Step 7.4
if lastNode == furthestBlock
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = @tree.activeFormattingElements.index(node) + 1
end
# Step 7.5
# NOTE(review): `cite` is assigned here but never read in this method.
cite = node.parent
if node.hasContent
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
node = clone
end
# Step 7.6
# Remove lastNode from its parents, if any
lastNode.parent.removeChild(lastNode) if lastNode.parent
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
end
# Step 8
lastNode.parent.removeChild(lastNode) if lastNode.parent
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
# Swap afeElement for its clone in the active formatting elements, at the
# bookmark position clamped to the list length.
@tree.activeFormattingElements.delete(afeElement)
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
# On the open-element stack the clone goes directly after furthestBlock.
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
end
end
# </button>, </marquee>, </object>: generate implied end tags when the element
# is in scope, report a parse error when the current node is a different
# element, and, when in scope, remove open elements up to it and clear the
# active formatting elements.
def endTagButtonMarqueeObject(name)
  if in_scope?(name)
    @tree.generateImpliedEndTags
  end
  if @tree.openElements[-1].name != name
    message = _("Unexpected end tag (#{name}). Expected other end tag first.")
    @parser.parseError(message)
  end
  return unless in_scope?(name)

  remove_open_elements_until(name)
  @tree.clearActiveFormattingElements
end
# This handles elements with end tags in other insertion modes: the tag is
# reported as a parse error and ignored.
def endTagMisplaced(name)
  message = _("Unexpected end tag (#{name}). Ignored.")
  @parser.parseError(message)
end
# This handles elements with no end tag: seeing one is a parse error.
def endTagNone(name)
  message = _("This tag (#{name}) has no end tag")
  @parser.parseError(message)
end
# End tags of noframes, noscript, noembed, textarea, xmp and iframe: pop the
# current node when it is the named element, otherwise report and ignore.
def endTagCdataTextAreaXmp(name)
  if @tree.openElements[-1].name != name
    @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
  else
    @tree.openElements.pop
  end
end
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command":
# warn on standard error, then handle the tag like any other end tag.
def endTagNew(name)
  STDERR.puts("Warning: Undefined behaviour for end tag #{name}")
  endTagOther(name)
  #raise NotImplementedError
end
# Any other end tag. Walks the open-element stack from the top: on the first
# element with a matching name, implied end tags are generated, a mismatch
# with the current node is reported, and elements are removed down to that
# node. The walk stops with a parse error if a special or scoping element is
# met first.
def endTagOther(name)
  # XXX This logic should be moved into the treebuilder
  @tree.openElements.reverse.each do |candidate|
    if candidate.name == name
      @tree.generateImpliedEndTags
      if @tree.openElements[-1].name != name
        @parser.parseError(_("Unexpected end tag (#{name})."))
      end
      remove_open_elements_until { |open_node| open_node == candidate }
      break
    elsif (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(candidate.name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
      break
    end
  end
end
protected
# Insert the element and append it (the new last open element) to the list
# of active formatting elements.
def addFormattingElement(name, attributes)
  @tree.insertElement(name, attributes)
  inserted = @tree.openElements[-1]
  @tree.activeFormattingElements.push(inserted)
end
end
end

View file

@ -0,0 +1,68 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase used while inside a table <caption>. Table-structure tags implicitly
# close the caption and are then reprocessed; other content is handled by
# the inBody phase object.
class InCaptionPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

  handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableElement'

  handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

  # True when a </caption> would be ignored, i.e. no caption is in table scope.
  def ignoreEndTagCaption
    !in_scope?('caption', true)
  end

  # Character data is handled by the inBody phase object.
  def processCharacters(data)
    body_phase = @parser.phases[:inBody]
    body_phase.processCharacters(data)
  end

  # A table-structure start tag: parse error, close the caption, then
  # reprocess the tag unless the implied </caption> was ignored.
  def startTagTableElement(name, attributes)
    @parser.parseError
    #XXX Have to duplicate logic here to find out if the tag is ignored
    caption_ignored = ignoreEndTagCaption
    @parser.phase.processEndTag('caption')
    unless caption_ignored
      @parser.phase.processStartTag(name, attributes)
    end
  end

  # Any other start tag is handled by the inBody phase object.
  def startTagOther(name, attributes)
    body_phase = @parser.phases[:inBody]
    body_phase.processStartTag(name, attributes)
  end

  # </caption>: close the caption and return to the inTable phase, or report
  # a parse error when the tag has to be ignored.
  def endTagCaption(name)
    if ignoreEndTagCaption
      # innerHTML case
      assert @parser.innerHTML
      return @parser.parseError
    end

    # AT this code is quite similar to endTagTable in "InTable"
    @tree.generateImpliedEndTags
    if @tree.openElements[-1].name != 'caption'
      @parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
    end
    remove_open_elements_until('caption')
    @tree.clearActiveFormattingElements
    @parser.phase = @parser.phases[:inTable]
  end

  # </table>: parse error, close the caption, then reprocess the end tag
  # unless the implied </caption> was ignored.
  def endTagTable(name)
    @parser.parseError
    caption_ignored = ignoreEndTagCaption
    @parser.phase.processEndTag('caption')
    unless caption_ignored
      @parser.phase.processEndTag(name)
    end
  end

  # End tags that are reported and ignored in this phase.
  def endTagIgnore(name)
    message = _("Unexpected end tag (#{name}). Ignored.")
    @parser.parseError(message)
  end

  # Any other end tag is handled by the inBody phase object.
  def endTagOther(name)
    body_phase = @parser.phases[:inBody]
    body_phase.processEndTag(name)
  end

end
end

View file

@ -0,0 +1,78 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase used while inside a table cell (<td> or <th>). Table-structure tags
# close the cell first and are then reprocessed; other content is handled by
# the inBody phase object.
class InCellPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

  handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'

  handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'

  handle_end %w( table tbody tfoot thead tr ) => 'Imply'

  # Character data is handled by the inBody phase object.
  def processCharacters(data)
    body_phase = @parser.phases[:inBody]
    body_phase.processCharacters(data)
  end

  # A table-structure start tag: close the open cell and reprocess the tag,
  # or report a parse error when no cell is in table scope.
  def startTagTableOther(name, attributes)
    if in_scope?('td', true) || in_scope?('th', true)
      closeCell
      @parser.phase.processStartTag(name, attributes)
    else
      # innerHTML case
      @parser.parseError
    end
  end

  # Any other start tag is handled by the inBody phase object.
  def startTagOther(name, attributes)
    body_phase = @parser.phases[:inBody]
    body_phase.processStartTag(name, attributes)
  end

  # </td> or </th>: close the cell and go back to the inRow phase, or report
  # a parse error when that cell is not in table scope.
  def endTagTableCell(name)
    unless in_scope?(name, true)
      return @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    @tree.generateImpliedEndTags(name)
    if @tree.openElements[-1].name == name
      @tree.openElements.pop
    else
      @parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.")
      remove_open_elements_until(name)
    end
    @tree.clearActiveFormattingElements
    @parser.phase = @parser.phases[:inRow]
  end

  # End tags that are reported and ignored in this phase.
  def endTagIgnore(name)
    message = _("Unexpected end tag (#{name}). Ignored.")
    @parser.parseError(message)
  end

  # End tags of enclosing table structure: close the cell and reprocess the
  # tag when the named element is in table scope, otherwise a parse error.
  def endTagImply(name)
    if in_scope?(name, true)
      closeCell
      @parser.phase.processEndTag(name)
    else
      # sometimes innerHTML case
      @parser.parseError
    end
  end

  # Any other end tag is handled by the inBody phase object.
  def endTagOther(name)
    body_phase = @parser.phases[:inBody]
    body_phase.processEndTag(name)
  end

  protected

  # Close whichever cell is in table scope, preferring td over th.
  def closeCell
    if in_scope?('td', true)
      endTagTableCell('td')
    elsif in_scope?('th', true)
      endTagTableCell('th')
    end
  end

end
end

View file

@ -0,0 +1,55 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase used while inside a <colgroup>. Only <col> is handled here; any other
# token implicitly closes the column group and is reprocessed, unless the
# implied end tag had to be ignored.
class InColumnGroupPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-column

  handle_start 'html', 'col'

  handle_end 'colgroup', 'col'

  # True when a </colgroup> would be ignored: the current node is <html>.
  def ignoreEndTagColgroup
    @tree.openElements[-1].name == 'html'
  end

  # Character data closes the column group and is then reprocessed.
  def processCharacters(data)
    colgroup_ignored = ignoreEndTagColgroup
    endTagColgroup("colgroup")
    unless colgroup_ignored
      @parser.phase.processCharacters(data)
    end
  end

  # <col>: insert the element and pop it straight away.
  def startTagCol(name, attributes)
    @tree.insertElement(name, attributes)
    @tree.openElements.pop
  end

  # Any other start tag closes the column group and is then reprocessed.
  def startTagOther(name, attributes)
    colgroup_ignored = ignoreEndTagColgroup
    endTagColgroup('colgroup')
    unless colgroup_ignored
      @parser.phase.processStartTag(name, attributes)
    end
  end

  # </colgroup>: pop the column group and return to the inTable phase, or
  # report a parse error when the tag has to be ignored.
  def endTagColgroup(name)
    if ignoreEndTagColgroup
      # innerHTML case
      assert @parser.innerHTML
      return @parser.parseError
    end

    @tree.openElements.pop
    @parser.phase = @parser.phases[:inTable]
  end

  # </col> is always a parse error.
  def endTagCol(name)
    message = _('Unexpected end tag (col). col has no end tag.')
    @parser.parseError(message)
  end

  # Any other end tag closes the column group and is then reprocessed.
  def endTagOther(name)
    colgroup_ignored = ignoreEndTagColgroup
    endTagColgroup('colgroup')
    unless colgroup_ignored
      @parser.phase.processEndTag(name)
    end
  end

end
end

View file

@ -0,0 +1,57 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Phase used while inside a <frameset>. Only frameset, frame and noframes are
# accepted; everything else is reported as a parse error and ignored.
class InFramesetPhase < Phase

  # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

  handle_start 'html', 'frameset', 'frame', 'noframes'

  handle_end 'frameset', 'noframes'

  # Character data is reported and dropped.
  def processCharacters(data)
    message = _('Unexpected characters in the frameset phase. Characters ignored.')
    @parser.parseError(message)
  end

  # Nested <frameset>: insert the element.
  def startTagFrameset(name, attributes)
    @tree.insertElement(name, attributes)
  end

  # <frame>: insert the element and pop it straight away.
  def startTagFrame(name, attributes)
    @tree.insertElement(name, attributes)
    @tree.openElements.pop
  end

  # <noframes> is handled by the inBody phase object.
  def startTagNoframes(name, attributes)
    body_phase = @parser.phases[:inBody]
    body_phase.processStartTag(name, attributes)
  end

  # Any other start tag is reported and ignored.
  def startTagOther(name, attributes)
    message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
    @parser.parseError(message)
  end

  # </frameset>: pop the current node unless it is <html>, then switch to the
  # afterFrameset phase once the current node is no longer a frameset.
  def endTagFrameset(name)
    if @tree.openElements[-1].name != 'html'
      @tree.openElements.pop
    else
      # innerHTML case
      @parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
    end
    if !@parser.innerHTML && @tree.openElements[-1].name != 'frameset'
      # If we're not in innerHTML mode and the current node is not a
      # "frameset" element (anymore) then switch.
      @parser.phase = @parser.phases[:afterFrameset]
    end
  end

  # </noframes> is handled by the inBody phase object.
  def endTagNoframes(name)
    body_phase = @parser.phases[:inBody]
    body_phase.processEndTag(name)
  end

  # Any other end tag is reported and ignored.
  def endTagOther(name)
    message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
    @parser.parseError(message)
  end

end
end

View file

@ -0,0 +1,120 @@
require 'html5lib/html5parser/phase'
module HTML5lib
class InHeadPhase < Phase
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head', 'html', %w( title style script )
def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
@tree.openElements.pop
end
anythingElse
@parser.phase.processEOF
end
def processCharacters(data)
if ['title', 'style', 'script'].include?(@tree.openElements[-1].name)
@tree.insertText(data)
else
anythingElse
@parser.phase.processCharacters(data)
end
end
def startTagHead(name, attributes)
@parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
end
def startTagTitle(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :RCDATA
end
def startTagStyle(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagScript(name, attributes)
#XXX Inner HTML case may be wrong
element = @tree.createElement(name, attributes)
element._flags.push("parser-inserted")
if (@tree.headPointer != nil and
@parser.phase == @parser.phases[:inHead])
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
end
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
def endTagHead(name)
if @tree.openElements[-1].name == 'head'
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (head). Ignored."))
end
@parser.phase = @parser.phases[:afterHead]
end
def endTagHtml(name)
anythingElse
@parser.phase.processEndTag(name)
end
def endTagTitleStyleScript(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
# Any end tag without a dedicated handler is reported as a parse error and
# otherwise ignored.
def endTagOther(name)
  message = _("Unexpected end tag (#{name}). Ignored.")
  @parser.parseError(message)
end
# Leaves the head: when the current node is the head element this behaves
# like an explicit </head>; otherwise the parser is switched straight to the
# :afterHead phase.
def anythingElse
  return endTagHead('head') if @tree.openElements[-1].name == 'head'

  @parser.phase = @parser.phases[:afterHead]
end
protected
# Appends +element+ to the head element when the tree has a head pointer.
# Without one, asserts that the parser is in innerHTML mode and appends the
# element to the current node instead.
def appendToHead(element)
  head = @tree.headPointer
  return head.appendChild(element) unless head.nil?

  assert @parser.innerHTML
  @tree.openElements[-1].appendChild(element)
end
end
end

View file

@ -0,0 +1,87 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object that handles tokens while the parser is in the "in row"
  # state, i.e. after a <tr> start tag has been processed.
  class InRowPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'

    handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'

    # Character tokens are handled by the :inTable phase.
    def processCharacters(data)
      table_phase = @parser.phases[:inTable]
      table_phase.processCharacters(data)
    end

    # <td>/<th>: clear the stack back to the row, insert the cell, switch to
    # the :inCell phase and push a Marker onto the active formatting elements.
    def startTagTableCell(name, attributes)
      clearStackToTableRowContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCell]
      formatting_elements = @tree.activeFormattingElements
      formatting_elements.push(Marker)
    end

    # Table-structure start tags imply </tr>; the token is reprocessed by the
    # resulting phase unless that implied end tag was ignored.
    def startTagTableOther(name, attributes)
      tr_ignored = ignoreEndTagTr
      endTagTr('tr')
      # XXX how are we sure it's always ignored in the innerHTML case?
      return if tr_ignored

      @parser.phase.processStartTag(name, attributes)
    end

    # Every other start tag is handled by the :inTable phase.
    def startTagOther(name, attributes)
      table_phase = @parser.phases[:inTable]
      table_phase.processStartTag(name, attributes)
    end

    # </tr>: close the row and return to the :inTableBody phase, or report a
    # parse error when no <tr> is in table scope.
    def endTagTr(name)
      if ignoreEndTagTr
        # innerHTML case
        assert @parser.innerHTML
        return @parser.parseError
      end

      clearStackToTableRowContext
      @tree.openElements.pop
      @parser.phase = @parser.phases[:inTableBody]
    end

    # </table> implies </tr>.
    def endTagTable(name)
      tr_ignored = ignoreEndTagTr
      endTagTr('tr')
      # Reprocess the current tag if the tr end tag was not ignored
      # XXX how are we sure it's always ignored in the innerHTML case?
      return if tr_ignored

      @parser.phase.processEndTag(name)
    end

    # </tbody>, </tfoot>, </thead>: when the named element is in table scope,
    # imply </tr> and reprocess the end tag; otherwise report a parse error.
    def endTagTableRowGroup(name)
      # innerHTML case
      return @parser.parseError unless in_scope?(name, true)

      endTagTr('tr')
      @parser.phase.processEndTag(name)
    end

    # End tags that are reported as a parse error and otherwise ignored.
    def endTagIgnore(name)
      message = _("Unexpected end tag (#{name}) in the row phase. Ignored.")
      @parser.parseError(message)
    end

    # Every other end tag is handled by the :inTable phase.
    def endTagOther(name)
      table_phase = @parser.phases[:inTable]
      table_phase.processEndTag(name)
    end

    protected

    # XXX unify this with other table helper methods
    # Pops open elements, reporting a parse error for each, until the current
    # node is a <tr> or <html> element.
    def clearStackToTableRowContext
      loop do
        name = @tree.openElements[-1].name
        break if ['tr', 'html'].include?(name)

        @parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
        @tree.openElements.pop
      end
    end

    # True when no <tr> element is in table scope, in which case an end tag
    # for it has to be ignored.
    def ignoreEndTagTr
      !in_scope?('tr', :tableVariant => true)
    end

  end
end

View file

@ -0,0 +1,84 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object that handles tokens while the parser is inside a <select>
  # element.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character tokens are inserted as text through the tree builder.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # <option>: closes an open <option> first, then inserts the new element.
    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.insertElement(name, attributes)
    end

    # <optgroup>: closes an open <option> and then an open <optgroup> before
    # inserting the new element.
    def startTagOptgroup(name, attributes)
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
      @tree.insertElement(name, attributes)
    end

    # A nested <select> start tag is a parse error and is treated like </select>.
    def startTagSelect(name, attributes)
      @parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      # Double quotes are required here so that the tag name is interpolated
      # into the message instead of the literal text '#{name}'.
      @parser.parseError(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # </option>: pops the current node if it is an <option>, otherwise
    # reports a parse error.
    def endTagOption(name)
      if @tree.openElements[-1].name == 'option'
        @tree.openElements.pop
      else
        @parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
        @tree.openElements.pop
      end
      # It also closes </optgroup>
      if @tree.openElements[-1].name == 'optgroup'
        @tree.openElements.pop
      # But nothing else
      else
        @parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # </select>: when a <select> is in table scope, pops elements up to and
    # including it and resets the insertion mode; otherwise reports a parse
    # error.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.resetInsertionMode
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Table-related end tags are always a parse error. When the named element
    # is in table scope the <select> is closed and the end tag is reprocessed
    # by the resulting phase.
    def endTagTableElements(name)
      @parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end

  end
end

View file

@ -0,0 +1,83 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object that handles tokens while the parser is inside a table row
  # group (<tbody>, <thead> or <tfoot>).
  class InTableBodyPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'

    # The 'Ignore' suffix has to match endTagIgnore below: the suffix is
    # turned into a method name that is invoked with send.
    handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'

    # Character tokens are handled by the :inTable phase.
    def processCharacters(data)
      @parser.phases[:inTable].processCharacters(data)
    end

    # <tr>: clear the stack back to the row group, insert the row and switch
    # to the :inRow phase.
    def startTagTr(name, attributes)
      clearStackToTableBodyContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inRow]
    end

    # <td>/<th> directly inside a row group is a parse error; a <tr> is
    # implied and the cell token is reprocessed by the resulting phase.
    def startTagTableCell(name, attributes)
      @parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
      startTagTr('tr', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # Table-structure start tags close the current row group and are then
    # reprocessed by the resulting phase.
    def startTagTableOther(name, attributes)
      if row_group_in_scope?
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processStartTag(name, attributes)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # Every other start tag is handled by the :inTable phase.
    def startTagOther(name, attributes)
      @parser.phases[:inTable].processStartTag(name, attributes)
    end

    # </tbody>, </tfoot>, </thead>: when the named element is in table scope,
    # close it and return to the :inTable phase; otherwise report a parse error.
    def endTagTableRowGroup(name)
      if in_scope?(name, true)
        clearStackToTableBodyContext
        @tree.openElements.pop
        @parser.phase = @parser.phases[:inTable]
      else
        @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
      end
    end

    # </table> closes the current row group and is then reprocessed by the
    # resulting phase.
    def endTagTable(name)
      if row_group_in_scope?
        clearStackToTableBodyContext
        endTagTableRowGroup(@tree.openElements[-1].name)
        @parser.phase.processEndTag(name)
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # End tags that are reported as a parse error and otherwise ignored.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
    end

    # Every other end tag is handled by the :inTable phase.
    def endTagOther(name)
      @parser.phases[:inTable].processEndTag(name)
    end

    protected

    # Pops open elements, reporting a parse error for each, until the current
    # node is a <tbody>, <tfoot>, <thead> or <html> element.
    def clearStackToTableBodyContext
      until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
        @tree.openElements.pop
      end
    end

    # True when a <tbody>, <thead> or <tfoot> element is in table scope.
    # Shared by startTagTableOther and endTagTable.
    def row_group_in_scope?
      %w( tbody thead tfoot ).any? { |tag| in_scope?(tag, true) }
    end

  end
end

View file

@ -0,0 +1,110 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase object that handles tokens while the parser is directly inside a
  # <table> element.
  class InTablePhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-table

    handle_start 'html', 'caption', 'colgroup', 'col', 'table'

    handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'

    handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'

    # Non-space characters in a table are a parse error and are processed by
    # the :inBody phase with the tree's insertFromTable flag switched on.
    def processCharacters(data)
      @parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the character in the "in body" mode
      @parser.phases[:inBody].processCharacters(data)
      @tree.insertFromTable = false
    end

    # <caption>: clear the stack back to the table, push a Marker onto the
    # active formatting elements, insert the element and switch to :inCaption.
    def startTagCaption(name, attributes)
      clearStackToTableContext
      @tree.activeFormattingElements.push(Marker)
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inCaption]
    end

    # <colgroup>: clear the stack back to the table, insert the element and
    # switch to :inColumnGroup.
    def startTagColgroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inColumnGroup]
    end

    # <col> implies a <colgroup>; the token is then reprocessed.
    def startTagCol(name, attributes)
      startTagColgroup('colgroup', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # <tbody>, <tfoot>, <thead>: clear the stack back to the table, insert
    # the element and switch to :inTableBody.
    def startTagRowGroup(name, attributes)
      clearStackToTableContext
      @tree.insertElement(name, attributes)
      @parser.phase = @parser.phases[:inTableBody]
    end

    # <td>, <th>, <tr> imply a <tbody>; the token is then reprocessed.
    def startTagImplyTbody(name, attributes)
      startTagRowGroup('tbody', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # A nested <table> start tag is a parse error that implies </table>; the
    # start tag is reprocessed unless the parser is in innerHTML mode.
    def startTagTable(name, attributes)
      @parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
      @parser.phase.processEndTag('table')
      @parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
    end

    # Any other start tag is a parse error and is processed by the :inBody
    # phase with the tree's insertFromTable flag switched on.
    def startTagOther(name, attributes)
      @parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in
      @tree.insertFromTable = true
      # Process the start tag in the "in body" mode
      @parser.phases[:inBody].processStartTag(name, attributes)
      @tree.insertFromTable = false
    end

    # </table>: when a <table> is in table scope, generate implied end tags,
    # pop elements up to and including the table and reset the insertion
    # mode; otherwise report a parse error.
    def endTagTable(name)
      if in_scope?('table', true)
        @tree.generateImpliedEndTags
        unless @tree.openElements[-1].name == 'table'
          @parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
        end
        remove_open_elements_until('table')
        @parser.resetInsertionMode
      else
        # innerHTML case
        assert @parser.innerHTML
        @parser.parseError
      end
    end

    # End tags that are reported as a parse error and otherwise ignored.
    def endTagIgnore(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Any other end tag is a parse error and is processed by the :inBody
    # phase with the tree's insertFromTable flag switched on.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
      # Make all the special element rearranging voodoo kick in.
      # The flag is set on @tree, matching processCharacters and startTagOther.
      @tree.insertFromTable = true
      # Process the end tag in the "in body" mode
      @parser.phases[:inBody].processEndTag(name)
      @tree.insertFromTable = false
    end

    protected

    def clearStackToTableContext
      # "clear the stack back to a table context"
      until ['table', 'html'].include?(name = @tree.openElements[-1].name)
        @parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
        @tree.openElements.pop
      end
      # When the current node is <html> it's an innerHTML case
    end

  end
end

View file

@ -0,0 +1,49 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # First phase of tree construction: waits for a DOCTYPE.
  class InitialPhase < Phase

    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define this.

    # End of input before any DOCTYPE: report it and let the :rootElement
    # phase handle the EOF.
    def processEOF
      expectedDoctype(_('Unexpected End of file. Expected DOCTYPE.'))
      @parser.phase.processEOF
    end

    # Comments are attached to the document node.
    def processComment(data)
      document = @tree.document
      @tree.insertComment(data, document)
    end

    # Inserts the DOCTYPE (reporting a parse error first when +error+ is set)
    # and moves on to the :rootElement phase.
    def processDoctype(name, error)
      if error
        @parser.parseError(_('Erroneous DOCTYPE.'))
      end
      @tree.insertDoctype(name)
      @parser.phase = @parser.phases[:rootElement]
    end

    # Space characters are inserted as text on the document node.
    def processSpaceCharacters(data)
      document = @tree.document
      @tree.insertText(data, document)
    end

    # Non-space characters before a DOCTYPE: report and reprocess in the
    # :rootElement phase.
    def processCharacters(data)
      expectedDoctype(_('Unexpected non-space characters. Expected DOCTYPE.'))
      @parser.phase.processCharacters(data)
    end

    # A start tag before a DOCTYPE: report and reprocess in the :rootElement
    # phase.
    def processStartTag(name, attributes)
      expectedDoctype(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
      @parser.phase.processStartTag(name, attributes)
    end

    # An end tag before a DOCTYPE: report and reprocess in the :rootElement
    # phase.
    def processEndTag(name)
      expectedDoctype(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
      @parser.phase.processEndTag(name)
    end

    protected

    # Reports +message+ as a parse error and switches the parser to the
    # :rootElement phase.
    def expectedDoctype(message)
      @parser.parseError(message)
      @parser.phase = @parser.phases[:rootElement]
    end

  end
end

View file

@ -0,0 +1,156 @@
module HTML5lib
  # Base class for helper objects that implement each phase of processing.
  #
  # Handler methods should be in the following order (they can be omitted):
  #
  #   * EOF
  #   * Comment
  #   * Doctype
  #   * SpaceCharacters
  #   * Characters
  #   * StartTag
  #     - startTag* methods
  #   * EndTag
  #     - endTag* methods
  #
  class Phase

    # Builds a mapping from tag names to handler method names.
    #
    # Each plain argument is a tag name or an array of tag names; the handler
    # name is +prefix+ followed by the capitalized names joined together. An
    # optional trailing Hash maps a name (or array of names) to an explicit
    # handler suffix.
    #
    # The following example call:
    #
    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
    #
    # ...would return a hash equal to this:
    #
    #   { 'html' => 'startTagHtml',
    #     'base' => 'startTagBaseLinkMeta',
    #     'link' => 'startTagBaseLinkMeta',
    #     'meta' => 'startTagBaseLinkMeta',
    #     'li'   => 'startTagListItem',
    #     'dt'   => 'startTagListItem',
    #     'dd'   => 'startTagListItem' }
    #
    def self.tag_handlers(prefix, *tags)
      mapping = {}
      if tags.last.is_a?(Hash)
        tags.pop.each do |names, handler_method_suffix|
          handler_method = prefix + handler_method_suffix
          Array(names).each { |name| mapping[name] = handler_method }
        end
      end
      tags.each do |names|
        names = Array(names)
        handler_method = prefix + names.map { |name| name.capitalize }.join
        names.each { |name| mapping[name] = handler_method }
      end
      mapping
    end

    # Per-class table of start tag handlers; unknown tags map to
    # 'startTagOther'.
    def self.start_tag_handlers
      @start_tag_handlers ||= Hash.new('startTagOther')
    end

    # Declare what start tags this Phase handles. Can be called more than once.
    #
    # Example usage:
    #
    #   handle_start 'html'
    #   # html start tags will be handled by a method named 'startTagHtml'
    #
    #   handle_start %w( base link meta )
    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
    #
    #   handle_start %w( li dt dd ) => 'ListItem'
    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
    #
    def self.handle_start(*tags)
      start_tag_handlers.update tag_handlers('startTag', *tags)
    end

    # Per-class table of end tag handlers; unknown tags map to 'endTagOther'.
    def self.end_tag_handlers
      @end_tag_handlers ||= Hash.new('endTagOther')
    end

    # Declare what end tags this Phase handles. Behaves like handle_start.
    #
    def self.handle_end(*tags)
      end_tag_handlers.update tag_handlers('endTag', *tags)
    end

    # Stores the parser and tree builder this phase operates on.
    def initialize(parser, tree)
      @parser, @tree = parser, tree
    end

    # Generates implied end tags and reports a parse error when elements are
    # still left open at the end of the input.
    def processEOF
      @tree.generateImpliedEndTags

      if @tree.openElements.length > 2
        @parser.parseError(_('Unexpected end of file. Missing closing tags.'))
      elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
        # This happens for framesets or something?
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
      elsif @parser.innerHTML and @tree.openElements.length > 1
        # XXX This is not what the specification says. Not sure what to do here.
        @parser.parseError(_('XXX innerHTML EOF'))
      end
      # Betting ends.
    end

    def processComment(data)
      # For most phases the following is correct. Where it's not it will be
      # overridden.
      @tree.insertComment(data, @tree.openElements[-1])
    end

    # A DOCTYPE reaching this default handler is reported and ignored.
    def processDoctype(name, error)
      @parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
    end

    # Space characters are inserted as text through the tree builder.
    def processSpaceCharacters(data)
      @tree.insertText(data)
    end

    # Dispatches a start tag to the handler registered for +name+ on this
    # phase's class ('startTagOther' when none is registered).
    def processStartTag(name, attributes)
      send self.class.start_tag_handlers[name], name, attributes
    end

    # Copies attributes that the root element does not have yet from a
    # repeated <html> start tag onto the first open element.
    def startTagHtml(name, attributes)
      if @parser.firstStartTag == false and name == 'html'
        @parser.parseError(_('html needs to be the first start tag.'))
      end
      # XXX Need a check here to see if the first start tag token emitted is
      # this token... If it's not, invoke @parser.parseError.
      attributes.each do |attr, value|
        unless @tree.openElements[0].attributes.has_key?(attr)
          @tree.openElements[0].attributes[attr] = value
        end
      end
      @parser.firstStartTag = false
    end

    # Dispatches an end tag to the handler registered for +name+ on this
    # phase's class ('endTagOther' when none is registered).
    def processEndTag(name)
      send self.class.end_tag_handlers[name], name
    end

    # Message hook wrapped around every parse error string; currently returns
    # the string unchanged.
    def _(string)
      string
    end

    # Raises AssertionError unless +value+ is truthy.
    def assert(value)
      # raise, not throw: throw is for catch/throw control flow and would
      # report an uncaught-throw error instead of the assertion failure.
      raise AssertionError unless value
    end

    # Shorthand for the tree builder's elementInScope.
    def in_scope?(*args)
      @tree.elementInScope(*args)
    end

    # Pops open elements until one named +name+ has been popped or, when no
    # name is given, until the block returns a true value for the popped
    # element. Returns the last element popped.
    def remove_open_elements_until(name=nil)
      finished = false
      until finished
        element = @tree.openElements.pop
        finished = name.nil? ? yield(element) : element.name == name
      end
      return element
    end

  end
end

View file

@ -0,0 +1,43 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase that creates the root <html> element as soon as a token other than
  # a comment or space characters arrives, then hands the token on.
  class RootElementPhase < Phase

    # End of input: create the root element and let the next phase handle EOF.
    def processEOF
      insertHtmlElement
      @parser.phase.processEOF
    end

    # Comments are attached to the document node.
    def processComment(data)
      document = @tree.document
      @tree.insertComment(data, document)
    end

    # Space characters are inserted as text on the document node.
    def processSpaceCharacters(data)
      document = @tree.document
      @tree.insertText(data, document)
    end

    # Non-space characters: create the root element, then reprocess.
    def processCharacters(data)
      insertHtmlElement
      @parser.phase.processCharacters(data)
    end

    # Start tag: records on the parser when it is an <html> tag, creates the
    # root element and reprocesses the tag in the next phase.
    def processStartTag(name, attributes)
      if name == 'html'
        @parser.firstStartTag = true
      end
      insertHtmlElement
      @parser.phase.processStartTag(name, attributes)
    end

    # End tag: create the root element, then reprocess.
    def processEndTag(name)
      insertHtmlElement
      @parser.phase.processEndTag(name)
    end

    # Creates an attribute-less <html> element, pushes it onto the stack of
    # open elements, appends it to the document and switches the parser to
    # the :beforeHead phase.
    def insertHtmlElement
      html = @tree.createElement('html', {})
      @tree.openElements.push(html)
      @tree.document.appendChild(html)
      next_phase = @parser.phases[:beforeHead]
      @parser.phase = next_phase
    end

  end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase used when the end of the input is expected. Comments and space
  # characters are accepted; any other token is a parse error and is
  # reprocessed by the parser's lastPhase.
  class TrailingEndPhase < Phase

    # End of input is what this phase expects, so nothing is done.
    def processEOF
    end

    # Comments are attached to the document node.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Space characters are handled by the parser's lastPhase without
    # switching to it.
    def processSpaceCharacters(data)
      @parser.lastPhase.processSpaceCharacters(data)
    end

    # Non-space characters: report, switch back to lastPhase and reprocess.
    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.lastPhase
      @parser.phase.processCharacters(data)
    end

    # Start tag: report, switch back to lastPhase and reprocess.
    def processStartTag(name, attributes)
      # Double quotes are required so the tag name is interpolated.
      @parser.parseError(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processStartTag(name, attributes)
    end

    # End tag: report, switch back to lastPhase and reprocess.
    def processEndTag(name)
      # Double quotes are required so the tag name is interpolated.
      @parser.parseError(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -3,14 +3,14 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# Provides a unicode stream of characters to the HTMLTokenizer. # Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing # This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking. # incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream class HTMLInputStream
attr_accessor :queue, :charEncoding attr_accessor :queue, :char_encoding
# Initialises the HTMLInputStream. # Initialises the HTMLInputStream.
# #
@ -28,16 +28,16 @@ class HTMLInputStream
def initialize(source, options = {}) def initialize(source, options = {})
@encoding = nil @encoding = nil
@parseMeta = true @parse_meta = true
@chardet = true @chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur # List of where new lines occur
@newLines = [] @new_lines = []
# Raw Stream # Raw Stream
@rawStream = openStream(source) @raw_stream = open_stream(source)
# Encoding Information # Encoding Information
#Number of bytes to use when looking for a meta element with #Number of bytes to use when looking for a meta element with
@ -47,15 +47,15 @@ class HTMLInputStream
@DEFAULT_ENCODING = 'windows-1252' @DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding) if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
@charEncoding = detectEncoding @char_encoding = detect_encoding
else else
@charEncoding = @encoding @char_encoding = @encoding
end end
# Read bytes from stream decoding them into Unicode # Read bytes from stream decoding them into Unicode
uString = @rawStream.read uString = @raw_stream.read
unless @charEncoding == 'utf-8' unless @char_encoding == 'utf-8'
begin begin
require 'iconv' require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0] uString = Iconv.iconv('utf-8', @encoding, uString)[0]
@ -68,7 +68,7 @@ class HTMLInputStream
uString.gsub!("\x00", [0xFFFD].pack('U')) uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream # Convert the unicode string into a list to be used as the data stream
@dataStream = uString @data_stream = uString
@queue = [] @queue = []
@ -79,7 +79,7 @@ class HTMLInputStream
# Produces a file object from source. # Produces a file object from source.
# #
# source can be either a file object, local filename or a string. # source can be either a file object, local filename or a string.
def openStream(source) def open_stream(source)
# Already an IO like object # Already an IO like object
if source.respond_to?(:read) if source.respond_to?(:read)
@stream = source @stream = source
@ -90,24 +90,24 @@ class HTMLInputStream
return @stream return @stream
end end
def detectEncoding def detect_encoding
#First look for a BOM #First look for a BOM
#This will also read past the BOM if present #This will also read past the BOM if present
encoding = detectBOM encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding #If there is no BOM need to look for meta elements with encoding
#information #information
if encoding.nil? and @parseMeta if encoding.nil? and @parse_meta
encoding = detectEncodingMeta encoding = detect_encoding_meta
end end
#Guess with chardet, if available #Guess with chardet, if available
if encoding.nil? and @chardet if encoding.nil? and @chardet
begin begin
require 'rubygems' require 'rubygems'
require 'UniversalDetector' # gem install chardet require 'UniversalDetector' # gem install chardet
buffer = @rawStream.read buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding'] encoding = UniversalDetector::chardet(buffer)['encoding']
@rawStream = openStream(buffer) @raw_stream = open_stream(buffer)
rescue LoadError rescue LoadError
end end
end end
@ -117,10 +117,10 @@ class HTMLInputStream
end end
#Substitute for equivalent encodings: #Substitute for equivalent encodings:
encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'} encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
if encodingSub.has_key?(encoding.downcase) if encoding_sub.has_key?(encoding.downcase)
encoding = encodingSub[encoding.downcase] encoding = encoding_sub[encoding.downcase]
end end
return encoding return encoding
@ -129,8 +129,8 @@ class HTMLInputStream
# Attempts to detect a BOM at the start of the stream. If # Attempts to detect a BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the # an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil # encoding otherwise return nil
def detectBOM def detect_bom
bomDict = { bom_dict = {
"\xef\xbb\xbf" => 'utf-8', "\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le', "\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be', "\xfe\xff" => 'utf-16-be',
@ -139,19 +139,19 @@ class HTMLInputStream
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@rawStream.seek(0) @raw_stream.seek(0)
string = @rawStream.read(4) string = @raw_stream.read(4)
return nil unless string return nil unless string
# Try detecting the BOM using bytes from the string # Try detecting the BOM using bytes from the string
encoding = bomDict[string[0...3]] # UTF-8 encoding = bom_dict[string[0...3]] # UTF-8
seek = 3 seek = 3
unless encoding unless encoding
# Need to detect UTF-32 before UTF-16 # Need to detect UTF-32 before UTF-16
encoding = bomDict[string] # UTF-32 encoding = bom_dict[string] # UTF-32
seek = 4 seek = 4
unless encoding unless encoding
encoding = bomDict[string[0...2]] # UTF-16 encoding = bom_dict[string[0...2]] # UTF-16
seek = 2 seek = 2
end end
end end
@ -159,36 +159,36 @@ class HTMLInputStream
#AT - move this to the caller? #AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
@rawStream.seek(encoding ? seek : 0) @raw_stream.seek(encoding ? seek : 0)
return encoding return encoding
end end
# Report the encoding declared by the meta element # Report the encoding declared by the meta element
def detectEncodingMeta def detect_encoding_meta
parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META)) parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
@rawStream.seek(0) @raw_stream.seek(0)
return parser.getEncoding return parser.get_encoding
end end
def determineNewLines def determine_new_lines
# Looks through the stream to find where new lines occur so # Looks through the stream to find where new lines occur so
# the position method can tell where it is. # the position method can tell where it is.
@newLines.push(0) @new_lines.push(0)
(0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n } (0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
end end
# Returns (line, col) of the current position in the stream. # Returns (line, col) of the current position in the stream.
def position def position
# Generate list of new lines first time around # Generate list of new lines first time around
determineNewLines if @newLines.empty? determine_new_lines if @new_lines.empty?
line = 0 line = 0
tell = @tell tell = @tell
@newLines.each do |pos| @new_lines.each do |pos|
break unless pos < tell break unless pos < tell
line += 1 line += 1
end end
col = tell - @newLines[line-1] - 1 col = tell - @new_lines[line-1] - 1
return [line, col] return [line, col]
end end
@ -205,7 +205,7 @@ class HTMLInputStream
else else
begin begin
@tell += 1 @tell += 1
return @dataStream[@tell - 1].chr return @data_stream[@tell - 1].chr
rescue rescue
return :EOF return :EOF
end end
@ -215,22 +215,22 @@ class HTMLInputStream
# Returns a string of characters from the stream up to but not # Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be # including any character in characters or EOF. characters can be
# any container that supports the in method being called on it. # any container that supports the in method being called on it.
def charsUntil(characters, opposite = false) def chars_until(characters, opposite=false)
charStack = [char] char_stack = [char]
unless charStack[0] == :EOF unless char_stack[0] == :EOF
while (characters.include? charStack[-1]) == opposite while (characters.include? char_stack[-1]) == opposite
unless @queue.empty? unless @queue.empty?
# First from the queue # First from the queue
charStack.push(@queue.shift) char_stack.push(@queue.shift)
break if charStack[-1] == :EOF break if char_stack[-1] == :EOF
else else
# Then the rest # Then the rest
begin begin
charStack.push(@dataStream[@tell].chr) char_stack.push(@data_stream[@tell].chr)
@tell += 1 @tell += 1
rescue rescue
charStack.push(:EOF) char_stack.push(:EOF)
break break
end end
end end
@ -239,14 +239,14 @@ class HTMLInputStream
# Put the character stopped on back to the front of the queue # Put the character stopped on back to the front of the queue
# from where it came. # from where it came.
@queue.insert(0, charStack.pop) @queue.insert(0, char_stack.pop)
return charStack.join('') return char_stack.join('')
end
end end
end
# String-like object with an associated position and various extra methods # String-like object with an associated position and various extra methods
# If the position is ever greater than the string length then an exception is raised # If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String class EncodingBytes < String
attr_accessor :position attr_accessor :position
@ -263,14 +263,14 @@ class EncodingBytes < String
rescue EOF rescue EOF
end end
def currentByte def current_byte
raise EOF if @position >= length raise EOF if @position >= length
return self[@position].chr return self[@position].chr
end end
# Skip past a list of characters # Skip past a list of characters
def skip(chars = SPACE_CHARACTERS) def skip(chars=SPACE_CHARACTERS)
while chars.include?(currentByte) while chars.include?(current_byte)
@position += 1 @position += 1
end end
end end
@ -278,7 +278,7 @@ class EncodingBytes < String
# Look for a sequence of bytes at the start of a string. If the bytes # Look for a sequence of bytes at the start of a string. If the bytes
# are found return true and advance the position to the byte after the # are found return true and advance the position to the byte after the
# match. Otherwise return false and leave the position alone # match. Otherwise return false and leave the position alone
def matchBytes(bytes, lower = false) def match_bytes(bytes, lower=false)
data = self[position ... position+bytes.length] data = self[position ... position+bytes.length]
data.downcase! if lower data.downcase! if lower
rv = (data == bytes) rv = (data == bytes)
@ -288,10 +288,10 @@ class EncodingBytes < String
# Look for the next sequence of bytes matching a given sequence. If # Look for the next sequence of bytes matching a given sequence. If
# a match is found advance the position to the last byte of the match # a match is found advance the position to the last byte of the match
def jumpTo(bytes) def jump_to(bytes)
newPosition = self[position .. -1].index(bytes) new_position = self[position .. -1].index(bytes)
if newPosition if new_position
@position += (newPosition + bytes.length-1) @position += (new_position + bytes.length-1)
return true return true
else else
raise EOF raise EOF
@ -300,15 +300,15 @@ class EncodingBytes < String
# Move the pointer so it points to the next byte in a set of possible # Move the pointer so it points to the next byte in a set of possible
# bytes # bytes
def findNext(byteList) def find_next(byte_list)
until byteList.include?(currentByte) until byte_list.include?(current_byte)
@position += 1 @position += 1
end end
end end
end end
# Mini parser for detecting character encoding from meta elements # Mini parser for detecting character encoding from meta elements
class EncodingParser class EncodingParser
# string - the data to work on for encoding detection # string - the data to work on for encoding detection
def initialize(data) def initialize(data)
@ -317,139 +317,139 @@ class EncodingParser
end end
@@method_dispatch = [ @@method_dispatch = [
['<!--', :handleComment], ['<!--', :handle_comment],
['<meta', :handleMeta], ['<meta', :handle_meta],
['</', :handlePossibleEndTag], ['</', :handle_possible_end_tag],
['<!', :handleOther], ['<!', :handle_other],
['<?', :handleOther], ['<?', :handle_other],
['<', :handlePossibleStartTag] ['<', :handle_possible_start_tag]
] ]
def getEncoding def get_encoding
@data.each do |byte| @data.each do |byte|
keepParsing = true keep_parsing = true
@@method_dispatch.each do |(key, method)| @@method_dispatch.each do |(key, method)|
if @data.matchBytes(key, lower = true) if @data.match_bytes(key, lower = true)
keepParsing = send(method) keep_parsing = send(method)
break break
end end
end end
break unless keepParsing break unless keep_parsing
end end
@encoding = @encoding.strip unless @encoding.nil? @encoding = @encoding.strip unless @encoding.nil?
return @encoding return @encoding
end end
# Skip over comments # Skip over comments
def handleComment def handle_comment
return @data.jumpTo('-->') return @data.jump_to('-->')
end end
def handleMeta def handle_meta
# if we have <meta not followed by a space so just keep going # if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.currentByte) return true unless SPACE_CHARACTERS.include?(@data.current_byte)
#We have a valid meta element we want to search for attributes #We have a valid meta element we want to search for attributes
while true while true
#Try to find the next attribute after the current position #Try to find the next attribute after the current position
attr = getAttribute attr = get_attribute
return true if attr.nil? return true if attr.nil?
if attr[0] == 'charset' if attr[0] == 'charset'
tentativeEncoding = attr[1] tentative_encoding = attr[1]
if HTML5lib.isValidEncoding(tentativeEncoding) if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentativeEncoding @encoding = tentative_encoding
return false return false
end end
elsif attr[0] == 'content' elsif attr[0] == 'content'
contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1])) content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentativeEncoding = contentParser.parse tentative_encoding = content_parser.parse
if HTML5lib.isValidEncoding(tentativeEncoding) if HTML5lib.is_valid_encoding(tentative_encoding)
@encoding = tentativeEncoding @encoding = tentative_encoding
return false return false
end end
end end
end end
end end
def handlePossibleStartTag def handle_possible_start_tag
return handlePossibleTag(false) return handle_possible_tag(false)
end end
def handlePossibleEndTag def handle_possible_end_tag
@data.position+=1 @data.position += 1
return handlePossibleTag(true) return handle_possible_tag(true)
end end
def handlePossibleTag(endTag) def handle_possible_tag(end_tag)
unless ASCII_LETTERS.include?(@data.currentByte) unless ASCII_LETTERS.include?(@data.current_byte)
#If the next byte is not an ascii letter either ignore this #If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to #fragment (possible start tag case) or treat it according to
#handleOther #handleOther
if endTag if end_tag
@data.position -= 1 @data.position -= 1
handleOther handle_other
end end
return true return true
end end
@data.findNext(SPACE_CHARACTERS + ['<', '>']) @data.find_next(SPACE_CHARACTERS + ['<', '>'])
if @data.currentByte == '<' if @data.current_byte == '<'
#return to the first step in the overall "two step" algorithm #return to the first step in the overall "two step" algorithm
#reprocessing the < byte #reprocessing the < byte
@data.position -= 1 @data.position -= 1
else else
#Read all attributes #Read all attributes
{} until getAttribute.nil? {} until get_attribute.nil?
end end
return true return true
end end
def handleOther def handle_other
return @data.jumpTo('>') return @data.jump_to('>')
end end
# Return a name,value pair for the next attribute in the stream, # Return a name,value pair for the next attribute in the stream,
# if one is found, or nil # if one is found, or nil
def getAttribute def get_attribute
@data.skip(SPACE_CHARACTERS + ['/']) @data.skip(SPACE_CHARACTERS + ['/'])
if @data.currentByte == '<' if @data.current_byte == '<'
@data.position -= 1 @data.position -= 1
return nil return nil
elsif @data.currentByte == '>' elsif @data.current_byte == '>'
return nil return nil
end end
attrName = [] attr_name = []
attrValue = [] attr_value = []
spaceFound = false space_found = false
#Step 5 attribute name #Step 5 attribute name
while true while true
if @data.currentByte == '=' and attrName: if @data.current_byte == '=' and attr_name:
break break
elsif SPACE_CHARACTERS.include?(@data.currentByte) elsif SPACE_CHARACTERS.include?(@data.current_byte)
spaceFound = true space_found = true
break break
elsif ['/', '<', '>'].include?(@data.currentByte) elsif ['/', '<', '>'].include?(@data.current_byte)
return [attrName.join(''), ''] return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
attrName.push(@data.currentByte.downcase) attr_name.push(@data.current_byte.downcase)
else else
attrName.push(@data.currentByte) attr_name.push(@data.current_byte)
end end
#Step 6 #Step 6
@data.position += 1 @data.position += 1
end end
#Step 7 #Step 7
if spaceFound if space_found
@data.skip @data.skip
#Step 8 #Step 8
unless @data.currentByte == '=' unless @data.current_byte == '='
@data.position -= 1 @data.position -= 1
return [attrName.join(''), ''] return [attr_name.join(''), '']
end end
end end
#XXX need to advance position in both spaces and value case #XXX need to advance position in both spaces and value case
@ -458,92 +458,93 @@ class EncodingParser
#Step 10 #Step 10
@data.skip @data.skip
#Step 11 #Step 11
if ["'", '"'].include?(@data.currentByte) if ["'", '"'].include?(@data.current_byte)
#11.1 #11.1
quoteChar = @data.currentByte quote_char = @data.current_byte
while true while true
@data.position+=1 @data.position+=1
#11.3 #11.3
if @data.currentByte == quoteChar if @data.current_byte == quote_char
@data.position += 1 @data.position += 1
return [attrName.join(''), attrValue.join('')] return [attr_name.join(''), attr_value.join('')]
#11.4 #11.4
elsif ASCII_UPPERCASE.include?(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
attrValue.push(@data.currentByte.downcase) attr_value.push(@data.current_byte.downcase)
#11.5 #11.5
else else
attrValue.push(@data.currentByte) attr_value.push(@data.current_byte)
end end
end end
elsif ['>', '<'].include?(@data.currentByte) elsif ['>', '<'].include?(@data.current_byte)
return [attrName.join(''), ''] return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
attrValue.push(@data.currentByte.downcase) attr_value.push(@data.current_byte.downcase)
else else
attrValue.push(@data.currentByte) attr_value.push(@data.current_byte)
end end
while true while true
@data.position +=1 @data.position += 1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte) if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
return [attrName.join(''), attrValue.join('')] return [attr_name.join(''), attr_value.join('')]
elsif ASCII_UPPERCASE.include?(@data.currentByte) elsif ASCII_UPPERCASE.include?(@data.current_byte)
attrValue.push(@data.currentByte.downcase) attr_value.push(@data.current_byte.downcase)
else else
attrValue.push(@data.currentByte) attr_value.push(@data.current_byte)
end
end end
end end
end end
end
class ContentAttrParser class ContentAttrParser
def initialize(data) def initialize(data)
@data = data @data = data
end end
def parse def parse
begin begin
#Skip to the first ";" #Skip to the first ";"
@data.position = 0 @data.position = 0
@data.jumpTo(';') @data.jump_to(';')
@data.position += 1 @data.position += 1
@data.skip @data.skip
#Check if the attr name is charset #Check if the attr name is charset
#otherwise return #otherwise return
@data.jumpTo('charset') @data.jump_to('charset')
@data.position += 1 @data.position += 1
@data.skip @data.skip
unless @data.currentByte == '=' unless @data.current_byte == '='
#If there is no = sign keep looking for attrs #If there is no = sign keep looking for attrs
return nil return nil
end end
@data.position += 1 @data.position += 1
@data.skip @data.skip
#Look for an encoding between matching quote marks #Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.currentByte) if ['"', "'"].include?(@data.current_byte)
quoteMark = @data.currentByte quote_mark = @data.current_byte
@data.position += 1 @data.position += 1
oldPosition = @data.position old_position = @data.position
@data.jumpTo(quoteMark) @data.jump_to(quote_mark)
return @data[oldPosition ... @data.position] return @data[old_position ... @data.position]
else else
#Unquoted value #Unquoted value
oldPosition = @data.position old_position = @data.position
begin begin
@data.findNext(SPACE_CHARACTERS) @data.find_next(SPACE_CHARACTERS)
return @data[oldPosition ... @data.position] return @data[old_position ... @data.position]
rescue EOF rescue EOF
#Return the whole remaining value #Return the whole remaining value
return @data[oldPosition .. -1] return @data[old_position .. -1]
end end
end end
rescue EOF rescue EOF
return nil return nil
end end
end end
end end
# Determine if a string is a supported encoding # Determine if a string is a supported encoding
def self.isValidEncoding(encoding) def self.is_valid_encoding(encoding)
(not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip)) (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
end end
end end

View file

@ -16,10 +16,10 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# liberal XML parser # liberal XML parser
class XMLParser < HTMLParser class XMLParser < HTMLParser
def initialize(options={}) def initialize(options = {})
super options super options
@phases[:initial] = XmlRootPhase.new(self, @tree) @phases[:initial] = XmlRootPhase.new(self, @tree)
end end
@ -53,12 +53,12 @@ class XMLParser < HTMLParser
return token return token
end end
end end
# liberal XMTHML parser # liberal XMTHML parser
class XHTMLParser < XMLParser class XHTMLParser < XMLParser
def initialize(options={}) def initialize(options = {})
super options super options
@phases[:initial] = InitialPhase.new(self, @tree) @phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree) @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
@ -82,18 +82,18 @@ class XHTMLParser < XMLParser
return token return token
end end
end end
class XhmlRootPhase < RootElementPhase class XhmlRootPhase < RootElementPhase
def insertHtmlElement def insertHtmlElement
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'}) element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element) @tree.openElements.push(element)
@tree.document.appendChild(element) @tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead] @parser.phase = @parser.phases[:beforeHead]
end end
end end
class XmlRootPhase < Phase class XmlRootPhase < Phase
# Prime the Xml parser # Prime the Xml parser
@start_tag_handlers = Hash.new(:startTagOther) @start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther) @end_tag_handlers = Hash.new(:endTagOther)
@ -108,9 +108,9 @@ class XmlRootPhase < Phase
super super
@tree.openElements.pop @tree.openElements.pop
end end
end end
class XmlElementPhase < Phase class XmlElementPhase < Phase
# Generic handling for all XML elements # Generic handling for all XML elements
@start_tag_handlers = Hash.new(:startTagOther) @start_tag_handlers = Hash.new(:startTagOther)
@ -136,6 +136,6 @@ class XmlElementPhase < Phase
def processCharacters(data) def processCharacters(data)
@tree.insertText(data) @tree.insertText(data)
end end
end end
end end

View file

@ -6,7 +6,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer class HTMLSanitizer < HTMLTokenizer
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt button caption center cite code col colgroup dd del dfn dir div dl dt
@ -144,7 +144,6 @@ class HTMLSanitizer < HTMLTokenizer
else else
yield token yield token
end end
end end
end end
@ -157,7 +156,7 @@ class HTMLSanitizer < HTMLTokenizer
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/ return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = [] clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val| style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
next if val.empty? next if val.empty?
prop.downcase! prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop) if ALLOWED_CSS_PROPERTIES.include?(prop)
@ -174,5 +173,5 @@ class HTMLSanitizer < HTMLTokenizer
style = clean.join(' ') style = clean.join(' ')
end end
end end
end end

View file

@ -3,27 +3,27 @@ require 'html5lib/inputstream'
module HTML5lib module HTML5lib
# This class takes care of tokenizing HTML. # This class takes care of tokenizing HTML.
# #
# * @currentToken # * @currentToken
# Holds the token that is currently being processed. # Holds the token that is currently being processed.
# #
# * @state # * @state
# Holds a reference to the method to be invoked... XXX # Holds a reference to the method to be invoked... XXX
# #
# * @states # * @states
# Holds a mapping between states and methods that implement the state. # Holds a mapping between states and methods that implement the state.
# #
# * @stream # * @stream
# Points to HTMLInputStream object. # Points to HTMLInputStream object.
class HTMLTokenizer class HTMLTokenizer
attr_accessor :contentModelFlag, :currentToken attr_accessor :contentModelFlag, :currentToken
attr_reader :stream attr_reader :stream
# XXX need to fix documentation # XXX need to fix documentation
def initialize(stream, options={}) def initialize(stream, options = {})
@stream = HTMLInputStream.new(stream, options) @stream = HTMLInputStream.new(stream, options)
@states = { @states = {
@ -147,7 +147,7 @@ class HTMLTokenizer
charAsInt = 65533 charAsInt = 65533
end end
if charAsInt <= 0x10FFF if charAsInt <= 0x10FFFF
char = [charAsInt].pack('U') char = [charAsInt].pack('U')
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
@ -261,13 +261,11 @@ class HTMLTokenizer
@state = @states[:data] @state = @states[:data]
end end
# Below are the various tokenizer states worked out. # Below are the various tokenizer states worked out.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of # XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif # documents to figure out what the order of the various if and elsif
# statements should be. # statements should be.
def dataState def dataState
data = @stream.char data = @stream.char
if data == "&" and (@contentModelFlag == :PCDATA or if data == "&" and (@contentModelFlag == :PCDATA or
@ -285,10 +283,10 @@ class HTMLTokenizer
# XXX need to check if we don't need a special "spaces" flag on # XXX need to check if we don't need a special "spaces" flag on
# characters. # characters.
@tokenQueue.push({:type => :SpaceCharacters, :data => @tokenQueue.push({:type => :SpaceCharacters, :data =>
data + @stream.charsUntil(SPACE_CHARACTERS, true)}) data + @stream.chars_until(SPACE_CHARACTERS, true)})
else else
@tokenQueue.push({:type => :Characters, :data => @tokenQueue.push({:type => :Characters, :data =>
data + @stream.charsUntil(["&", "<"])}) data + @stream.chars_until(["&", "<"])})
end end
return true return true
end end
@ -430,7 +428,7 @@ class HTMLTokenizer
emitCurrentToken emitCurrentToken
elsif ASCII_LETTERS.include? data elsif ASCII_LETTERS.include? data
@currentToken[:name] += data +\ @currentToken[:name] += data +\
@stream.charsUntil(ASCII_LETTERS, true) @stream.chars_until(ASCII_LETTERS, true)
elsif data == ">" elsif data == ">"
emitCurrentToken emitCurrentToken
elsif data == "<" elsif data == "<"
@ -450,7 +448,7 @@ class HTMLTokenizer
def beforeAttributeNameState def beforeAttributeNameState
data = @stream.char data = @stream.char
if SPACE_CHARACTERS.include? data if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true) @stream.chars_until(SPACE_CHARACTERS, true)
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected attribute name instead.")}) _("Unexpected end of file. Expected attribute name instead.")})
@ -486,7 +484,7 @@ class HTMLTokenizer
leavingThisState = false leavingThisState = false
elsif ASCII_LETTERS.include? data elsif ASCII_LETTERS.include? data
@currentToken[:data][-1][0] += data +\ @currentToken[:data][-1][0] += data +\
@stream.charsUntil(ASCII_LETTERS, true) @stream.chars_until(ASCII_LETTERS, true)
leavingThisState = false leavingThisState = false
elsif data == ">" elsif data == ">"
# XXX If we emit here the attributes are converted to a dict # XXX If we emit here the attributes are converted to a dict
@ -529,7 +527,7 @@ class HTMLTokenizer
def afterAttributeNameState def afterAttributeNameState
data = @stream.char data = @stream.char
if SPACE_CHARACTERS.include? data if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true) @stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "=" elsif data == "="
@state = @states[:beforeAttributeValue] @state = @states[:beforeAttributeValue]
elsif data == ">" elsif data == ">"
@ -559,7 +557,7 @@ class HTMLTokenizer
def beforeAttributeValueState def beforeAttributeValueState
data = @stream.char data = @stream.char
if SPACE_CHARACTERS.include? data if SPACE_CHARACTERS.include? data
@stream.charsUntil(SPACE_CHARACTERS, true) @stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "\"" elsif data == "\""
@state = @states[:attributeValueDoubleQuoted] @state = @states[:attributeValueDoubleQuoted]
elsif data == "&" elsif data == "&"
@ -597,7 +595,7 @@ class HTMLTokenizer
emitCurrentToken emitCurrentToken
else else
@currentToken[:data][-1][1] += data +\ @currentToken[:data][-1][1] += data +\
@stream.charsUntil(["\"", "&"]) @stream.chars_until(["\"", "&"])
end end
return true return true
end end
@ -614,7 +612,7 @@ class HTMLTokenizer
emitCurrentToken emitCurrentToken
else else
@currentToken[:data][-1][1] += data +\ @currentToken[:data][-1][1] += data +\
@stream.charsUntil(["'", "&"]) @stream.chars_until(["'", "&"])
end end
return true return true
end end
@ -638,17 +636,17 @@ class HTMLTokenizer
emitCurrentToken emitCurrentToken
else else
@currentToken[:data][-1][1] += data + @currentToken[:data][-1][1] += data +
@stream.charsUntil(["&", ">","<"] + SPACE_CHARACTERS) @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
end end
return true return true
end end
def bogusCommentState def bogusCommentState
# Make a new comment token and give it as value all the characters # Make a new comment token and give it as value all the characters
# until the first > or :EOF (charsUntil checks for :EOF automatically) # until the first > or :EOF (chars_until checks for :EOF automatically)
# and emit it. # and emit it.
@tokenQueue.push( @tokenQueue.push(
{:type => :Comment, :data => @stream.charsUntil((">"))}) {:type => :Comment, :data => @stream.chars_until((">"))})
# Eat the character directly after the bogus comment which is either a # Eat the character directly after the bogus comment which is either a
# ">" or an :EOF. # ">" or an :EOF.
@ -690,7 +688,7 @@ class HTMLTokenizer
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
else else
@currentToken[:data] += data + @stream.charsUntil("-") @currentToken[:data] += data + @stream.chars_until("-")
end end
return true return true
end end
@ -706,7 +704,7 @@ class HTMLTokenizer
@state = @states[:data] @state = @states[:data]
else else
@currentToken[:data] += "-" + data +\ @currentToken[:data] += "-" + data +\
@stream.charsUntil("-") @stream.chars_until("-")
# Consume the next character which is either a "-" or an :EOF as # Consume the next character which is either a "-" or an :EOF as
# well so if there's a "-" directly after the "-" we go nicely to # well so if there's a "-" directly after the "-" we go nicely to
# the "comment end state" without emitting a ParseError there. # the "comment end state" without emitting a ParseError there.
@ -849,6 +847,6 @@ class HTMLTokenizer
end end
def _(string); string; end def _(string); string; end
end end
end end

View file

@ -1,5 +1,5 @@
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
def self.getTreeBuilder(name) def self.getTreeBuilder(name)
case name.to_s.downcase case name.to_s.downcase
@ -17,5 +17,5 @@ module TreeBuilders
end end
end end
end end
end end

View file

@ -4,15 +4,15 @@ require 'html5lib/constants'
module HTML5lib module HTML5lib
# The scope markers are inserted when entering buttons, object elements, # The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees. # from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil Marker = nil
module TreeBuilders module TreeBuilders
module Base module Base
class Node class Node
# The parent of the current node (or nil for the document node) # The parent of the current node (or nil for the document node)
attr_accessor :parent attr_accessor :parent
@ -36,7 +36,7 @@ class Node
# Insert data as text in the current node, positioned before the # Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text. # start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore = nil) def insertText(data, insertBefore=nil)
raise NotImplementedError raise NotImplementedError
end end
@ -71,10 +71,10 @@ class Node
def hasContent def hasContent
raise NotImplementedError raise NotImplementedError
end end
end end
# Base treebuilder implementation # Base treebuilder implementation
class TreeBuilder class TreeBuilder
attr_accessor :openElements attr_accessor :openElements
@ -118,7 +118,7 @@ class TreeBuilder
@document = @documentClass.new @document = @documentClass.new
end end
def elementInScope(target, tableVariant = false) def elementInScope(target, tableVariant=false)
# Exit early when possible. # Exit early when possible.
return true if @openElements[-1].name == target return true if @openElements[-1].name == target
@ -202,7 +202,7 @@ class TreeBuilder
@document.appendChild(@doctypeClass.new(name)) @document.appendChild(@doctypeClass.new(name))
end end
def insertComment(data, parent = nil) def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data)) parent.appendChild(@commentClass.new(data))
end end
@ -253,7 +253,7 @@ class TreeBuilder
return element return element
end end
def insertText(data, parent = nil) def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @openElements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name))) if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
@ -296,7 +296,7 @@ class TreeBuilder
return fosterParent, insertBefore return fosterParent, insertBefore
end end
def generateImpliedEndTags(exclude = nil) def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name name = @openElements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude) if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@ -324,7 +324,7 @@ class TreeBuilder
raise NotImplementedError raise NotImplementedError
end end
end end
end end
end end
end end

View file

@ -3,10 +3,10 @@ require 'hpricot'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module Hpricot module Hpricot
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
@ -35,7 +35,7 @@ class Node < Base::Node
node.parent = nil node.parent = nil
end end
def insertText(data, before = nil) def insertText(data, before=nil)
if before if before
insertBefore(TextNode.new(data), before) insertBefore(TextNode.new(data), before)
else else
@ -55,9 +55,9 @@ class Node < Base::Node
def hasContent def hasContent
childNodes.any? childNodes.any?
end end
end end
class Element < Node class Element < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::Elem ::Hpricot::Elem
end end
@ -89,13 +89,16 @@ class Element < Node
def initialize(hpricot) def initialize(hpricot)
@hpricot = hpricot @hpricot = hpricot
end end
def []=(k, v) def []=(k, v)
@hpricot.stag.send(stag_attributes_method)[k] = v @hpricot.stag.send(stag_attributes_method)[k] = v
end end
def stag_attributes_method def stag_attributes_method
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5 # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
end end
def method_missing(*a, &b) def method_missing(*a, &b)
@hpricot.attributes.send(*a, &b) @hpricot.attributes.send(*a, &b)
end end
@ -109,7 +112,7 @@ class Element < Node
attrs.each { |name, value| @hpricot[name] = value } attrs.each { |name, value| @hpricot[name] = value }
end end
def printTree(indent = 0) def printTree(indent=0)
tree = "\n|#{' ' * indent}<#{name}>" tree = "\n|#{' ' * indent}<#{name}>"
indent += 2 indent += 2
attributes.each do |name, value| attributes.each do |name, value|
@ -118,9 +121,9 @@ class Element < Node
end end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) } childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end end
end end
class Document < Node class Document < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::Doc ::Hpricot::Doc
end end
@ -129,12 +132,12 @@ class Document < Node
super(nil) super(nil)
end end
def printTree(indent = 0) def printTree(indent=0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) } childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end end
end end
class DocumentType < Node class DocumentType < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::DocType ::Hpricot::DocType
end end
@ -148,42 +151,42 @@ class DocumentType < Node
@hpricot = ::Hpricot::DocType.new(name, nil, nil) @hpricot = ::Hpricot::DocType.new(name, nil, nil)
end end
def printTree(indent = 0) def printTree(indent=0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>" "\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end end
end end
class DocumentFragment < Element class DocumentFragment < Element
def initialize def initialize
super('') super('')
end end
def printTree(indent = 0) def printTree(indent=0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) } childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end end
end end
class TextNode < Node class TextNode < Node
def initialize(data) def initialize(data)
@hpricot = ::Hpricot::Text.new(data) @hpricot = ::Hpricot::Text.new(data)
end end
def printTree(indent = 0) def printTree(indent=0)
"\n|#{' ' * indent}\"#{hpricot.content}\"" "\n|#{' ' * indent}\"#{hpricot.content}\""
end end
end end
class CommentNode < Node class CommentNode < Node
def self.hpricot_class def self.hpricot_class
::Hpricot::Comment ::Hpricot::Comment
end end
def printTree(indent = 0) def printTree(indent=0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->" "\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end end
end end
class TreeBuilder < Base::TreeBuilder class TreeBuilder < Base::TreeBuilder
def initialize def initialize
@documentClass = Document @documentClass = Document
@doctypeClass = DocumentType @doctypeClass = DocumentType
@ -204,8 +207,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super @document = super
return @document.hpricot.children return @document.hpricot.children
end end
end end
end end
end end
end end

View file

@ -3,10 +3,10 @@ require 'rexml/document'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module REXMLTree module REXMLTree
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
def_delegators :@rxobj, :name, :attributes def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj attr_accessor :rxobj
@ -58,9 +58,9 @@ class Node < Base::Node
def hasContent def hasContent
return (childNodes.length > 0) return (childNodes.length > 0)
end end
end end
class Element < Node class Element < Node
def self.rxclass def self.rxclass
REXML::Element REXML::Element
end end
@ -76,7 +76,7 @@ class Element < Node
end end
def attributes= value def attributes= value
value.each {|name,value| rxobj.attributes[name]=value} value.each {|name, value| rxobj.attributes[name]=value}
end end
def printTree indent=0 def printTree indent=0
@ -91,9 +91,9 @@ class Element < Node
end end
return tree return tree
end end
end end
class Document < Node class Document < Node
def self.rxclass def self.rxclass
REXML::Document REXML::Document
end end
@ -116,9 +116,9 @@ class Document < Node
end end
return tree return tree
end end
end end
class DocumentType < Node class DocumentType < Node
def self.rxclass def self.rxclass
REXML::DocType REXML::DocType
end end
@ -126,9 +126,9 @@ class DocumentType < Node
def printTree indent=0 def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>" "\n|#{' ' * indent}<!DOCTYPE #{name}>"
end end
end end
class DocumentFragment < Element class DocumentFragment < Element
def initialize def initialize
super nil super nil
end end
@ -140,9 +140,9 @@ class DocumentFragment < Element
end end
return tree return tree
end end
end end
class TextNode < Node class TextNode < Node
def initialize data def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;') raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true) @rxobj = REXML::Text.new(raw, true, nil, true)
@ -151,9 +151,9 @@ class TextNode < Node
def printTree indent=0 def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\"" "\n|#{' ' * indent}\"#{rxobj.value}\""
end end
end end
class CommentNode < Node class CommentNode < Node
def self.rxclass def self.rxclass
REXML::Comment REXML::Comment
end end
@ -161,9 +161,9 @@ class CommentNode < Node
def printTree indent=0 def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->" "\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end end
end end
class TreeBuilder < Base::TreeBuilder class TreeBuilder < Base::TreeBuilder
def initialize def initialize
@documentClass = Document @documentClass = Document
@doctypeClass = DocumentType @doctypeClass = DocumentType
@ -184,8 +184,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super @document = super
return @document.rxobj.children return @document.rxobj.children
end end
end end
end end
end end
end end

View file

@ -1,10 +1,10 @@
require 'html5lib/treebuilders/base' require 'html5lib/treebuilders/base'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module SimpleTree module SimpleTree
class Node < Base::Node class Node < Base::Node
# Node representing an item in the tree. # Node representing an item in the tree.
# name - The tag name associated with the node # name - The tag name associated with the node
attr_accessor :name attr_accessor :name
@ -74,9 +74,9 @@ class Node < Base::Node
def hasContent def hasContent
return (childNodes.length > 0) return (childNodes.length > 0)
end end
end end
class Element < Node class Element < Node
def to_s def to_s
"<%s>" % name "<%s>" % name
end end
@ -92,9 +92,9 @@ class Element < Node
end end
return tree return tree
end end
end end
class Document < Node class Document < Node
def to_s def to_s
"#document" "#document"
end end
@ -110,15 +110,15 @@ class Document < Node
end end
return tree return tree
end end
end end
class DocumentType < Node class DocumentType < Node
def to_s def to_s
"<!DOCTYPE %s>" % name "<!DOCTYPE %s>" % name
end end
end end
class DocumentFragment < Element class DocumentFragment < Element
def initialize def initialize
super nil super nil
end end
@ -130,9 +130,9 @@ class DocumentFragment < Element
end end
return tree return tree
end end
end end
class TextNode < Node class TextNode < Node
def initialize value def initialize value
super nil super nil
@value = value @value = value
@ -141,9 +141,9 @@ class TextNode < Node
def to_s def to_s
'"%s"' % value '"%s"' % value
end end
end end
class CommentNode < Node class CommentNode < Node
def initialize value def initialize value
super nil super nil
@value = value @value = value
@ -152,9 +152,9 @@ class CommentNode < Node
def to_s def to_s
"<!-- %s -->" % value "<!-- %s -->" % value
end end
end end
class TreeBuilder < Base::TreeBuilder class TreeBuilder < Base::TreeBuilder
def initialize def initialize
@documentClass = Document @documentClass = Document
@doctypeClass = DocumentType @doctypeClass = DocumentType
@ -171,8 +171,8 @@ class TreeBuilder < Base::TreeBuilder
@document = super @document = super
return @document.childNodes return @document.childNodes
end end
end end
end end
end end
end end

View file

@ -9,3 +9,15 @@ $:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory) def html5lib_test_files(subdirectory)
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')] Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
end end
begin
  require 'jsonx'
rescue LoadError
  # Minimal stand-in defined only when the require above raises LoadError.
  # It rewrites JSON text into a Ruby literal and evaluates it.
  class JSON
    # Parses +json+ (a String of JSON text) and returns the resulting Ruby
    # object (Hash, Array, String, number, true, false or nil).
    #
    # The argument is never modified, so frozen strings and strings the
    # caller still needs are safe to pass.
    #
    # WARNING: the converted text is handed to +eval+. Only feed this
    # trusted input such as local test fixtures, never external data.
    def self.parse json
      # `"key" :` becomes `"key"=>`; non-bang gsub leaves the caller's string intact.
      source = json.gsub(/"\s*:/, '"=>')

      # A \uD8xx\uDCxx surrogate pair encodes one code point above U+FFFF;
      # combine it first so it is not emitted as two invalid lone surrogates.
      source = source.gsub(/\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})/) do
        high = $1.to_i(16)
        low = $2.to_i(16)
        [0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)].pack('U')
      end

      # Remaining \uXXXX escapes map directly to a single code point.
      source = source.gsub(/\\u[0-9a-fA-F]{4}/) { |x| [x[2..-1].to_i(16)].pack('U') }

      # JSON's `null` has no Ruby equivalent, so bind it to nil for the eval.
      eval "null = nil; #{source}"
    end
  end
end

View file

@ -11,7 +11,7 @@ begin
def test_chardet def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file| File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.charEncoding.downcase assert_equal 'big5', stream.char_encoding.downcase
end end
end end
rescue LoadError rescue LoadError
@ -28,7 +28,7 @@ end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.charEncoding.downcase, input assert_equal encoding.downcase, stream.char_encoding.downcase, input
end end
end end
end end

View file

@ -203,4 +203,8 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />)) sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end end
# Hexadecimal character references above U+FFFF (here U+1D4B5 and U+1D538)
# must come out of the sanitizer as their four-byte UTF-8 sequences.
def test_should_handle_astral_plane_characters
  markup = "<p>&#x1d4b5; &#x1d538;</p>"
  expected = "<p>\360\235\222\265 \360\235\224\270</p>"
  assert_equal expected, sanitize_html(markup)
end
end end

View file

@ -4,18 +4,6 @@ require 'html5lib/tokenizer'
require 'tokenizer_test_parser' require 'tokenizer_test_parser'
begin
  require 'jsonx'
rescue LoadError
  # Minimal stand-in defined only when the require above raises LoadError.
  # It rewrites JSON text into a Ruby literal and evaluates it.
  class JSON
    # Parses +json+ (a String of JSON text) and returns the resulting Ruby
    # object (Hash, Array, String, number, true, false or nil).
    #
    # The argument is never modified, so frozen strings and strings the
    # caller still needs are safe to pass.
    #
    # WARNING: the converted text is handed to +eval+. Only feed this
    # trusted input such as local test fixtures, never external data.
    def self.parse json
      # `"key" :` becomes `"key"=>`; non-bang gsub leaves the caller's string intact.
      source = json.gsub(/"\s*:/, '"=>')

      # A \uD8xx\uDCxx surrogate pair encodes one code point above U+FFFF;
      # combine it first so it is not emitted as two invalid lone surrogates.
      source = source.gsub(/\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})/) do
        high = $1.to_i(16)
        low = $2.to_i(16)
        [0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)].pack('U')
      end

      # Remaining \uXXXX escapes map directly to a single code point.
      source = source.gsub(/\\u[0-9a-fA-F]{4}/) { |x| [x[2..-1].to_i(16)].pack('U') }

      # JSON's `null` has no Ruby equivalent, so bind it to nil for the eval.
      eval "null = nil; #{source}"
    end
  end
end
class Html5TokenizerTestCase < Test::Unit::TestCase class Html5TokenizerTestCase < Test::Unit::TestCase
def type_of?(token_name, token) def type_of?(token_name, token)