ed68d975df
Fix that Tokenizer bug for real this time.
158 lines
4.4 KiB
Ruby
Executable file
158 lines
4.4 KiB
Ruby
Executable file
# Warning: this module is experimental and subject to change and even removal
# at any time.
#
# For background/rationale, see:
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
# * http://tinyurl.com/ylfj8k (and follow-ups)
#
# References:
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
#
# @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup
require 'html5/html5parser'
|
|
require 'html5/constants'
|
|
|
|
module HTML5
|
|
|
|
# Liberal XML parser.
#
# An HTMLParser whose :initial phase is replaced by XmlRootPhase and whose
# token normalization is overridden (see #normalize_token).
class XMLParser < HTMLParser

  # options is passed through unchanged to HTMLParser#initialize; afterwards
  # the :initial phase is swapped for an XmlRootPhase.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes +token+ (a Hash read via the :type, :name and :data keys) in
  # place, depending on its :type, and returns the same token object.
  #
  # * :StartTag / :EmptyTag -- attribute pairs become a Hash; an :EmptyTag is
  #   additionally dispatched as a start tag and then rewritten to an :EndTag.
  # * :Characters -- entity-escaped text is un-escaped while the tokenizer's
  #   content model flag is :CDATA.
  # * :EndTag -- a parse error is reported if the tag carries attributes.
  # * :Comment -- a comment of the form "[CDATA[...]]" becomes :Characters.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # Remove duplicate attributes and convert the pairs to a Hash. The list
      # is reversed first so the first occurrence of a name wins:
      # [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag: the start tag is
      # dispatched to the current phase right here (restoring the tokenizer's
      # content model flag afterwards), and the token itself is turned into
      # the matching attribute-less end tag.
      if token[:type] == :EmptyTag
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # Un-escape character data while in the :CDATA content model
      # (e.g. style, script). '&amp;' is replaced last so that an escaped
      # ampersand such as '&amp;lt;' decodes to '&lt;' and not to '<'.
      if @tokenizer.content_model_flag == :CDATA
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # An empty Array/Hash is truthy in Ruby, so test for emptiness
      # explicitly; only end tags that really carry attributes are an error.
      attributes = token[:data]
      if attributes && !attributes.empty?
        parse_error("attributes-in-end-tag")
      end

    when :Comment
      # Rescue CDATA from the comments: "[CDATA[text]]" becomes the
      # character token "text".
      if token[:data][0..6] == "[CDATA[" && token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
|
|
|
|
# Liberal XHTML parser.
#
# An XMLParser that puts InitialPhase back as the :initial phase, uses
# XhmlRootPhase for the :rootElement phase, and post-processes end tags
# (see #normalize_token).
class XHTMLParser < XMLParser

  # options is passed through unchanged to XMLParser#initialize.
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts :EndTag tokens.
  # Returns the same (mutated) token object.
  #
  # * End tag of a void element that is not the current open element: the
  #   token is turned back into an :EmptyTag.
  # * End tag of a non-void element that is the current open element and has
  #   no content: an empty text node is inserted so that separate open and
  #   close tags are emitted -- unless some open element declares an xmlns
  #   other than the XHTML namespace.
  def normalize_token(token)
    super(token)

    if token[:type] == :EndTag
      # May be nil when no element is open; guard instead of raising.
      current = @tree.open_elements[-1]

      if VOID_ELEMENTS.include? token[:name]
        # Token keys are Symbols throughout this file; a String key here
        # would always read nil and make this comparison always true.
        if current.nil? || current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      elsif current && token[:name] == current.name && !current.hasContent
        # ensure that non-void XHTML elements have content so that separate
        # open and close tags are emitted
        in_foreign_markup = @tree.open_elements.any? { |e|
          e.attributes.keys.include?('xmlns') &&
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
        }
        @tree.insertText('') unless in_foreign_markup
      end
    end

    return token
  end
end
|
|
|
|
# Root element phase that creates the <html> element with the XHTML
# namespace declared on it.
class XhmlRootPhase < RootElementPhase

  # Creates an "html" element carrying xmlns="http://www.w3.org/1999/xhtml",
  # pushes it onto the tree's open elements, appends it to the document and
  # moves the parser to its :beforeHead phase.
  def insert_html_element
    namespace_attributes = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html_node = @tree.createElement("html", namespace_attributes)

    @tree.open_elements.push(html_node)
    @tree.document.appendChild(html_node)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
|
|
|
# Primes the XML parser: handles the first start tag by creating the root
# element and then hands over to XmlElementPhase.
class XmlRootPhase < Phase

  # Every tag name falls through to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Pushes the document onto the open elements, creates the element for
  # +name+ / +attributes+ beneath it, pushes that element too and switches
  # the parser to a fresh XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.open_elements.push(@tree.document)

    root_element = @tree.createElement(name, attributes)
    @tree.open_elements.last.appendChild(root_element)
    @tree.open_elements.push(root_element)

    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited end-tag handling for +name+, then pops the top entry
  # off the open elements.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
|
|
|
|
# Generic handling for all XML elements.
class XmlElementPhase < Phase

  # Every tag name falls through to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates an element for +name+ / +attributes+, appends it to the current
  # (top-most) open element and makes it the new current element.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements.last.appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks a reversed copy of the open elements (top-most first). Each entry
  # whose name differs from +name+ reports a parse error; at the first entry
  # that matches, open elements are popped up to and including that entry
  # and the walk stops.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      while @tree.open_elements.pop != candidate
        # keep discarding until the matching element itself has been popped
      end
      break
    end
  end

  # Inserts the character +data+ as text into the tree.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
|
|
|
end
|