# Warning: this module is experimental and subject to change and even removal # at any time. # # For background/rationale, see: # * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib # * http://tinyurl.com/ylfj8k (and follow-ups) # # References: # * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html # * http://wiki.whatwg.org/wiki/HtmlVsXhtml # # @@TODO: # * Selectively lowercase only XHTML, but not foreign markup require 'html5lib/html5parser' require 'html5lib/constants' module HTML5lib # liberal XML parser class XMLParser < HTMLParser def initialize(options={}) super options @phases[:initial] = XmlRootPhase.new(self, @tree) end def normalizeToken(token) if token[:type] == :StartTag or token[:type] == :EmptyTag # We need to remove the duplicate attributes and convert attributes # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} token[:data] = Hash[*token[:data].reverse.flatten] # For EmptyTags, process both a Start and an End tag if token[:type] == :EmptyTag @phase.processStartTag(token[:name], token[:data]) token[:data] = {} token[:type] = :EndTag end elsif token[:type] == :EndTag if token[:data] parseError(_("End tag contains unexpected attributes.")) end elsif token[:type] == :Comment # Rescue CDATA from the comments if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]" token[:type] = :Characters token[:data] = token[:data][7 ... -2] end end return token end end # liberal XMTHML parser class XHTMLParser < XMLParser def initialize(options={}) super options @phases[:initial] = InitialPhase.new(self, @tree) @phases[:rootElement] = XhmlRootPhase.new(self, @tree) end def normalizeToken(token) super(token) # ensure that non-void XHTML elements have content so that separate # open and close tags are emitted if token[:type] == :EndTag and \ not VOID_ELEMENTS.include? token[:name] and \ token[:name] == @tree.openElements[-1].name and \ not @tree.openElements[-1].hasContent @tree.insertText('') unless @tree.openElements.any? {|e| e.attributes.keys.include? 'xmlns' and e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' } end return token end end class XhmlRootPhase < RootElementPhase def insertHtmlElement element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'}) @tree.openElements.push(element) @tree.document.appendChild(element) @parser.phase = @parser.phases[:beforeHead] end end class XmlRootPhase < Phase # Prime the Xml parser @start_tag_handlers = Hash.new(:startTagOther) @end_tag_handlers = Hash.new(:endTagOther) def startTagOther(name, attributes) @tree.openElements.push(@tree.document) element = @tree.createElement(name, attributes) @tree.openElements[-1].appendChild(element) @tree.openElements.push(element) @parser.phase = XmlElementPhase.new(@parser,@tree) end def endTagOther(name) super @tree.openElements.pop end end class XmlElementPhase < Phase # Generic handling for all XML elements @start_tag_handlers = Hash.new(:startTagOther) @end_tag_handlers = Hash.new(:endTagOther) def startTagOther(name, attributes) element = @tree.createElement(name, attributes) @tree.openElements[-1].appendChild(element) @tree.openElements.push(element) end def endTagOther(name) for node in @tree.openElements.reverse if node.name == name {} while @tree.openElements.pop != node break else @parser.parseError end end end def processCharacters(data) @tree.insertText(data) end end end