a6429f8c22
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
158 lines
4.4 KiB
Ruby
Executable file
158 lines
4.4 KiB
Ruby
Executable file
# Warning: this module is experimental and subject to change and even removal
|
|
# at any time.
|
|
#
|
|
# For background/rationale, see:
|
|
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
|
# * http://tinyurl.com/ylfj8k (and follow-ups)
|
|
#
|
|
# References:
|
|
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
|
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
|
#
|
|
# @@TODO:
|
|
# * Selectively lowercase only XHTML, but not foreign markup
|
|
require 'html5/html5parser'
|
|
require 'html5/constants'
|
|
|
|
module HTML5
|
|
|
|
# Liberal XML parser.
#
# Behaves like HTMLParser except that the :initial phase is replaced by
# XmlRootPhase and tokens are normalized by #normalize_token below before
# they are handed on.
class XMLParser < HTMLParser

  # options are passed straight through to HTMLParser#initialize.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes +token+ (a Hash with at least a :type entry) in place and
  # returns it.
  #
  # * :StartTag / :EmptyTag - attribute pairs are collapsed into a Hash; an
  #   :EmptyTag is processed as a start tag right away and then turned into
  #   the matching :EndTag.
  # * :Characters - markup escapes are undone while the tokenizer is in the
  #   :CDATA content model.
  # * :EndTag - a parse error is reported if the end tag carries attributes.
  # * :Comment - a comment of the form "[CDATA[...]]" is turned back into
  #   character data.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # We need to remove the duplicate attributes and convert attributes
      # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # The list is reversed so that the first occurrence is assigned last
      # and therefore wins.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        # Processing the start tag must not leave the tokenizer in a
        # different content model, so the flag is saved and restored.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # un-escape RCDATA_ELEMENTS (e.g. style, script)
      if @tokenizer.content_model_flag == :CDATA
        # '&amp;' is replaced last so that "&amp;lt;" becomes "&lt;" and is
        # not un-escaped a second time.
        token[:data] = token[:data].
          gsub('&lt;', '<').gsub('&gt;', '>').gsub('&amp;', '&')
      end

    when :EndTag
      # An empty attribute collection is truthy in Ruby, so test for
      # emptiness explicitly instead of relying on truthiness.
      attributes = token[:data]
      if attributes and not attributes.empty?
        parse_error("attributes-in-end-tag")
      end

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
|
|
|
|
# Liberal XHTML parser.
#
# Extends XMLParser: the :initial phase is the regular InitialPhase again
# and the :rootElement phase is XhmlRootPhase.
class XHTMLParser < XMLParser

  # options are passed straight through to XMLParser#initialize.
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts :EndTag tokens.
  # The token is modified in place and returned.
  #
  # * End tag of a void element that is not the current open element:
  #   the token becomes an :EmptyTag (with :data defaulted to {}).
  # * End tag of a non-void element that is the current open element and
  #   has no content: an empty text node is inserted, unless some open
  #   element declares an xmlns other than the XHTML namespace.
  def normalize_token(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    if token[:type] == :EndTag
      # nil when there are no open elements
      current = @tree.open_elements[-1]

      if VOID_ELEMENTS.include? token[:name]
        # Tokens are keyed by symbols throughout (:type, :name, :data).
        if current.nil? or current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      elsif current && token[:name] == current.name && !current.hasContent
        in_foreign_markup = @tree.open_elements.any? {|e|
          e.attributes.keys.include?('xmlns') &&
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
        }
        @tree.insertText('') unless in_foreign_markup
      end
    end

    return token
  end
end
|
|
|
|
# Root element phase used by XHTMLParser: the html element it inserts
# carries the XHTML namespace as its xmlns attribute.
class XhmlRootPhase < RootElementPhase
  # Creates the html element, pushes it onto the open elements, appends it
  # to the document and switches the parser to the :beforeHead phase.
  def insert_html_element
    xhtml_attributes = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html = @tree.createElement("html", xhtml_attributes)

    @tree.open_elements.push(html)
    @tree.document.appendChild(html)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
|
|
|
# Prime the Xml parser.
#
# Every start tag is routed to #startTagOther and every end tag to
# #endTagOther through the default values of the two handler tables.
class XmlRootPhase < Phase
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Puts the document on the open-element stack, creates the element for
  # the start tag beneath it, and hands the parser over to a fresh
  # XmlElementPhase.
  def startTagOther(name, attributes)
    stack = @tree.open_elements
    stack.push(@tree.document)

    root = @tree.createElement(name, attributes)
    stack[-1].appendChild(root)
    stack.push(root)

    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited end-tag handling, then pops one open element.
  def endTagOther(name)
    super(name)
    @tree.open_elements.pop
  end
end
|
|
|
|
# Generic handling for all XML elements.
#
# Every start tag is routed to #startTagOther and every end tag to
# #endTagOther through the default values of the two handler tables.
class XmlElementPhase < Phase
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the current open element and makes
  # it the new current open element.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks a reversed snapshot of the open elements (innermost first).
  # Each element whose name does not match reports a parse error; at the
  # first match, open elements are popped up to and including that element
  # and the walk stops.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      popped = @tree.open_elements.pop
      popped = @tree.open_elements.pop while popped != candidate
      break
    end
  end

  # Character data is inserted into the tree as text.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
|
|
|
end
|