HTML5lib is Back.
Synced with the latest version of HTML5lib, which fixes a problem with astral-plane characters. I should really run some proper tests, but the HTML5lib sanitizer seems to be 2-5 times slower than the old sanitizer.
This commit is contained in:
parent
e1a6827f1f
commit
4dd70af5ae
39 changed files with 4843 additions and 5576 deletions
|
@ -16,126 +16,126 @@ require 'html5lib/constants'
|
|||
|
||||
module HTML5lib
|
||||
|
||||
# liberal XML parser
|
||||
class XMLParser < HTMLParser
|
||||
# liberal XML parser
|
||||
class XMLParser < HTMLParser
|
||||
|
||||
def initialize(options={})
|
||||
super options
|
||||
@phases[:initial] = XmlRootPhase.new(self, @tree)
|
||||
def initialize(options = {})
|
||||
super options
|
||||
@phases[:initial] = XmlRootPhase.new(self, @tree)
|
||||
end
|
||||
|
||||
def normalizeToken(token)
|
||||
if token[:type] == :StartTag or token[:type] == :EmptyTag
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
if token[:type] == :StartTag or token[:type] == :EmptyTag
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
token[:data] = Hash[*token[:data].reverse.flatten]
|
||||
token[:data] = Hash[*token[:data].reverse.flatten]
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token[:type] == :EmptyTag
|
||||
@phase.processStartTag(token[:name], token[:data])
|
||||
token[:data] = {}
|
||||
token[:type] = :EndTag
|
||||
end
|
||||
|
||||
elsif token[:type] == :EndTag
|
||||
if token[:data]
|
||||
parseError(_("End tag contains unexpected attributes."))
|
||||
end
|
||||
|
||||
elsif token[:type] == :Comment
|
||||
# Rescue CDATA from the comments
|
||||
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
||||
token[:type] = :Characters
|
||||
token[:data] = token[:data][7 ... -2]
|
||||
end
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token[:type] == :EmptyTag
|
||||
@phase.processStartTag(token[:name], token[:data])
|
||||
token[:data] = {}
|
||||
token[:type] = :EndTag
|
||||
end
|
||||
|
||||
return token
|
||||
elsif token[:type] == :EndTag
|
||||
if token[:data]
|
||||
parseError(_("End tag contains unexpected attributes."))
|
||||
end
|
||||
|
||||
elsif token[:type] == :Comment
|
||||
# Rescue CDATA from the comments
|
||||
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
||||
token[:type] = :Characters
|
||||
token[:data] = token[:data][7 ... -2]
|
||||
end
|
||||
end
|
||||
|
||||
return token
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# liberal XHTML parser
|
||||
class XHTMLParser < XMLParser
|
||||
# liberal XHTML parser
|
||||
class XHTMLParser < XMLParser
|
||||
|
||||
def initialize(options={})
|
||||
super options
|
||||
@phases[:initial] = InitialPhase.new(self, @tree)
|
||||
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
|
||||
def initialize(options = {})
|
||||
super options
|
||||
@phases[:initial] = InitialPhase.new(self, @tree)
|
||||
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
|
||||
end
|
||||
|
||||
def normalizeToken(token)
|
||||
super(token)
|
||||
super(token)
|
||||
|
||||
# ensure that non-void XHTML elements have content so that separate
|
||||
# open and close tags are emitted
|
||||
if token[:type] == :EndTag and \
|
||||
not VOID_ELEMENTS.include? token[:name] and \
|
||||
token[:name] == @tree.openElements[-1].name and \
|
||||
not @tree.openElements[-1].hasContent
|
||||
@tree.insertText('') unless
|
||||
@tree.openElements.any? {|e|
|
||||
e.attributes.keys.include? 'xmlns' and
|
||||
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
|
||||
}
|
||||
end
|
||||
# ensure that non-void XHTML elements have content so that separate
|
||||
# open and close tags are emitted
|
||||
if token[:type] == :EndTag and \
|
||||
not VOID_ELEMENTS.include? token[:name] and \
|
||||
token[:name] == @tree.openElements[-1].name and \
|
||||
not @tree.openElements[-1].hasContent
|
||||
@tree.insertText('') unless
|
||||
@tree.openElements.any? {|e|
|
||||
e.attributes.keys.include? 'xmlns' and
|
||||
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
|
||||
}
|
||||
end
|
||||
|
||||
return token
|
||||
return token
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class XhmlRootPhase < RootElementPhase
|
||||
class XhmlRootPhase < RootElementPhase
|
||||
def insertHtmlElement
|
||||
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
|
||||
@tree.openElements.push(element)
|
||||
@tree.document.appendChild(element)
|
||||
@parser.phase = @parser.phases[:beforeHead]
|
||||
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
|
||||
@tree.openElements.push(element)
|
||||
@tree.document.appendChild(element)
|
||||
@parser.phase = @parser.phases[:beforeHead]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class XmlRootPhase < Phase
|
||||
class XmlRootPhase < Phase
|
||||
# Prime the Xml parser
|
||||
@start_tag_handlers = Hash.new(:startTagOther)
|
||||
@end_tag_handlers = Hash.new(:endTagOther)
|
||||
def startTagOther(name, attributes)
|
||||
@tree.openElements.push(@tree.document)
|
||||
element = @tree.createElement(name, attributes)
|
||||
@tree.openElements[-1].appendChild(element)
|
||||
@tree.openElements.push(element)
|
||||
@parser.phase = XmlElementPhase.new(@parser,@tree)
|
||||
@tree.openElements.push(@tree.document)
|
||||
element = @tree.createElement(name, attributes)
|
||||
@tree.openElements[-1].appendChild(element)
|
||||
@tree.openElements.push(element)
|
||||
@parser.phase = XmlElementPhase.new(@parser,@tree)
|
||||
end
|
||||
def endTagOther(name)
|
||||
super
|
||||
@tree.openElements.pop
|
||||
super
|
||||
@tree.openElements.pop
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class XmlElementPhase < Phase
|
||||
class XmlElementPhase < Phase
|
||||
# Generic handling for all XML elements
|
||||
|
||||
@start_tag_handlers = Hash.new(:startTagOther)
|
||||
@end_tag_handlers = Hash.new(:endTagOther)
|
||||
|
||||
def startTagOther(name, attributes)
|
||||
element = @tree.createElement(name, attributes)
|
||||
@tree.openElements[-1].appendChild(element)
|
||||
@tree.openElements.push(element)
|
||||
element = @tree.createElement(name, attributes)
|
||||
@tree.openElements[-1].appendChild(element)
|
||||
@tree.openElements.push(element)
|
||||
end
|
||||
|
||||
def endTagOther(name)
|
||||
for node in @tree.openElements.reverse
|
||||
if node.name == name
|
||||
{} while @tree.openElements.pop != node
|
||||
break
|
||||
else
|
||||
@parser.parseError
|
||||
end
|
||||
for node in @tree.openElements.reverse
|
||||
if node.name == name
|
||||
{} while @tree.openElements.pop != node
|
||||
break
|
||||
else
|
||||
@parser.parseError
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def processCharacters(data)
|
||||
@tree.insertText(data)
|
||||
@tree.insertText(data)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue