HTML5lib is Back.
Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer.
This commit is contained in:
parent
e1a6827f1f
commit
4dd70af5ae
39 changed files with 4843 additions and 5576 deletions
|
@ -3,209 +3,212 @@ require 'hpricot'
|
|||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.parent = nil
|
||||
end
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before = nil)
|
||||
if before
|
||||
def insertText(data, before=nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
else
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
def []=(k, v)
|
||||
end
|
||||
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
def stag_attributes_method
|
||||
end
|
||||
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
def method_missing(*a, &b)
|
||||
end
|
||||
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
def printTree(indent=0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue