REXML Trees

Synced with latest HTML5lib.
Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
Jacques Distler 2007-06-05 16:34:49 -05:00
parent 4dd70af5ae
commit bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions

View file

@ -148,6 +148,26 @@ module HTML5lib
input
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN

View file

@ -37,13 +37,13 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# html5lib.treebuilders.getTreeBuilder(treeType)
# HTML5lib::TreeBuilders[treeType]
def initialize(options = {})
@strict = false
@errors = []
@tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXMLTree::TreeBuilder
@tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) }

View file

@ -107,4 +107,4 @@ module HTML5lib
end
end
end
end

View file

@ -153,4 +153,4 @@ module HTML5lib
end
end
end
end

View file

@ -58,7 +58,7 @@ module HTML5lib
unless @char_encoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
rescue
end
end
@ -95,11 +95,13 @@ module HTML5lib
#First look for a BOM
#This will also read past the BOM if present
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if avaliable
if encoding.nil? and @chardet
begin
@ -111,13 +113,14 @@ module HTML5lib
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be'
"\xff\xfe" => 'utf16le',
"\xfe\xff" => 'utf16be',
"\xff\xfe\x00\x00" => 'utf32le',
"\x00\x00\xfe\xff" => 'utf32be'
}
# Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
else
begin
@tell += 1
return @data_stream[@tell - 1].chr
c = @data_stream[@tell - 1]
case c
when 0xC2 .. 0xDF
@tell += 1
c.chr + @data_stream[@tell-1].chr
when 0xE0 .. 0xF0
@tell += 2
c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
else
c.chr
end
rescue
return :EOF
end
@ -227,8 +240,8 @@ module HTML5lib
else
# Then the rest
begin
char_stack.push(@data_stream[@tell].chr)
@tell += 1
char_stack.push(@data_stream[@tell-1].chr)
rescue
char_stack.push(:EOF)
break

View file

@ -1,4 +1,3 @@
require 'html5lib/tokenizer'
require 'cgi'
module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
def process_token(token)
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
return token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
return token
end
else
yield token
return token
end
end
end
def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
style = clean.join(' ')
end
end
class HTMLSanitizeFilter < Filter
include HTMLSanitizeModule
def each
@source.each do |token|
yield(process_token(token))
end
end
end
class HTMLSanitizer < HTMLTokenizer
include HTMLSanitizeModule
def each
super do |token|
yield(process_token(token))
end
end
end
end

View file

@ -0,0 +1,418 @@
require 'html5lib/constants'
require 'jcode'
module HTML5lib
class Filter
include Enumerable
def initialize(source)
@source = source
end
end
class OptionalTagFilter < Filter
def slider
previous1 = previous2 = nil
@source.each do |token|
yield previous2, previous1, token if previous1 != nil
previous2 = previous1
previous1 = token
end
yield previous2, previous1, nil
end
def each
slider do |previous, token, nexttok|
type = token[:type]
if type == :StartTag
yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
elsif type == :EndTag
yield token unless is_optional_end(token[:name], nexttok)
else
yield token
end
end
end
def is_optional_start(tagname, previous, nexttok)
type = nexttok ? nexttok[:type] : nil
if tagname == 'html'
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif tagname == 'head'
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == :StartTag
elsif tagname == 'body'
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return !%w[script style].include?(nexttok[:name])
else
return true
end
elsif tagname == 'colgroup'
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type == :StartTag
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return nexttok[:name] == "col"
else
return false
end
elsif tagname == 'tbody'
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == :StartTag
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \
%w(tbody thead tfoot).include?(previous[:name])
return false
end
return nexttok[:name] == 'tr'
else
return false
end
end
return false
end
def is_optional_end(tagname, nexttok)
type = nexttok ? nexttok[:type] : nil
if %w[html head body].include?(tagname)
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif %w[li optgroup option tr].include?(tagname)
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == :StartTag
return nexttok[:name] == tagname
else
return type == :EndTag || type == nil
end
elsif %w(dt dd).include?(tagname)
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == :StartTag
return %w(dt dd).include?(nexttok[:name])
elsif tagname == 'dd'
return type == :EndTag || type == nil
else
return false
end
elsif tagname == 'p'
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# element.
if type == :StartTag
return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
h6 hr menu ol p pre table ul).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
elsif tagname == 'colgroup'
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return nexttok[:name] != 'colgroup'
else
return true
end
elsif %w(thead tbody).include? tagname
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return %w(tbody tfoot).include?(nexttok[:name])
elsif tagname == 'tbody'
return (type == :EndTag or type == nil)
else
return false
end
elsif tagname == 'tfoot'
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return nexttok[:name] == 'tbody'
else
return type == :EndTag || type == nil
end
elsif %w(td th).include? tagname
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == :StartTag
return %w(td th).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
end
return false
end
end
class HTMLSerializer
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
def self.serialize(stream, options = {})
new(options).serialize(stream)
end
def initialize(options={})
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@minimize_boolean_attributes = true
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@omit_optional_tags = true
@sanitize = false
@strip_whitespace = false
@inject_meta_charset = true
options.each do |name, value|
next unless %w(quote_attr_values quote_char use_best_quote_char
minimize_boolean_attributes use_trailing_solidus
space_before_trailing_solidus omit_optional_tags sanitize
strip_whitespace inject_meta_charset).include? name.to_s
@use_best_quote_char = false if name.to_s == 'quote_char'
instance_variable_set("@#{name}", value)
end
@errors = []
end
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
if encoding and @inject_meta_charset
treewalker = filter_inject_meta_charset(treewalker, encoding)
end
if @strip_whitespace
treewalker = filter_whitespace(treewalker)
end
if @sanitize
require 'html5lib/sanitizer'
treewalker = HTMLSanitizeFilter.new(treewalker)
end
# if @omit_optional_tags
# treewalker = OptionalTagFilter.new(treewalker)
# end
result = []
treewalker.each do |token|
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
if encoding
result << doctype.encode(encoding)
else
result << doctype
end
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].find("</") >= 0
serializeError(_("Unexpected </ in CDATA"))
end
if encoding
result << token[:data].encode(encoding, errors || "strict")
else
result << token[:data]
end
elsif encoding
result << token[:data].replace("&", "&amp;") \
.encode(encoding, unicode_encode_errors)
else
result << token[:data] \
.gsub("&", "&amp;") \
.gsub("<", "&lt;") \
.gsub(">", "&gt;")
end
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if CDATA_ELEMENTS.include?(name)
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
attrs = token[:data].to_a
attrs.sort()
attributes = []
for k,v in attrs
if encoding
k = k.encode(encoding)
end
attributes << ' '
attributes << k
if not @minimize_boolean_attributes or \
(!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
and !BOOLEAN_ATTRIBUTES[:global].include?(k))
attributes << "="
if @quote_attr_values or v.empty?
quote_attr = true
else
quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
if encoding
v = v.encode(encoding, unicode_encode_errors)
end
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
if v.index("'") and !v.index('"')
quote_char = '"'
elsif v.index('"') and !v.index("'")
quote_char = "'"
end
end
if quote_char == "'"
v = v.gsub("'", "&#39;")
else
v = v.gsub('"', "&quot;")
end
attributes << quote_char << v << quote_char
else
attributes << v
end
end
end
if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
if @space_before_trailing_solidus
attributes << " /"
else
attributes << "/"
end
end
if encoding
result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
else
result << "<%s%s>" % [name, attributes.join('')]
end
elsif type == :EndTag
name = token[:name]
if CDATA_ELEMENTS.include?(name)
in_cdata = false
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</%s>" % name
if encoding
end_tag = end_tag.encode(encoding)
end
result << end_tag
elsif type == :Comment
data = token[:data]
if data.index("--")
serializeError(_("Comment contains --"))
end
comment = "<!--%s-->" % token[:data]
if encoding
comment = comment.encode(encoding, unicode_encode_errors)
end
result << comment
else
serializeError(token[:data])
end
end
result.join('')
end
def render(treewalker, encoding=nil)
if encoding
return "".join(list(serialize(treewalker, encoding)))
else
return "".join(list(serialize(treewalker)))
end
end
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
if @strict
raise SerializeError
end
end
def filter_inject_meta_charset(treewalker, encoding)
done = false
for token in treewalker
if not done and token[:type] == :StartTag \
and token[:name].lower() == "head"
yield({:type => :EmptyTag, :name => "meta", \
:data => {"charset" => encoding}})
end
yield token
end
end
def filter_whitespace(treewalker)
raise NotImplementedError
end
end
# Error in serialized tree
class SerializeError < Exception
end
end

View file

@ -1,21 +1,24 @@
module HTML5lib
module TreeBuilders
def self.getTreeBuilder(name)
case name.to_s.downcase
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder
REXML::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
end
end
end
alias :getTreeBuilder :[]
end
end
end

View file

@ -144,7 +144,7 @@ module HTML5lib
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
return if @activeFormattingElements.empty?
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1

View file

@ -1,4 +1,5 @@
require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
@ -26,12 +27,14 @@ module HTML5lib
childNodes << node
hpricot.children << node.hpricot
end
node.hpricot.parent = hpricot
node.parent = self
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.hpricot.parent = nil
node.parent = nil
end
@ -48,6 +51,7 @@ module HTML5lib
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
childNodes.insert(index, node)
end
end

View file

@ -4,7 +4,7 @@ require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
module REXML
class Node < Base::Node
extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
childNodes[index-1].rxobj.raw = true
else
childNodes.insert index, node
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
end
end
@ -62,7 +63,7 @@ module HTML5lib
class Element < Node
def self.rxclass
REXML::Element
::REXML::Element
end
def initialize name
@ -95,7 +96,7 @@ module HTML5lib
class Document < Node
def self.rxclass
REXML::Document
::REXML::Document
end
def initialize
@ -120,7 +121,7 @@ module HTML5lib
class DocumentType < Node
def self.rxclass
REXML::DocType
::REXML::DocType
end
def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
@rxobj = ::REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib
class CommentNode < Node
def self.rxclass
REXML::Comment
::REXML::Comment
end
def printTree indent=0

View file

@ -0,0 +1,26 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treewalkers/simpletree'
SimpleTree::TreeWalker
when 'rexml' then
require 'html5lib/treewalkers/rexml'
REXML::TreeWalker
when 'hpricot' then
require 'html5lib/treewalkers/hpricot'
Hpricot::TreeWalker
else
raise "Unknown TreeWalker #{name}"
end
end
alias :getTreeWalker :[]
end
end
end

View file

@ -0,0 +1,156 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
module TokenConstructor
def error(msg)
return {:type => "SerializeError", :data => msg}
end
def normalizeAttrs(attrs)
attrs.to_a
end
def emptyTag(name, attrs, hasChildren=false)
error(_("Void element has children")) if hasChildren
return({:type => :EmptyTag, :name => name, \
:data => normalizeAttrs(attrs)})
end
def startTag(name, attrs)
return {:type => :StartTag, :name => name, \
:data => normalizeAttrs(attrs)}
end
def endTag(name)
return {:type => :EndTag, :name => name, :data => []}
end
def text(data)
if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
yield({:type => :SpaceCharacters, :data => $1})
data = data[$1.length .. -1]
return if data.empty?
end
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
yield({:type => :Characters, :data => data[0 ... -$1.length]})
yield({:type => :SpaceCharacters, :data => $1})
else
yield({:type => :Characters, :data => data})
end
end
def comment(data)
return {:type => :Comment, :data => data}
end
def doctype(name)
return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
end
def unknown(nodeType)
return error(_("Unknown node type: ") + nodeType.to_s)
end
def _(str)
str
end
end
class Base
include TokenConstructor
def initialize(tree)
@tree = tree
end
def each
raise NotImplementedError
end
alias walk each
end
class NonRecursiveTreeWalker < TreeWalkers::Base
def node_details(node)
raise NotImplementedError
end
def first_child(node)
raise NotImplementedError
end
def next_sibling(node)
raise NotImplementedError
end
def parent(node)
raise NotImplementedError
end
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
currentNode = firstChild
else
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::Hpricot::Elem
if !node.name
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
!node.empty?]
end
when ::Hpricot::Text
[:TEXT, node.to_plain_text]
when ::Hpricot::Comment
[:COMMENT, node.content]
when ::Hpricot::Doc
[:DOCUMENT]
when ::Hpricot::DocType
[:DOCTYPE, node.target]
when ::Hpricot::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_node
end
def parent(node)
node.parent
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::REXML::Document
[:DOCUMENT]
when ::REXML::Element
if !node.name
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
node.has_elements? || node.has_text?]
end
when ::REXML::Text
[:TEXT, node.value]
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name]
when ::REXML::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_sibling
end
def parent(node)
node.parent
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree
def walk(node)
case node
when Document, DocumentFragment
return
when DocumentType
yield doctype(node.name)
when TextNode
text(node.value) {|token| yield token}
when Element
if VOID_ELEMENTS.include?(node.name)
yield emptyTag(node.name, node.attributes, node.hasContent())
else
yield startTag(node.name, node.attributes)
for child in node.childNodes
walk(child) {|token| yield token}
end
yield endTag(node.name)
end
when CommentNode
yield comment(node.value)
else
puts '?'
yield unknown(node.class)
end
end
def each
for child in @tree.childNodes
walk(child) {|node| yield node}
end
end
end
end
end
end