REXML Trees

Synced with latest HTML5lib.
Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
Jacques Distler 2007-06-05 16:34:49 -05:00
parent 4dd70af5ae
commit bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions

View file

@ -53,9 +53,10 @@ module Engines
def mask def mask
require_dependency 'maruku' require_dependency 'maruku'
require_dependency 'maruku/ext/math' require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), # html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html # {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
sanitize_xhtml(html.to_ncr) html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr)
end end
end end

View file

@ -8,19 +8,36 @@ module Sanitize
# #
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML # sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML # sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitizes a REXML tree, returning a string
require 'html5lib/sanitizer'
require 'html5lib/html5parser' require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser' require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'string_utils'
require 'html5lib/sanitizer'
include HTML5lib include HTML5lib
def sanitize_xhtml(html) def sanitize_xhtml(html)
XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s
end end
def sanitize_html(html) def sanitize_html(html)
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
end end
# Sanitizes a REXML tree and serializes it to a string.
#
# tree:: a REXML node responding to +to_ncr+; +to_ncr+ is called on it before
#        the tree is walked.
#
# Returns whatever HTMLSerializer.serialize returns for the walked tokens.
#
# The serializer flags are real booleans. In Ruby the strings 'true' and
# 'false' are both truthy, so passing 'false' would silently switch the
# option ON (for example boolean attributes would be minimized).
def sanitize_rexml(tree)
  tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
  HTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
    :quote_attr_values => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus => true,
    :space_before_trailing_solidus => true,
    :omit_optional_tags => false,
    :inject_meta_charset => false,
    :sanitize => true})
end
end end

View file

@ -2155,3 +2155,20 @@ class String
end end
end end
require 'rexml/element'
# Reopens REXML::Element to add an in-place conversion helper.
module REXML
class Element
# Passes the text nodes and attribute values of every element matched by the
# XPath '//*' through their +to_ncr+ method, writing the results back into
# the tree.
#
# NOTE(review): '//*' is an absolute path, so this presumably visits every
# element of the containing document rather than only descendants of self --
# confirm against REXML's XPath behaviour.
#
# Returns self, so the call can be chained.
def to_ncr
XPath.each(self, '//*') { |el|
# el.texts is re-read on every access; only the index is taken from the
# iteration.
el.texts.each_index {|i|
el.texts[i].value = el.texts[i].to_s.to_ncr
}
# Re-assigns each attribute under its existing name with the converted value.
el.attributes.each { |name,val|
el.attributes[name] = val.to_ncr
}
}
return self
end
end
end

View file

@ -148,6 +148,26 @@ module HTML5lib
input input
] ]
# Attribute names treated as boolean attributes, keyed by element name.
# The :global key lists names that are looked up for every element.
# HTMLSerializer#serialize consults this table when
# @minimize_boolean_attributes is set and then writes the attribute name
# without a value.
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. # entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [ ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN 8364, # 0x80 0x20AC EURO SIGN

View file

@ -37,13 +37,13 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered # :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be # :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through # returned. Built in treebuilders can be accessed through
# html5lib.treebuilders.getTreeBuilder(treeType) # HTML5lib::TreeBuilders[treeType]
def initialize(options = {}) def initialize(options = {})
@strict = false @strict = false
@errors = [] @errors = []
@tokenizer = HTMLTokenizer @tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXMLTree::TreeBuilder @tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each { |name, value| instance_variable_set("@#{name}", value) }

View file

@ -107,4 +107,4 @@ module HTML5lib
end end
end end
end end

View file

@ -153,4 +153,4 @@ module HTML5lib
end end
end end
end end

View file

@ -58,7 +58,7 @@ module HTML5lib
unless @char_encoding == 'utf-8' unless @char_encoding == 'utf-8'
begin begin
require 'iconv' require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0] uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
rescue rescue
end end
end end
@ -95,11 +95,13 @@ module HTML5lib
#First look for a BOM #First look for a BOM
#This will also read past the BOM if present #This will also read past the BOM if present
encoding = detect_bom encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding #If there is no BOM need to look for meta elements with encoding
#information #information
if encoding.nil? and @parse_meta if encoding.nil? and @parse_meta
encoding = detect_encoding_meta encoding = detect_encoding_meta
end end
#Guess with chardet, if avaliable #Guess with chardet, if avaliable
if encoding.nil? and @chardet if encoding.nil? and @chardet
begin begin
@ -111,13 +113,14 @@ module HTML5lib
rescue LoadError rescue LoadError
end end
end end
# If all else fails use the default encoding # If all else fails use the default encoding
if encoding.nil? if encoding.nil?
encoding = @DEFAULT_ENCODING encoding = @DEFAULT_ENCODING
end end
#Substitute for equivalent encodings: #Substitute for equivalent encodings:
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'} encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase) if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase] encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
def detect_bom def detect_bom
bom_dict = { bom_dict = {
"\xef\xbb\xbf" => 'utf-8', "\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le', "\xff\xfe" => 'utf16le',
"\xfe\xff" => 'utf-16-be', "\xfe\xff" => 'utf16be',
"\xff\xfe\x00\x00" => 'utf-32-le', "\xff\xfe\x00\x00" => 'utf32le',
"\x00\x00\xfe\xff" => 'utf-32-be' "\x00\x00\xfe\xff" => 'utf32be'
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
else else
begin begin
@tell += 1 @tell += 1
return @data_stream[@tell - 1].chr c = @data_stream[@tell - 1]
case c
when 0xC2 .. 0xDF
@tell += 1
c.chr + @data_stream[@tell-1].chr
when 0xE0 .. 0xF0
@tell += 2
c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
else
c.chr
end
rescue rescue
return :EOF return :EOF
end end
@ -227,8 +240,8 @@ module HTML5lib
else else
# Then the rest # Then the rest
begin begin
char_stack.push(@data_stream[@tell].chr)
@tell += 1 @tell += 1
char_stack.push(@data_stream[@tell-1].chr)
rescue rescue
char_stack.push(:EOF) char_stack.push(:EOF)
break break

View file

@ -1,4 +1,3 @@
require 'html5lib/tokenizer'
require 'cgi' require 'cgi'
module HTML5lib module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and def process_token(token)
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
case token[:type] case token[:type]
when :StartTag, :EndTag, :EmptyTag when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name]) if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
end end
token[:data] = attrs.map {|k,v| [k,v]} token[:data] = attrs.map {|k,v| [k,v]}
end end
yield token return token
else else
if token[:type] == :EndTag if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>" token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
token[:data].insert(-2,'/') if token[:type] == :EmptyTag token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters token[:type] = :Characters
token.delete(:name) token.delete(:name)
yield token return token
end end
else else
yield token return token
end end
end
end end
def sanitize_css(style) def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
style = clean.join(' ') style = clean.join(' ')
end end
end end
# Token-stream filter: wraps another token source (stored in @source by
# Filter) and runs every token through HTMLSanitizeModule#process_token.
class HTMLSanitizeFilter < Filter
  include HTMLSanitizeModule

  # Yields the processed form of each token produced by the wrapped source.
  def each
    @source.each { |token| yield process_token(token) }
  end
end
# Tokenizer variant: every token produced by HTMLTokenizer#each is run
# through HTMLSanitizeModule#process_token before being handed to the caller.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields the processed form of each token the parent tokenizer emits.
  def each
    super { |token| yield process_token(token) }
  end
end
end end

View file

@ -0,0 +1,418 @@
require 'html5lib/constants'
require 'jcode'
module HTML5lib
# Base class for token-stream filters. A filter remembers the stream it wraps
# in @source; subclasses supply #each, and Enumerable is mixed in on top of it.
class Filter
  include Enumerable

  # source:: the object whose tokens the filter will read (kept in @source).
  def initialize(source)
    @source = source
  end
end
class OptionalTagFilter < Filter
# Walks @source with a three-token window and yields
# (token before, current token, token after) once per token of the stream.
# The "before" slot is nil for the first token and the "after" slot is nil
# for the last one.
#
# An empty source yields nothing. Without the final guard a (nil, nil, nil)
# triple would be yielded and callers indexing the current token would fail.
def slider
  before = current = nil
  @source.each do |token|
    yield before, current, token unless current.nil?
    before = current
    current = token
  end
  # Flush the last token, if the stream produced any.
  yield before, current, nil unless current.nil?
end
# Yields every token of the stream except start and end tags that the
# is_optional_start / is_optional_end rules allow to be left out.
# A start tag is only dropped when it carries no attributes.
def each
  slider do |previous, token, nexttok|
    droppable =
      case token[:type]
      when :StartTag
        token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
      when :EndTag
        is_optional_end(token[:name], nexttok)
      else
        false
      end
    yield token unless droppable
  end
end
# Decides whether the start tag of +tagname+ may be left out of the output.
#
# tagname::  name of the element whose start tag is being considered
# previous:: the token before the start tag (nil at the start of the stream)
# nexttok::  the token after the start tag (nil at the end of the stream)
#
# Returns true when the tag may be omitted, false otherwise. Only html, head,
# body, colgroup and tbody are ever reported as omissible.
def is_optional_start(tagname, previous, nexttok)
  type = nexttok ? nexttok[:type] : nil
  followed_by_element = (type == :StartTag)
  followed_by_space_or_comment = [:Comment, :SpaceCharacters].include?(type)

  case tagname
  when 'html'
    # Omissible unless the first thing inside is a space character or comment.
    !followed_by_space_or_comment
  when 'head'
    # Omissible when the first thing inside is an element.
    followed_by_element
  when 'body'
    # Not omissible before a space character or comment. The preceding token
    # is not inspected here, so a body followed by script/style is always
    # kept.
    return false if followed_by_space_or_comment
    return true unless followed_by_element
    !%w[script style].include?(nexttok[:name])
  when 'colgroup'
    # Omissible when the first child is a col element. The "preceded by a
    # colgroup with omitted end tag" case is handled in is_optional_end.
    followed_by_element && nexttok[:name] == "col"
  when 'tbody'
    # Omissible when the first child is a tr, unless the token just before
    # is the end tag of a tbody, thead or tfoot.
    return false unless followed_by_element
    after_table_section = previous && previous[:type] == :EndTag &&
                          %w(tbody thead tfoot).include?(previous[:name])
    return false if after_table_section
    nexttok[:name] == 'tr'
  else
    false
  end
end
# Decides whether the end tag of +tagname+ may be left out of the output.
#
# tagname:: name of the element whose end tag is being considered
# nexttok:: the token after the end tag (nil at the end of the stream)
#
# Returns true when the tag may be omitted, false otherwise.
# "End of parent" below means the next token is an end tag or there is no
# next token at all.
def is_optional_end(tagname, nexttok)
  type = nexttok ? nexttok[:type] : nil
  followed_by_element = (type == :StartTag)
  next_name = followed_by_element ? nexttok[:name] : nil
  at_parent_end = (type == :EndTag || type == nil)
  followed_by_space_or_comment = [:Comment, :SpaceCharacters].include?(type)

  case tagname
  when 'html', 'head', 'body'
    # Omissible unless immediately followed by a space character or comment.
    !followed_by_space_or_comment
  when 'li', 'optgroup', 'option', 'tr'
    # Omissible before another element of the same name, or at end of parent.
    followed_by_element ? next_name == tagname : at_parent_end
  when 'dt', 'dd'
    # Omissible before a dt or dd; only dd is also omissible at end of parent.
    if followed_by_element
      %w(dt dd).include?(next_name)
    else
      tagname == 'dd' && at_parent_end
    end
  when 'p'
    # Omissible before one of the listed elements, or at end of parent.
    if followed_by_element
      %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
         h6 hr menu ol p pre table ul).include?(next_name)
    else
      at_parent_end
    end
  when 'colgroup'
    # Kept before a space character, a comment or another colgroup
    # (see is_optional_start); omissible in every other case.
    if followed_by_space_or_comment
      false
    elsif followed_by_element
      next_name != 'colgroup'
    else
      true
    end
  when 'thead', 'tbody'
    # Omissible before a tbody or tfoot; only tbody is also omissible at end
    # of parent.
    if followed_by_element
      %w(tbody tfoot).include?(next_name)
    else
      tagname == 'tbody' && at_parent_end
    end
  when 'tfoot'
    # Omissible before a tbody, or at end of parent.
    followed_by_element ? next_name == 'tbody' : at_parent_end
  when 'td', 'th'
    # Omissible before a td or th, or at end of parent.
    followed_by_element ? %w(td th).include?(next_name) : at_parent_end
  else
    false
  end
end
end
class HTMLSerializer
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
# Convenience entry point: builds a serializer configured with +options+ and
# returns the result of serializing +stream+ with it.
def self.serialize(stream, options = {})
  serializer = new(options)
  serializer.serialize(stream)
end
# Sets the serializer defaults and then applies the recognised +options+.
#
# options:: Hash keyed by symbol or string. Recognised names are
#           quote_attr_values, quote_char, use_best_quote_char,
#           minimize_boolean_attributes, use_trailing_solidus,
#           space_before_trailing_solidus, omit_optional_tags, sanitize,
#           strip_whitespace, inject_meta_charset and strict.
#           Any other key (for example :encoding) is ignored.
#
# Passing quote_char switches use_best_quote_char off at that point; an
# explicit use_best_quote_char given later in the hash can turn it on again.
#
# @strict is initialised here because serializeError reads it; previously it
# was never set and could not be enabled through the options.
def initialize(options={})
  @quote_attr_values = false
  @quote_char = '"'
  @use_best_quote_char = true
  @minimize_boolean_attributes = true
  @use_trailing_solidus = false
  @space_before_trailing_solidus = true
  @omit_optional_tags = true
  @sanitize = false
  @strip_whitespace = false
  @inject_meta_charset = true
  @strict = false

  recognised = %w(quote_attr_values quote_char use_best_quote_char
                  minimize_boolean_attributes use_trailing_solidus
                  space_before_trailing_solidus omit_optional_tags sanitize
                  strip_whitespace inject_meta_charset strict)

  options.each do |name, value|
    next unless recognised.include?(name.to_s)
    @use_best_quote_char = false if name.to_s == 'quote_char'
    instance_variable_set("@#{name}", value)
  end

  @errors = []
end
# Serializes a token stream to a markup string.
#
# treewalker:: any object whose #each yields token hashes with a :type key
#              (:Doctype, :Characters, :SpaceCharacters, :StartTag,
#              :EmptyTag, :EndTag, :Comment) plus :name / :data as needed.
# encoding::   optional encoding name. When given, every emitted fragment is
#              passed through String#encode, and a meta charset token is
#              injected if @inject_meta_charset is set.
#
# Returns the concatenated markup as a String. Problems found on the way are
# reported through serializeError (and collected in @errors).
#
# Fixes relative to the earlier version of this method:
# * attributes are really emitted in sorted order (the result of the former
#   +attrs.sort()+ call was thrown away);
# * the "</ inside a CDATA element" check uses String#index;
# * text is escaped with gsub for &, < and > on both the encoded and the
#   unencoded path;
# * error messages are plain strings, since no +_+ translation helper is
#   defined in this class;
# * the undefined names +errors+ and +unicode_encode_errors+ are gone.
def serialize(treewalker, encoding=nil)
  in_cdata = false
  @errors = []

  if encoding and @inject_meta_charset
    treewalker = filter_inject_meta_charset(treewalker, encoding)
  end
  if @strip_whitespace
    treewalker = filter_whitespace(treewalker)
  end
  if @sanitize
    # Loaded lazily so the sanitizer is only required when it is used.
    require 'html5lib/sanitizer'
    treewalker = HTMLSanitizeFilter.new(treewalker)
  end
  # Optional-tag omission is intentionally disabled:
  # if @omit_optional_tags
  #   treewalker = OptionalTagFilter.new(treewalker)
  # end

  # Converts one output fragment to the requested encoding, if any.
  emit = lambda { |fragment| encoding ? fragment.encode(encoding) : fragment }

  result = []
  treewalker.each do |token|
    type = token[:type]

    if type == :Doctype
      result << emit.call("<!DOCTYPE %s>" % token[:name])

    elsif [:Characters, :SpaceCharacters].include? type
      data = token[:data]
      if type == :SpaceCharacters or in_cdata
        # Whitespace and CDATA-element content are written verbatim.
        if in_cdata and data.index("</")
          serializeError("Unexpected </ in CDATA")
        end
      else
        data = data.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
      end
      result << emit.call(data)

    elsif [:StartTag, :EmptyTag].include? type
      name = token[:name]
      if CDATA_ELEMENTS.include?(name)
        in_cdata = true
      elsif in_cdata
        serializeError("Unexpected child element of a CDATA element")
      end

      # Sorted by attribute name so the output is deterministic.
      attrs = token[:data].to_a.sort_by { |pair| pair[0].to_s }
      attributes = []
      attrs.each do |k, v|
        k = k.encode(encoding) if encoding
        attributes << ' ' << k

        # A minimized boolean attribute is written as the bare name.
        next if @minimize_boolean_attributes and
                ((BOOLEAN_ATTRIBUTES[name] || []).include?(k) or
                 BOOLEAN_ATTRIBUTES[:global].include?(k))

        attributes << "="
        v = v.to_s
        if @quote_attr_values or v.empty?
          quote_attr = true
        else
          quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? { |c| v.include?(c) }
        end
        v = v.gsub("&", "&amp;")
        v = v.encode(encoding) if encoding

        if quote_attr
          quote_char = @quote_char
          if @use_best_quote_char
            # Prefer the quote character that does not occur in the value.
            if v.index("'") and !v.index('"')
              quote_char = '"'
            elsif v.index('"') and !v.index("'")
              quote_char = "'"
            end
          end
          if quote_char == "'"
            v = v.gsub("'", "&#39;")
          else
            v = v.gsub('"', "&quot;")
          end
          attributes << quote_char << v << quote_char
        else
          attributes << v
        end
      end

      if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
        attributes << (@space_before_trailing_solidus ? " /" : "/")
      end
      result << "<%s%s>" % [emit.call(name), attributes.join('')]

    elsif type == :EndTag
      name = token[:name]
      if CDATA_ELEMENTS.include?(name)
        in_cdata = false
      elsif in_cdata
        serializeError("Unexpected child element of a CDATA element")
      end
      result << emit.call("</%s>" % name)

    elsif type == :Comment
      data = token[:data]
      serializeError("Comment contains --") if data.index("--")
      result << emit.call("<!--%s-->" % data)

    else
      # Anything else (for example an error token from a tree walker).
      serializeError(token[:data])
    end
  end
  result.join('')
end
# Returns the serialized form of +treewalker+ as one string.
#
# serialize already returns a joined String, so this simply delegates to it.
# The earlier body used Python's "".join(list(...)), which is not valid for
# Ruby strings and raised NoMethodError.
#
# treewalker:: token stream accepted by serialize
# encoding::   optional encoding name, only forwarded when given
def render(treewalker, encoding=nil)
  if encoding
    serialize(treewalker, encoding)
  else
    serialize(treewalker)
  end
end
# Records a serialization problem in @errors and, when @strict is set,
# raises SerializeError afterwards.
#
# data:: description of the problem. The placeholder default exists only
#        until every caller supplies a message (the intent is to make it
#        mandatory).
def serializeError(data="XXX ERROR MESSAGE NEEDED")
  @errors << data
  raise SerializeError if @strict
end
# Returns the tokens of +treewalker+ with a <meta charset=...> empty-tag
# token inserted directly after the first <head> start tag.
#
# treewalker:: token stream (anything responding to #each)
# encoding::   value written into the meta token's "charset" attribute
#
# The result is an Array, so it can be iterated by serialize. If a block is
# given, each resulting token is also yielded to it.
#
# Fixes relative to the earlier version: it no longer yields when no block
# was passed (serialize calls it for its return value), uses String#downcase
# instead of the Python-only lower(), injects the meta token only once, and
# places it inside the head element rather than in front of its start tag.
def filter_inject_meta_charset(treewalker, encoding)
  done = false
  tokens = []
  treewalker.each do |token|
    tokens << token
    if !done && token[:type] == :StartTag && token[:name].to_s.downcase == "head"
      tokens << {:type => :EmptyTag, :name => "meta",
                 :data => {"charset" => encoding}}
      done = true
    end
  end
  tokens.each { |token| yield token } if block_given?
  tokens
end
# Placeholder for the whitespace-stripping filter used when @strip_whitespace
# is set: not implemented, always raises NotImplementedError.
def filter_whitespace(treewalker)
  fail NotImplementedError
end
end
# Error in serialized tree.
#
# Derives from StandardError rather than Exception so that an ordinary
# +rescue+ / +rescue => e+ clause catches it; +rescue SerializeError+ and
# +rescue Exception+ keep working as before.
class SerializeError < StandardError
end
end

View file

@ -1,21 +1,24 @@
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
def self.getTreeBuilder(name) class << self
case name.to_s.downcase def [](name)
case name.to_s.downcase
when 'simpletree' then when 'simpletree' then
require 'html5lib/treebuilders/simpletree' require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder SimpleTree::TreeBuilder
when 'rexml' then when 'rexml' then
require 'html5lib/treebuilders/rexml' require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder REXML::TreeBuilder
when 'hpricot' then when 'hpricot' then
require 'html5lib/treebuilders/hpricot' require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder Hpricot::TreeBuilder
else else
raise "Unknown TreeBuilder #{name}" raise "Unknown TreeBuilder #{name}"
end
end end
end
alias :getTreeBuilder :[]
end
end end
end end

View file

@ -144,7 +144,7 @@ module HTML5lib
# code. It should still do the same though. # code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do. # Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements return if @activeFormattingElements.empty?
# Step 2 and step 3: we start with the last element. So i is -1. # Step 2 and step 3: we start with the last element. So i is -1.
i = -1 i = -1

View file

@ -1,4 +1,5 @@
require 'html5lib/treebuilders/base' require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot' require 'hpricot'
require 'forwardable' require 'forwardable'
@ -26,12 +27,14 @@ module HTML5lib
childNodes << node childNodes << node
hpricot.children << node.hpricot hpricot.children << node.hpricot
end end
node.hpricot.parent = hpricot
node.parent = self node.parent = self
end end
def removeChild(node) def removeChild(node)
childNodes.delete(node) childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot)) hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.hpricot.parent = nil
node.parent = nil node.parent = nil
end end
@ -48,6 +51,7 @@ module HTML5lib
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode) if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else else
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
childNodes.insert(index, node) childNodes.insert(index, node)
end end
end end

View file

@ -4,7 +4,7 @@ require 'forwardable'
module HTML5lib module HTML5lib
module TreeBuilders module TreeBuilders
module REXMLTree module REXML
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
childNodes[index-1].rxobj.raw = true childNodes[index-1].rxobj.raw = true
else else
childNodes.insert index, node childNodes.insert index, node
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
end end
end end
@ -62,7 +63,7 @@ module HTML5lib
class Element < Node class Element < Node
def self.rxclass def self.rxclass
REXML::Element ::REXML::Element
end end
def initialize name def initialize name
@ -95,7 +96,7 @@ module HTML5lib
class Document < Node class Document < Node
def self.rxclass def self.rxclass
REXML::Document ::REXML::Document
end end
def initialize def initialize
@ -120,7 +121,7 @@ module HTML5lib
class DocumentType < Node class DocumentType < Node
def self.rxclass def self.rxclass
REXML::DocType ::REXML::DocType
end end
def printTree indent=0 def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
class TextNode < Node class TextNode < Node
def initialize data def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;') raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true) @rxobj = ::REXML::Text.new(raw, true, nil, true)
end end
def printTree indent=0 def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib
class CommentNode < Node class CommentNode < Node
def self.rxclass def self.rxclass
REXML::Comment ::REXML::Comment
end end
def printTree indent=0 def printTree indent=0

View file

@ -0,0 +1,26 @@
require 'html5lib/treewalkers/base'
module HTML5lib
  module TreeWalkers
    class << self
      # Looks up a tree walker class by name (case-insensitive; symbols and
      # strings both work) and loads its file on demand.
      #
      # Known names are 'simpletree', 'rexml' and 'hpricot'; anything else
      # raises a RuntimeError naming the requested walker.
      def [](name)
        wanted = name.to_s.downcase
        if wanted == 'simpletree'
          require 'html5lib/treewalkers/simpletree'
          SimpleTree::TreeWalker
        elsif wanted == 'rexml'
          require 'html5lib/treewalkers/rexml'
          REXML::TreeWalker
        elsif wanted == 'hpricot'
          require 'html5lib/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      # Older spelling of the lookup, kept as an alias.
      alias :getTreeWalker :[]
    end
  end
end

View file

@ -0,0 +1,156 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
# Helpers that build the token hashes produced by tree walkers.
# Every token is a Hash with a :type key plus :name and/or :data.
module TokenConstructor
  # Builds an error token carrying +msg+ (its :type is the string
  # "SerializeError", not a symbol).
  def error(msg)
    {:type => "SerializeError", :data => msg}
  end

  # Converts an attribute collection to an Array via #to_a.
  def normalizeAttrs(attrs)
    attrs.to_a
  end

  # Builds an :EmptyTag token.
  #
  # NOTE(review): when +hasChildren+ is true an error token is built but its
  # value is discarded, because this method returns a single token. Reporting
  # it would need a change to the calling convention.
  def emptyTag(name, attrs, hasChildren=false)
    error(_("Void element has children")) if hasChildren
    {:type => :EmptyTag, :name => name, :data => normalizeAttrs(attrs)}
  end

  # Builds a :StartTag token.
  def startTag(name, attrs)
    {:type => :StartTag, :name => name, :data => normalizeAttrs(attrs)}
  end

  # Builds an :EndTag token (its :data is always an empty Array).
  def endTag(name)
    {:type => :EndTag, :name => name, :data => []}
  end

  # Splits +data+ into leading whitespace, content and trailing whitespace
  # and yields one token for each part that is present: :SpaceCharacters for
  # the whitespace runs, :Characters for the content.
  #
  # The patterns are anchored with \A and \z. The line anchors ^ and $ would
  # also match next to embedded newlines, which made multi-line text lose
  # characters from the wrong end of the string.
  def text(data)
    space = SPACE_CHARACTERS.join('')

    if data =~ /\A([#{space}]+)/
      leading = $1
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    if data =~ /([#{space}]+)\z/
      trailing = $1
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    else
      yield({:type => :Characters, :data => data})
    end
  end

  # Builds a :Comment token.
  def comment(data)
    {:type => :Comment, :data => data}
  end

  # Builds a :Doctype token; :data is true when the name is "html" in any
  # letter case.
  def doctype(name)
    {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
  end

  # Builds an error token for a node type the walker does not know.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Translation hook; currently returns the string unchanged.
  def _(str)
    str
  end
end
# Common base of all tree walkers: keeps the tree to walk in @tree and mixes
# in the TokenConstructor helpers. Subclasses provide the traversal.
class Base
  include TokenConstructor

  # tree:: the tree (or root node) this walker will traverse
  def initialize(tree)
    @tree = tree
  end

  # Traversal entry point; must be overridden.
  def each
    raise NotImplementedError
  end

  # +walk+ is another name for the +each+ defined in this class.
  alias_method :walk, :each
end
# Tree walker that traverses a tree iteratively (no recursion), using four
# navigation primitives that concrete subclasses must implement.
class NonRecursiveTreeWalker < TreeWalkers::Base
# Returns an Array describing +node+. Its first entry is the node kind
# (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT, nil
# for nodes to ignore, or anything else for an unknown node); the remaining
# entries are the arguments consumed by #each for that kind.
def node_details(node)
raise NotImplementedError
end
# Returns the first child of +node+, or nil.
def first_child(node)
raise NotImplementedError
end
# Returns the node following +node+ under the same parent, or nil.
def next_sibling(node)
raise NotImplementedError
end
# Returns the parent of +node+.
def parent(node)
raise NotImplementedError
end
# Walks the tree rooted at @tree in document order and yields one token per
# doctype, text run part, comment, start tag, empty tag and end tag.
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
# details.shift removes the node kind, leaving only the kind's arguments.
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
# Children of a void element are never descended into.
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
# Descend first.
currentNode = firstChild
else
# No children to visit: close the current element, then move to the next
# sibling, climbing towards the root (and closing each element left on the
# way) until a sibling is found or the root is reached.
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
# Back at the root: setting nil ends both loops.
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
  module TreeWalkers
    module Hpricot
      # Tree walker over Hpricot nodes; supplies the navigation primitives
      # required by NonRecursiveTreeWalker.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
        # Describes +node+ as [kind, *arguments]; see the case branches for
        # the arguments each Hpricot class provides.
        def node_details(node)
          case node
          when ::Hpricot::Elem
            # An element without a name is reported as a document fragment.
            return [:DOCUMENT_FRAGMENT] unless node.name
            pairs = node.attributes.map { |name, value| [name, value] }
            [:ELEMENT, node.name, pairs, !node.empty?]
          when ::Hpricot::Text    then [:TEXT, node.to_plain_text]
          when ::Hpricot::Comment then [:COMMENT, node.content]
          when ::Hpricot::Doc     then [:DOCUMENT]
          when ::Hpricot::DocType then [:DOCTYPE, node.target]
          when ::Hpricot::XMLDecl then [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First entry of the node's children.
        def first_child(node)
          node.children.first
        end

        # Node following +node+, as reported by Hpricot's next_node.
        def next_sibling(node)
          node.next_node
        end

        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
  module TreeWalkers
    module REXML
      # Tree walker over REXML nodes; supplies the navigation primitives
      # required by NonRecursiveTreeWalker.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
        # Describes +node+ as [kind, *arguments]. ::REXML::Document is
        # tested before ::REXML::Element, as in the original branch order.
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            # An element without a name is reported as a document fragment.
            return [:DOCUMENT_FRAGMENT] unless node.name
            pairs = node.attributes.map { |name, value| [name, value] }
            [:ELEMENT, node.name, pairs, node.has_elements? || node.has_text?]
          when ::REXML::Text    then [:TEXT, node.value]
          when ::REXML::Comment then [:COMMENT, node.string]
          when ::REXML::DocType then [:DOCTYPE, node.name]
          when ::REXML::XMLDecl then [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First entry of the node's children.
        def first_child(node)
          node.children.first
        end

        # Node following +node+ under the same parent.
        def next_sibling(node)
          node.next_sibling
        end

        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
module HTML5lib
  module TreeWalkers
    module SimpleTree
      # Recursive tree walker for trees built by the SimpleTree tree builder.
      class TreeWalker < HTML5lib::TreeWalkers::Base
        include HTML5lib::TreeBuilders::SimpleTree

        # Yields the tokens for +node+ and, for non-void elements, for all
        # of its descendants.
        #
        # Document and DocumentFragment nodes produce nothing here; their
        # children are handled by #each. Unknown node classes produce an
        # error token (the stray debugging +puts+ that used to write "?" to
        # standard output has been removed).
        def walk(node)
          case node
          when Document, DocumentFragment
            return
          when DocumentType
            yield doctype(node.name)
          when TextNode
            text(node.value) { |token| yield token }
          when Element
            if VOID_ELEMENTS.include?(node.name)
              yield emptyTag(node.name, node.attributes, node.hasContent())
            else
              yield startTag(node.name, node.attributes)
              node.childNodes.each do |child|
                walk(child) { |token| yield token }
              end
              yield endTag(node.name)
            end
          when CommentNode
            yield comment(node.value)
          else
            yield unknown(node.class)
          end
        end

        # Yields the tokens of every child of the tree passed to the
        # constructor, in order.
        def each
          @tree.childNodes.each do |child|
            walk(child) { |node| yield node }
          end
        end
      end
    end
  end
end

137
vendor/plugins/HTML5lib/parse.rb vendored Executable file
View file

@ -0,0 +1,137 @@
#!/usr/bin/env ruby
#
# Parse a document to a simpletree tree, with optional profiling
# Make the script's own directory and its lib/ subdirectory loadable.
# The previous form added the bare string 'lib', which is resolved against
# the current working directory and so only worked when the script was run
# from its own directory.
$:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
# Parses the document named by the last entry of +args+ and, unless
# profiling, prints the result through printOutput.
#
# opts:: object responding to +treebuilder+, +output+, +profile+ and +time+
# args:: argument list; its last entry is used as the file name (or an
#        http:// URL, in which case open-uri is loaded first)
#
# Writes a message to $stderr and exits with status 1 when +args+ has no
# last entry.
def parse(opts, args)
  f = args.last
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    require 'open-uri' if f[0..6] == 'http://'
    f = open(f)
  rescue
    # Best effort: when the argument cannot be opened it is left untouched
    # and handed to the parser as it is.
  end

  require 'html5lib/treebuilders'
  treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]

  # XML output uses the liberal XHTML parser, everything else the HTML one.
  if opts.output == :xml
    require 'html5lib/liberalxmlparser'
    parser = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
  else
    require 'html5lib/html5parser'
    parser = HTML5lib::HTMLParser.new(:tree=>treebuilder)
  end

  if opts.profile
    # Profiling run: only the profile report is printed, not the document.
    require 'profiler'
    Profiler__::start_profile
    parser.parse(f)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time'
    parse_started = Time.new
    document = parser.parse(f)
    parse_finished = Time.new
    printOutput(parser, document, opts)
    print_finished = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[parse_finished-parse_started, print_finished-parse_finished]
  else
    document = parser.parse(f)
    printOutput(parser, document, opts)
  end
end
# Prints +document+ in the format selected by +opts.output+ (:xml, :html,
# :hilite or :tree; any other value prints nothing).
#
# When +opts.encoding+ is set, the character encoding of the parser's input
# stream is printed first. When +opts.error+ is set, the parser's errors are
# written to $stderr afterwards, one "Line L Col C message" entry per line.
def printOutput(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  case opts.output
  when :xml    then print document
  when :html
    require 'html5lib/treewalkers'
    walker = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
    require 'html5lib/serializer'
    print HTML5lib::HTMLSerializer.serialize(walker, :encoding=>'utf-8')
  when :hilite then print document.hilite
  when :tree   then print parser.tree.testSerializer(document)
  end

  return unless opts.error

  # Each error is a (position, message) pair; the position fills the two
  # %i placeholders.
  report = parser.errors.map do |pos, message|
    "Line %i Col %i"%pos + " " + message
  end
  $stderr.write("\nParse errors:\n" + report.join("\n")+"\n")
end
require 'ostruct'
require 'optparse'

# Default settings; the command-line switches below override them.
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :tree,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false
)

opts = OptionParser.new do |cli|
  cli.on("-p", "--[no-]profile", "Profile the run") do |profile|
    options.profile = profile
  end

  cli.on("-t", "--[no-]time", "Time the run") do |time|
    options.time = time
  end

  # --tree selects the tree dump, --no-tree disables output altogether.
  cli.on("--[no-]tree", "Do not print output tree") do |tree|
    options.output = tree ? :tree : nil
  end

  cli.on("-b", "--treebuilder NAME") do |treebuilder|
    options.treebuilder = treebuilder
  end

  cli.on("-e", "--error", "Print a list of parse errors") do |error|
    options.error = error
  end

  # XML output also switches the tree builder to rexml.
  cli.on("-x", "--xml", "output as xml") do |_xml|
    options.output = :xml
    options.treebuilder = "rexml"
  end

  cli.on("--html", "Output as html") do |_html|
    options.output = :html
  end

  cli.on("--hilite", "Output as formatted highlighted code.") do |_hilite|
    options.output = :hilite
  end

  cli.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
    options.encoding = encoding
  end

  cli.on_tail("-h", "--help", "Show this message") do
    puts cli
    exit
  end
end

# parse! removes the recognised switches from ARGV, leaving the file name.
opts.parse!(ARGV)
parse options, ARGV

View file

@ -21,3 +21,53 @@ rescue LoadError
end end
end end
end end
module HTML5lib
  # Helpers shared by the test suites for reading tree-construction test
  # cases and normalising tree dumps before comparison.
  module TestSupport
    # True when string +b+ begins with string +a+ (prefix comes first).
    def self.startswith?(a, b)
      b[0... a.length] == a
    end

    # Splits one test case (the text following a "#data" line) into its parts.
    #
    # Lines are collected into the input until "#errors" is seen, then into
    # the error list until "#document" or "#document-fragment <name>" is seen,
    # then into the expected output. Empty lines and other lines starting with
    # one of the section markers are dropped. Output lines starting with "|"
    # lose their first two characters.
    #
    # Returns [innerHTML, input, output, errors] where +innerHTML+ is the
    # context element name of a "#document-fragment" section (nil otherwise),
    # +input+ and +output+ are newline-joined strings and +errors+ is an array
    # of lines.
    #
    # Raises ArgumentError when "#document-fragment" names no context element.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input
      data.split(/\n/).each do |line|
        # "#document" also covers "#document-fragment".
        is_marker = ['#errors', '#document', '#data'].any? do |marker|
          startswith?(marker, line)
        end

        if !line.empty? && !is_marker
          # Identity check: with == two empty lists compare equal, which
          # stripped the "| " prefix from input/error lines as well.
          if currentList.equal?(output) && startswith?("|", line)
            currentList.push(line[2..-1])
          else
            currentList.push(line)
          end
        elsif line == "#errors"
          currentList = errors
        elsif line == "#document" || startswith?("#document-fragment", line)
          if startswith?("#document-fragment", line)
            innerHTML = line[19..-1]
            unless innerHTML
              raise ArgumentError, "#document-fragment requires a context element name"
            end
          end
          currentList = output
        end
      end
      return innerHTML, input.join("\n"), output.join("\n"), errors
    end

    # Converts the output of str(document) to the format used in the
    # testcases: the first line is dropped and every line longer than two
    # characters that starts with "|" loses its first three characters.
    # A dump with no lines yields an empty string.
    def convertTreeDump(treedump)
      body = treedump.split(/\n/)[1..-1] || []
      body.map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=..." lines so
    # that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end
  end
end

View file

@ -4,33 +4,33 @@ require 'html5lib/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase class Html5EncodingTestCase < Test::Unit::TestCase
begin begin
require 'rubygems' require 'rubygems'
require 'UniversalDetector' require 'UniversalDetector'
def test_chardet def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file| File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase assert_equal 'big5', stream.char_encoding.downcase
end end
end end
rescue LoadError rescue LoadError
puts "chardet not found, skipping chardet tests" puts "chardet not found, skipping chardet tests"
end end
html5lib_test_files('encoding').each do |test_file| html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '') test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index| File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty? next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2) input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0] encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input assert_equal encoding.downcase, stream.char_encoding.downcase, input
end end
end
end end
end
end end

View file

@ -14,53 +14,12 @@ end
$CHECK_PARSER_ERRORS = false $CHECK_PARSER_ERRORS = false
puts 'Testing: ' + $tree_types_to_test * ', ' puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase class Html5ParserTestCase < Test::Unit::TestCase
include HTML5lib
def self.startswith?(a, b) include TestSupport
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
html5lib_test_files('tree-construction').each do |test_file| html5lib_test_files('tree-construction').each do |test_file|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
File.read(test_file).split("#data\n").each_with_index do |data, index| File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty? next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data) innerHTML, input, expected_output, expected_errors =
TestSupport.parseTestcase(data)
$tree_types_to_test.each do |tree_name| $tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name)) parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
if innerHTML if innerHTML
parser.parseFragment(input, innerHTML) parser.parseFragment(input, innerHTML)

View file

@ -2,9 +2,11 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/sanitizer'
require 'html5lib/html5parser' require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser' require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase class SanitizeTest < Test::Unit::TestCase
include HTML5lib include HTML5lib

View file

@ -0,0 +1,52 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/serializer'
require 'html5lib/treewalkers'
#Run the serialize error checks
checkSerializeErrors = false
# Tree walker that replays a pre-tokenised stream: @tree is a list of arrays
# whose first entry is the token type name and whose remaining entries are
# that token's arguments.
class JsonWalker < HTML5lib::TreeWalkers::Base
  # Yields one serializer token per entry of @tree.
  #
  # Raises ArgumentError when an entry carries an unrecognised type name.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield startTag(token[1], token[2])
      when 'EndTag'
        yield endTag(token[1])
      when 'EmptyTag'
        yield emptyTag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        text(token[1]) {|textToken| yield textToken}
      when 'Doctype'
        yield doctype(token[1])
      else
        # The previous ValueError(...) call referenced an undefined method
        # and an undefined local, hiding the real problem.
        raise ArgumentError, "Unknown token type: #{token[0]}"
      end
    end
  end
end
# Defines one test method per entry of every serializer test file: the
# entry's "input" token list is serialized with its "options" and the result
# is checked against the "expected" list.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5lib_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again; open(...).read left the handle open.
    tests = JSON::parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
        if expected.length == 1
          # A single expectation gets a readable diff via assert_equal.
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          # Several serializations are acceptable; any of them passes.
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end
      end
    end
  end
end

54
vendor/plugins/HTML5lib/tests/test_stream.rb vendored Executable file
View file

@ -0,0 +1,54 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
# Checks encoding detection, character reading and newline bookkeeping of
# HTMLInputStream.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib

  def test_char_ascii
    stream = HTMLInputStream.new("'")
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is read back as U+FFFD (UTF-8 bytes EF BF BD).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Byte 0x91 is reported as windows-1252 and read back as the UTF-8 bytes
  # E2 80 98.
  def test_char_win1252
    stream = HTMLInputStream.new("\x91")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # The UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  def test_utf_16
    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
    # Was assert(value, 'utf-16-le'), which treats the expected encoding as
    # the failure message and therefore never compared anything.
    assert_equal('utf-16-le', stream.char_encoding)
    assert_equal(1025, stream.chars_until(' ',true).length)
  end

  # "\r\n" and a lone "\r" are both read back as "\n"; @tell, #position and
  # @new_lines are checked along the way.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
    assert_equal([3,1], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
    assert_equal([4,5], stream.position)
    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
  end
end

View file

@ -0,0 +1,110 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
# Tree implementations exercised by the walker tests: each name maps to the
# tree builder used to parse the input and the tree walker used to read the
# resulting tree back. The hpricot pair is currently commented out.
$tree_types_to_test = {
'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']},
'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']},
# 'hpricot' =>
# {:builder => HTML5lib::TreeBuilders['hpricot'],
# :walker => HTML5lib::TreeWalkers['hpricot']},
}
# Announce which tree types the generated tests will cover.
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
# Parses every tree-construction test case with each configured tree builder,
# walks the resulting tree with the matching tree walker and compares the
# token stream, rendered as an indented dump, with the expected tree.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5lib::TestSupport

  # Yields the tokens of +tokens+ with every run of consecutive :Characters
  # and :SpaceCharacters tokens merged into one :Characters token.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken.nil?
          charactersToken = {:type => :Characters, :data => token[:data]}
        else
          charactersToken[:data] += token[:data]
        end
      else
        # A non-text token ends the current run: flush it first.
        if charactersToken
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken if charactersToken
  end

  # Renders a token stream as an indented tree dump: one line per start or
  # empty tag, attribute (sorted, "xmlns" skipped), comment, doctype and
  # text run. Returns the lines joined with newlines.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching end tag to undo the indentation.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    return output.join("\n")
  end

  html5lib_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')

    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?

      innerHTML, input, expected_output, expected_errors =
        HTML5lib::TestSupport::parseTestcase(data)

      $tree_types_to_test.each do |tree_name, treeClass|
        define_method "test_#{test_name}_#{index}_#{tree_name}" do
          parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])

          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end

          document = parser.tree.getDocument

          begin
            output = sortattrs(convertTokens(treeClass[:walker].new(document)))
            expected = sortattrs(expected_output)
            errorMsg = "\n\nExpected:\n#{expected}\nReceived:\n#{output}\n"
            assert_equal(expected, output, errorMsg)
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end

View file

@ -154,6 +154,21 @@ Example:
CSS: style.css math.css CSS: style.css math.css
=end =end
# Render to an HTML fragment wrapped in a single <div> that is attached to a
# new REXML document created with :respect_whitespace => :all.
#
# The div receives every element produced by children_to_html and, when the
# document has footnotes, the rendered footnotes as its last child.
#
# The method's value is the result of `doc << div`.
# NOTE(review): confirm that this is the document itself (as "returns a REXML
# document tree" would suggest) rather than the appended div.
def to_html_tree
  div = Element.new('div')
  children_to_html.each { |e| div << e }

  # render footnotes
  div << render_footnotes if @doc.footnotes_order.size > 0

  doc = Document.new(nil, {:respect_whitespace =>:all})
  doc << div
end
# Render to a complete HTML document (returns a REXML document tree) # Render to a complete HTML document (returns a REXML document tree)
def to_html_document_tree def to_html_document_tree