REXML Trees

Synced with latest HTML5lib.
Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
Jacques Distler 2007-06-05 16:34:49 -05:00
parent 4dd70af5ae
commit bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions

View file

@ -53,9 +53,10 @@ module Engines
def mask
require_dependency 'maruku'
require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
sanitize_xhtml(html.to_ncr)
# html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
# {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr)
end
end

View file

@ -8,19 +8,36 @@ module Sanitize
#
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitizes a REXML tree, returning a string
require 'html5lib/sanitizer'
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'string_utils'
require 'html5lib/sanitizer'
include HTML5lib
def sanitize_xhtml(html)
XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s
end
# Sanitize a fragment of HTML markup.
#
# Parses +html+ as a fragment with HTMLParser, using HTMLSanitizer as the
# tokenizer, and returns the parsed fragment converted with +to_s+.
def sanitize_html(html)
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
end
# Sanitize a REXML tree and serialize it, returning a string.
#
# The tree is first passed through +to_ncr+, then walked with the 'rexml'
# tree walker, and the resulting token stream is serialized by
# HTMLSerializer with sanitizing switched on.
#
# The serializer options are real booleans. The string 'false' is truthy
# in Ruby, so passing 'false' would silently *enable* the options it was
# meant to disable (e.g. :minimize_boolean_attributes).
#
# NOTE(review): HTMLSerializer#initialize only accepts a fixed list of option
# names and :encoding is not among them, so that key is currently ignored.
def sanitize_rexml(tree)
  tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
  HTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
    :quote_attr_values => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus => true,
    :space_before_trailing_solidus => true,
    :omit_optional_tags => false,
    :inject_meta_charset => false,
    :sanitize => true})
end
end

View file

@ -2155,3 +2155,20 @@ class String
end
end
require 'rexml/element'
module REXML
  class Element
    # Rewrites, in place, every text node and every attribute value of the
    # elements selected by the XPath '//*', replacing each with the result
    # of String#to_ncr (presumably a conversion to numeric character
    # references -- see the String extension in this file).
    #
    # Returns the receiver so calls can be chained.
    def to_ncr
      XPath.each(self, '//*') do |element|
        element.texts.each do |text_node|
          text_node.value = text_node.to_s.to_ncr
        end
        element.attributes.each do |attr_name, attr_value|
          element.attributes[attr_name] = attr_value.to_ncr
        end
      end
      self
    end
  end
end

View file

@ -148,6 +148,26 @@ module HTML5lib
input
]
# Attributes that may be written in minimized form (name only, no value).
# Keys are element names mapping to the attribute names that are boolean on
# that element; the special :global key lists attributes that are boolean on
# any element. HTMLSerializer consults this table when its
# minimize_boolean_attributes option is enabled.
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN

View file

@ -37,13 +37,13 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# html5lib.treebuilders.getTreeBuilder(treeType)
# HTML5lib::TreeBuilders[treeType]
def initialize(options = {})
@strict = false
@errors = []
@tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXMLTree::TreeBuilder
@tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) }

View file

@ -58,7 +58,7 @@ module HTML5lib
unless @char_encoding == 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
rescue
end
end
@ -95,11 +95,13 @@ module HTML5lib
#First look for a BOM
#This will also read past the BOM if present
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if avaliable
if encoding.nil? and @chardet
begin
@ -111,13 +113,14 @@ module HTML5lib
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
@ -132,10 +135,10 @@ module HTML5lib
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be'
"\xff\xfe" => 'utf16le',
"\xfe\xff" => 'utf16be',
"\xff\xfe\x00\x00" => 'utf32le',
"\x00\x00\xfe\xff" => 'utf32be'
}
# Go to beginning of file and read in 4 bytes
@ -205,7 +208,17 @@ module HTML5lib
else
begin
@tell += 1
return @data_stream[@tell - 1].chr
c = @data_stream[@tell - 1]
case c
when 0xC2 .. 0xDF
@tell += 1
c.chr + @data_stream[@tell-1].chr
when 0xE0 .. 0xF0
@tell += 2
c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
else
c.chr
end
rescue
return :EOF
end
@ -227,8 +240,8 @@ module HTML5lib
else
# Then the rest
begin
char_stack.push(@data_stream[@tell].chr)
@tell += 1
char_stack.push(@data_stream[@tell-1].chr)
rescue
char_stack.push(:EOF)
break

View file

@ -1,4 +1,3 @@
require 'html5lib/tokenizer'
require 'cgi'
module HTML5lib
@ -6,7 +5,7 @@ module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
@ -96,19 +95,7 @@ module HTML5lib
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
def process_token(token)
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
@ -126,7 +113,7 @@ module HTML5lib
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
return token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
@ -139,12 +126,11 @@ module HTML5lib
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
return token
end
else
yield token
return token
end
end
end
def sanitize_css(style)
@ -174,4 +160,23 @@ module HTML5lib
style = clean.join(' ')
end
end
# Token-stream filter that sanitizes an existing stream: every token
# produced by the wrapped source is passed through process_token before
# being handed to the caller's block.
class HTMLSanitizeFilter < Filter
  include HTMLSanitizeModule

  def each
    @source.each { |tok| yield(process_token(tok)) }
  end
end
# Sanitizing tokenizer: behaves like HTMLTokenizer, except that each token
# the superclass produces is passed through process_token before being
# yielded.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  def each
    super { |tok| yield(process_token(tok)) }
  end
end
end

View file

@ -0,0 +1,418 @@
require 'html5lib/constants'
require 'jcode'
module HTML5lib
# Base class for token-stream filters. A filter wraps a token source and
# is itself Enumerable; subclasses supply +each+ and read the wrapped
# stream from @source.
class Filter
  include Enumerable

  # source:: the token stream this filter wraps (stored as @source).
  def initialize(source)
    @source = source
  end
end
# Filter that drops start and end tags which the HTML syntax allows to be
# omitted. Each decision looks at the token itself plus its immediate
# neighbours in the stream (see #slider).
class OptionalTagFilter < Filter
  # Iterates over the source with a three-token sliding window, yielding
  # (previous, current, next) for every token; +previous+ is nil for the
  # first token and +next+ is nil for the last one.
  #
  # An empty source yields nothing. (Without the final guard the method
  # would yield (nil, nil, nil) and #each would fail on nil[:type].)
  def slider
    previous1 = previous2 = nil
    @source.each do |token|
      yield previous2, previous1, token if previous1 != nil
      previous2 = previous1
      previous1 = token
    end
    yield previous2, previous1, nil unless previous1.nil?
  end

  # Yields every token of the source except omissible start/end tags.
  # A start tag is only ever dropped when it carries no attributes.
  def each
    slider do |previous, token, nexttok|
      type = token[:type]
      if type == :StartTag
        yield token unless token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
      elsif type == :EndTag
        yield token unless is_optional_end(token[:name], nexttok)
      else
        yield token
      end
    end
  end

  # Returns true when the start tag of +tagname+ may be omitted, given the
  # token before it (+previous+) and the token after it (+nexttok+); either
  # neighbour may be nil at the edges of the stream.
  def is_optional_start(tagname, previous, nexttok)
    type = nexttok ? nexttok[:type] : nil
    if tagname == 'html'
      # An html element's start tag may be omitted if the first thing
      # inside the html element is not a space character or a comment.
      return ![:Comment, :SpaceCharacters].include?(type)
    elsif tagname == 'head'
      # A head element's start tag may be omitted if the first thing
      # inside the head element is an element.
      return type == :StartTag
    elsif tagname == 'body'
      # A body element's start tag may be omitted if the first thing
      # inside the body element is not a space character or a comment,
      # except if the first thing inside the body element is a script
      # or style element and the node immediately preceding the body
      # element is a head element whose end tag has been omitted.
      if [:Comment, :SpaceCharacters].include?(type)
        return false
      elsif type == :StartTag
        # XXX: we do not look at the preceding event, so we never omit
        # the body element's start tag if it's followed by a script or
        # a style element.
        return !%w[script style].include?(nexttok[:name])
      else
        return true
      end
    elsif tagname == 'colgroup'
      # A colgroup element's start tag may be omitted if the first thing
      # inside the colgroup element is a col element, and if the element
      # is not immediately preceded by another colgroup element whose
      # end tag has been omitted.
      if type == :StartTag
        # XXX: we do not look at the preceding event, so instead we never
        # omit the colgroup element's end tag when it is immediately
        # followed by another colgroup element. See is_optional_end.
        return nexttok[:name] == "col"
      else
        return false
      end
    elsif tagname == 'tbody'
      # A tbody element's start tag may be omitted if the first thing
      # inside the tbody element is a tr element, and if the element is
      # not immediately preceded by a tbody, thead, or tfoot element
      # whose end tag has been omitted.
      if type == :StartTag
        # omit the thead and tfoot elements' end tag when they are
        # immediately followed by a tbody element. See is_optional_end.
        if previous && previous[:type] == :EndTag &&
            %w(tbody thead tfoot).include?(previous[:name])
          return false
        end
        return nexttok[:name] == 'tr'
      else
        return false
      end
    end
    return false
  end

  # Returns true when the end tag of +tagname+ may be omitted, given the
  # token that follows it (+nexttok+, nil at the end of the stream).
  def is_optional_end(tagname, nexttok)
    type = nexttok ? nexttok[:type] : nil
    if %w[html head body].include?(tagname)
      # An html element's end tag may be omitted if the html element
      # is not immediately followed by a space character or a comment.
      return ![:Comment, :SpaceCharacters].include?(type)
    elsif %w[li optgroup option tr].include?(tagname)
      # A li element's end tag may be omitted if the li element is
      # immediately followed by another li element or if there is
      # no more content in the parent element.
      # An optgroup element's end tag may be omitted if the optgroup
      # element is immediately followed by another optgroup element,
      # or if there is no more content in the parent element.
      # An option element's end tag may be omitted if the option
      # element is immediately followed by another option element,
      # or if there is no more content in the parent element.
      # A tr element's end tag may be omitted if the tr element is
      # immediately followed by another tr element, or if there is
      # no more content in the parent element.
      if type == :StartTag
        return nexttok[:name] == tagname
      else
        return type == :EndTag || type == nil
      end
    elsif %w(dt dd).include?(tagname)
      # A dt element's end tag may be omitted if the dt element is
      # immediately followed by another dt element or a dd element.
      # A dd element's end tag may be omitted if the dd element is
      # immediately followed by another dd element or a dt element,
      # or if there is no more content in the parent element.
      if type == :StartTag
        return %w(dt dd).include?(nexttok[:name])
      elsif tagname == 'dd'
        return type == :EndTag || type == nil
      else
        return false
      end
    elsif tagname == 'p'
      # A p element's end tag may be omitted if the p element is
      # immediately followed by an address, blockquote, dl, fieldset,
      # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
      # or ul element, or if there is no more content in the parent
      # element.
      if type == :StartTag
        return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
                  h6 hr menu ol p pre table ul).include?(nexttok[:name])
      else
        return type == :EndTag || type == nil
      end
    elsif tagname == 'colgroup'
      # A colgroup element's end tag may be omitted if the colgroup
      # element is not immediately followed by a space character or
      # a comment.
      if [:Comment, :SpaceCharacters].include?(type)
        return false
      elsif type == :StartTag
        # XXX: we also look for an immediately following colgroup
        # element. See is_optional_start.
        return nexttok[:name] != 'colgroup'
      else
        return true
      end
    elsif %w(thead tbody).include? tagname
      # A thead element's end tag may be omitted if the thead element
      # is immediately followed by a tbody or tfoot element.
      # A tbody element's end tag may be omitted if the tbody element
      # is immediately followed by a tbody or tfoot element, or if
      # there is no more content in the parent element.
      # A tfoot element's end tag may be omitted if the tfoot element
      # is immediately followed by a tbody element, or if there is no
      # more content in the parent element.
      # XXX: we never omit the end tag when the following element is
      # a tbody. See is_optional_start.
      if type == :StartTag
        return %w(tbody tfoot).include?(nexttok[:name])
      elsif tagname == 'tbody'
        return (type == :EndTag || type == nil)
      else
        return false
      end
    elsif tagname == 'tfoot'
      # A tfoot element's end tag may be omitted if the tfoot element
      # is immediately followed by a tbody element, or if there is no
      # more content in the parent element.
      # XXX: we never omit the end tag when the following element is
      # a tbody. See is_optional_start.
      if type == :StartTag
        return nexttok[:name] == 'tbody'
      else
        return type == :EndTag || type == nil
      end
    elsif %w(td th).include? tagname
      # A td element's end tag may be omitted if the td element is
      # immediately followed by a td or th element, or if there is
      # no more content in the parent element.
      # A th element's end tag may be omitted if the th element is
      # immediately followed by a td or th element, or if there is
      # no more content in the parent element.
      if type == :StartTag
        return %w(td th).include?(nexttok[:name])
      else
        return type == :EndTag || type == nil
      end
    end
    return false
  end
end
# Serializes a stream of tokens (hashes with a :type key, as produced by the
# tree walkers) into an HTML string.
#
# Options (symbol or string keys; unknown keys are ignored):
#   quote_attr_values, quote_char, use_best_quote_char,
#   minimize_boolean_attributes, use_trailing_solidus,
#   space_before_trailing_solidus, omit_optional_tags, sanitize,
#   strip_whitespace, inject_meta_charset, strict
#
# Option values are tested for truthiness, so pass real booleans: the string
# 'false' counts as true in Ruby.
class HTMLSerializer
  CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+ without an output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream)
  end

  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true
    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @omit_optional_tags = true
    @sanitize = false
    @strip_whitespace = false
    @inject_meta_charset = true
    # Read by serializeError; previously never initialised.
    @strict = false

    options.each do |name, value|
      next unless %w(quote_attr_values quote_char use_best_quote_char
                     minimize_boolean_attributes use_trailing_solidus
                     space_before_trailing_solidus omit_optional_tags sanitize
                     strip_whitespace inject_meta_charset strict).include? name.to_s
      # An explicitly chosen quote character disables automatic selection.
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end
    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+ and returns the markup as
  # one string.
  #
  # treewalker:: any object whose +each+ yields token hashes.
  # encoding::   optional target encoding name. When given, every emitted
  #              piece is transcoded to it and (if inject_meta_charset is
  #              enabled) a <meta charset> token is injected into <head>.
  #
  # Serialization problems are collected in @errors; with the strict option
  # enabled the first one raises SerializeError.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []
    if encoding && @inject_meta_charset
      treewalker = filter_inject_meta_charset(treewalker, encoding)
    end
    treewalker = filter_whitespace(treewalker) if @strip_whitespace
    if @sanitize
      # Loaded lazily so the sanitizer is only required when it is used.
      require 'html5lib/sanitizer'
      treewalker = HTMLSanitizeFilter.new(treewalker)
    end
    # if @omit_optional_tags
    #   treewalker = OptionalTagFilter.new(treewalker)
    # end

    result = []
    treewalker.each do |token|
      type = token[:type]
      if type == :Doctype
        result << transcode("<!DOCTYPE %s>" % token[:name], encoding)
      elsif [:Characters, :SpaceCharacters].include?(type)
        if type == :SpaceCharacters || in_cdata
          # Whitespace and CDATA-element content are emitted verbatim.
          if in_cdata && token[:data].include?("</")
            serializeError("Unexpected </ in CDATA")
          end
          result << transcode(token[:data], encoding)
        else
          result << transcode(escape_text(token[:data]), encoding)
        end
      elsif [:StartTag, :EmptyTag].include?(type)
        name = token[:name]
        if CDATA_ELEMENTS.include?(name)
          in_cdata = true
        elsif in_cdata
          serializeError("Unexpected child element of a CDATA element")
        end
        attributes = serialize_attributes(name, token[:data], encoding)
        if VOID_ELEMENTS.include?(name) && @use_trailing_solidus
          attributes << (@space_before_trailing_solidus ? " /" : "/")
        end
        result << "<%s%s>" % [transcode(name, encoding), attributes]
      elsif type == :EndTag
        name = token[:name]
        if CDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serializeError("Unexpected child element of a CDATA element")
        end
        result << transcode("</%s>" % name, encoding)
      elsif type == :Comment
        data = token[:data]
        serializeError("Comment contains --") if data.index("--")
        result << transcode("<!--%s-->" % data, encoding)
      else
        # Anything else (e.g. an error token from a tree walker).
        serializeError(token[:data])
      end
    end
    result.join('')
  end

  # Same as #serialize; kept for callers that use the +render+ name.
  def render(treewalker, encoding=nil)
    serialize(treewalker, encoding)
  end

  # Records a serialization problem in @errors and, in strict mode, raises
  # SerializeError.
  def serializeError(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    if @strict
      raise SerializeError
    end
  end

  # Returns the tokens of +treewalker+ as an array, with a
  # <meta charset="encoding"> empty-tag token inserted directly after the
  # first <head> start tag (so that it ends up inside the head element).
  def filter_inject_meta_charset(treewalker, encoding)
    done = false
    tokens = []
    treewalker.each do |token|
      tokens << token
      if !done && token[:type] == :StartTag && token[:name].downcase == "head"
        tokens << {:type => :EmptyTag, :name => "meta",
                   :data => {"charset" => encoding}}
        done = true
      end
    end
    tokens
  end

  def filter_whitespace(treewalker)
    raise NotImplementedError
  end

  private

  # Transcodes +text+ to +encoding+ when an encoding was requested and the
  # running Ruby provides String#encode; otherwise returns +text+ untouched.
  def transcode(text, encoding)
    return text unless encoding && text.respond_to?(:encode)
    text.encode(encoding)
  end

  # Escapes the characters that are special in HTML text content.
  def escape_text(text)
    text.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # True when +attr_name+ is listed as a boolean attribute for +tag_name+
  # or as a global boolean attribute.
  def boolean_attribute?(tag_name, attr_name)
    (BOOLEAN_ATTRIBUTES[tag_name] || []).include?(attr_name) ||
      BOOLEAN_ATTRIBUTES[:global].include?(attr_name)
  end

  # True when +value+ contains whitespace, '<', '>' or a quote character and
  # therefore cannot be written as an unquoted attribute value.
  # SPACE_CHARACTERS is interpolated into a character class, the same way
  # TreeWalkers::TokenConstructor#text uses it.
  def needs_quoting?(value)
    @needs_quoting_re ||= /[#{SPACE_CHARACTERS.join('')}<>"']/
    !(value =~ @needs_quoting_re).nil?
  end

  # Builds the attribute portion of a start tag (with a leading space per
  # attribute). Attributes are emitted sorted by name so the output is
  # deterministic. +attrs+ may be a hash or an array of [name, value] pairs.
  def serialize_attributes(name, attrs, encoding)
    parts = []
    attrs.to_a.sort_by { |attr_name, _| attr_name.to_s }.each do |k, v|
      k = transcode(k, encoding)
      parts << ' ' << k
      # Minimized boolean attributes are written as the bare name.
      next if @minimize_boolean_attributes && boolean_attribute?(name, k)

      parts << "="
      quote_attr = @quote_attr_values || v.empty? || needs_quoting?(v)
      v = transcode(v.gsub("&", "&amp;"), encoding)
      if quote_attr
        quote_char = @quote_char
        if @use_best_quote_char
          # Prefer the quote character that does not occur in the value.
          if v.index("'") && !v.index('"')
            quote_char = '"'
          elsif v.index('"') && !v.index("'")
            quote_char = "'"
          end
        end
        v = quote_char == "'" ? v.gsub("'", "&#39;") : v.gsub('"', "&quot;")
        parts << quote_char << v << quote_char
      else
        parts << v
      end
    end
    parts.join('')
  end
end
# Error in serialized tree.
#
# Derives from StandardError (not Exception) so that it is caught by a plain
# +rescue+, like other application-level errors; handlers that rescue
# SerializeError or Exception explicitly keep working.
class SerializeError < StandardError
end
end

View file

@ -1,21 +1,24 @@
module HTML5lib
module TreeBuilders
def self.getTreeBuilder(name)
case name.to_s.downcase
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder
REXML::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
end
end
end
alias :getTreeBuilder :[]
end
end
end

View file

@ -144,7 +144,7 @@ module HTML5lib
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
return if @activeFormattingElements.empty?
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1

View file

@ -1,4 +1,5 @@
require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
@ -26,12 +27,14 @@ module HTML5lib
childNodes << node
hpricot.children << node.hpricot
end
node.hpricot.parent = hpricot
node.parent = self
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.hpricot.parent = nil
node.parent = nil
end
@ -48,6 +51,7 @@ module HTML5lib
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
childNodes.insert(index, node)
end
end

View file

@ -4,7 +4,7 @@ require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
module REXML
class Node < Base::Node
extend Forwardable
@ -52,6 +52,7 @@ module HTML5lib
childNodes[index-1].rxobj.raw = true
else
childNodes.insert index, node
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
end
end
@ -62,7 +63,7 @@ module HTML5lib
class Element < Node
def self.rxclass
REXML::Element
::REXML::Element
end
def initialize name
@ -95,7 +96,7 @@ module HTML5lib
class Document < Node
def self.rxclass
REXML::Document
::REXML::Document
end
def initialize
@ -120,7 +121,7 @@ module HTML5lib
class DocumentType < Node
def self.rxclass
REXML::DocType
::REXML::DocType
end
def printTree indent=0
@ -145,7 +146,7 @@ module HTML5lib
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
@rxobj = ::REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
@ -155,7 +156,7 @@ module HTML5lib
class CommentNode < Node
def self.rxclass
REXML::Comment
::REXML::Comment
end
def printTree indent=0

View file

@ -0,0 +1,26 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
class << self
# Looks up a tree walker class by name (compared case-insensitively after
# to_s). The file implementing the walker is required on first use.
# Raises a RuntimeError for names other than 'simpletree', 'rexml' and
# 'hpricot'.
def [](name)
  key = name.to_s.downcase
  if key == 'simpletree'
    require 'html5lib/treewalkers/simpletree'
    SimpleTree::TreeWalker
  elsif key == 'rexml'
    require 'html5lib/treewalkers/rexml'
    REXML::TreeWalker
  elsif key == 'hpricot'
    require 'html5lib/treewalkers/hpricot'
    Hpricot::TreeWalker
  else
    raise "Unknown TreeWalker #{name}"
  end
end
alias :getTreeWalker :[]
end
end
end

View file

@ -0,0 +1,156 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
# Helpers that build the token hashes emitted by the tree walkers.
# Every token has a :type; most also carry :name and/or :data.
module TokenConstructor
  # Token describing a problem found while walking the tree.
  def error(msg)
    return {:type => "SerializeError", :data => msg}
  end

  # Converts an attribute collection to an array of pairs.
  def normalizeAttrs(attrs)
    attrs.to_a
  end

  # Token for a void element.
  # NOTE(review): when hasChildren is true an error token is built but its
  # value is discarded, so callers never see it -- confirm whether it was
  # meant to be emitted.
  def emptyTag(name, attrs, hasChildren=false)
    error(_("Void element has children")) if hasChildren
    return({:type => :EmptyTag, :name => name, \
            :data => normalizeAttrs(attrs)})
  end

  def startTag(name, attrs)
    return {:type => :StartTag, :name => name, \
            :data => normalizeAttrs(attrs)}
  end

  def endTag(name)
    return {:type => :EndTag, :name => name, :data => []}
  end

  # Splits +data+ into tokens and yields them in order: an optional
  # :SpaceCharacters token for leading whitespace, a :Characters token for
  # the remainder, and an optional :SpaceCharacters token for trailing
  # whitespace. Text consisting only of whitespace yields a single
  # :SpaceCharacters token.
  #
  # The patterns are anchored with \A and \z (whole string). With ^ and $
  # they would also match at line boundaries inside multi-line text, and the
  # slicing below would then cut the text at the wrong place.
  def text(data)
    spaces = SPACE_CHARACTERS.join('')
    if data =~ /\A([#{spaces}]+)/
      leading = $1
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    if data =~ /([#{spaces}]+)\z/
      trailing = $1
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    else
      yield({:type => :Characters, :data => data})
    end
  end

  def comment(data)
    return {:type => :Comment, :data => data}
  end

  # Doctype token; :data is true when the name is "html" in any letter case.
  def doctype(name)
    return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
  end

  def unknown(nodeType)
    return error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Message hook; currently returns the string unchanged.
  def _(str)
    str
  end
end
# Common base of all tree walkers: stores the tree to walk and mixes in
# the token-building helpers. Concrete walkers must provide +each+.
class Base
  include TokenConstructor

  # tree:: the root object to walk (kept in @tree).
  def initialize(tree)
    @tree = tree
  end

  # Abstract; subclasses yield one token per call to the block.
  def each
    raise NotImplementedError
  end

  # +walk+ is another name for +each+.
  alias_method :walk, :each
end
# Iterative (stack-free) depth-first tree walker.
#
# Subclasses describe their tree through four primitives -- node_details,
# first_child, next_sibling and parent -- and inherit #each, which turns
# the tree rooted at @tree into a stream of tokens.
class NonRecursiveTreeWalker < TreeWalkers::Base
# Must return an array whose first element is the node kind
# (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT,
# nil to ignore the node, or anything else for "unknown") followed by
# the kind-specific details consumed in #each.
def node_details(node)
raise NotImplementedError
end
# Must return the first child of +node+, or nil.
def first_child(node)
raise NotImplementedError
end
# Must return the sibling following +node+, or nil.
def next_sibling(node)
raise NotImplementedError
end
# Must return the parent of +node+.
def parent(node)
raise NotImplementedError
end
# Yields the tokens for the tree rooted at @tree in document order.
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
# details.shift removes the node kind, leaving only the extra details.
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
# Never descend into a void element, whatever the tree says.
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
# Descend.
currentNode = firstChild
else
# No children to visit: close this node, then move to the next sibling,
# climbing (and closing ancestors) until one is found or the root is reached.
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
# Back at the root: the walk is finished.
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module Hpricot
# Tree walker for Hpricot trees: maps Hpricot node classes onto the node
# kinds understood by NonRecursiveTreeWalker#each.
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
  # Returns [kind, *details] for +node+; see NonRecursiveTreeWalker.
  def node_details(node)
    case node
    when ::Hpricot::Elem
      # An element without a name is treated as a document fragment.
      return [:DOCUMENT_FRAGMENT] unless node.name
      tag = node.name
      attribute_pairs = node.attributes.map { |key, val| [key, val] }
      [:ELEMENT, tag, attribute_pairs, !node.empty?]
    when ::Hpricot::Text
      [:TEXT, node.to_plain_text]
    when ::Hpricot::Comment
      [:COMMENT, node.content]
    when ::Hpricot::Doc
      [:DOCUMENT]
    when ::Hpricot::DocType
      [:DOCTYPE, node.target]
    when ::Hpricot::XMLDecl
      # Ignored by the walker.
      [nil]
    else
      [:UNKNOWN, node.class.inspect]
    end
  end

  def first_child(node)
    kids = node.children
    kids.first
  end

  def next_sibling(node)
    node.next_node
  end

  def parent(node)
    node.parent
  end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module REXML
# Tree walker for REXML trees: maps REXML node classes onto the node kinds
# understood by NonRecursiveTreeWalker#each.
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
  # Returns [kind, *details] for +node+; see NonRecursiveTreeWalker.
  # Document is tested before Element -- keep this order (in REXML,
  # Document is a subclass of Element).
  def node_details(node)
    case node
    when ::REXML::Document
      [:DOCUMENT]
    when ::REXML::Element
      # An element without a name is treated as a document fragment.
      return [:DOCUMENT_FRAGMENT] unless node.name
      tag = node.name
      attribute_pairs = node.attributes.map { |key, val| [key, val] }
      [:ELEMENT, tag, attribute_pairs, node.has_elements? || node.has_text?]
    when ::REXML::Text
      [:TEXT, node.value]
    when ::REXML::Comment
      [:COMMENT, node.string]
    when ::REXML::DocType
      [:DOCTYPE, node.name]
    when ::REXML::XMLDecl
      # Ignored by the walker.
      [nil]
    else
      [:UNKNOWN, node.class.inspect]
    end
  end

  def first_child(node)
    kids = node.children
    kids.first
  end

  def next_sibling(node)
    node.next_sibling
  end

  def parent(node)
    node.parent
  end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
module SimpleTree
# Recursive tree walker for SimpleTree trees.
class TreeWalker < HTML5lib::TreeWalkers::Base
  include HTML5lib::TreeBuilders::SimpleTree

  # Yields the tokens for +node+ and, for non-void elements, for all of
  # its descendants. Document and DocumentFragment nodes yield nothing
  # themselves. Node classes that are not recognised yield an error token
  # (see TokenConstructor#unknown); nothing is printed to stdout.
  def walk(node)
    case node
    when Document, DocumentFragment
      return
    when DocumentType
      yield doctype(node.name)
    when TextNode
      text(node.value) {|token| yield token}
    when Element
      if VOID_ELEMENTS.include?(node.name)
        yield emptyTag(node.name, node.attributes, node.hasContent())
      else
        yield startTag(node.name, node.attributes)
        node.childNodes.each do |child|
          walk(child) {|token| yield token}
        end
        yield endTag(node.name)
      end
    when CommentNode
      yield comment(node.value)
    else
      yield unknown(node.class)
    end
  end

  # Yields the tokens for every child of the tree given to the constructor.
  def each
    @tree.childNodes.each do |child|
      walk(child) {|node| yield node}
    end
  end
end
end
end
end

137
vendor/plugins/HTML5lib/parse.rb vendored Executable file
View file

@ -0,0 +1,137 @@
#!/usr/bin/env ruby
#
# Parse a document to a simpletree tree, with optional profiling
# Make this script's directory and the lib/ directory next to it requirable,
# independently of the current working directory. (A bare 'lib' entry would
# be resolved against the working directory instead of the script location.)
$:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
# Parses the document named by the last element of +args+ according to
# +opts+ and prints the result via printOutput (nothing is printed when
# profiling).
#
# opts:: option object responding to treebuilder, output, profile, time.
# args:: remaining command-line arguments; the last one names the input.
#
# Exits with status 1 when no input argument was given.
def parse(opts, args)
  f = args[-1]
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    require 'open-uri' if f[0..6] == 'http://'
    f = open(f)
  rescue
    # Deliberately ignored: if opening fails, f keeps the argument string
    # and that string is what gets handed to the parser below.
  end

  require 'html5lib/treebuilders'
  builder = HTML5lib::TreeBuilders[opts.treebuilder]

  parser =
    if opts.output == :xml
      require 'html5lib/liberalxmlparser'
      HTML5lib::XHTMLParser.new(:tree=>builder)
    else
      require 'html5lib/html5parser'
      HTML5lib::HTMLParser.new(:tree=>builder)
    end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.parse(f)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time'
    started = Time.new
    document = parser.parse(f)
    parsed = Time.new
    printOutput(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[parsed-started, printed-parsed]
  else
    document = parser.parse(f)
    printOutput(parser, document, opts)
  end
end
# Prints +document+ to stdout in the format selected by opts.output
# (:xml, :html, :hilite or :tree; anything else prints nothing),
# optionally preceded by the detected character encoding, and writes the
# parser's error list to stderr when opts.error is set.
def printOutput(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  case opts.output
  when :xml
    print document
  when :html
    require 'html5lib/treewalkers'
    walker = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
    require 'html5lib/serializer'
    print HTML5lib::HTMLSerializer.serialize(walker, :encoding=>'utf-8')
  when :hilite
    print document.hilite
  when :tree
    print parser.tree.testSerializer(document)
  end

  return unless opts.error

  # Each error is a [position, message] pair; position is [line, column].
  report = parser.errors.map do |pos, message|
    ("Line %i Col %i"%pos + " " + message)
  end
  $stderr.write("\nParse errors:\n" + report.join("\n")+"\n")
end
# ---- Command-line entry point ----
# Defaults: print the parsed tree using the simpletree builder, with no
# profiling, timing, error listing or encoding report.
require 'ostruct'
options = OpenStruct.new
options.profile = false
options.time = false
options.output = :tree
options.treebuilder = 'simpletree'
options.error = false
options.encoding = false
require 'optparse'
# Each switch below only records the user's choice in +options+; the work
# is done by parse. Note that the block parameter shadows the outer +opts+.
opts = OptionParser.new do |opts|
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
# --tree selects tree output; --no-tree suppresses all output.
opts.on("--[no-]tree", "Do not print output tree") do |tree|
if tree
options.output = :tree
else
options.output = nil
end
end
opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder
end
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
# XML output also switches the tree builder to rexml.
opts.on("-x", "--xml", "output as xml") do |xml|
options.output = :xml
options.treebuilder = "rexml"
end
opts.on("--html", "Output as html") do |html|
options.output = :html
end
opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
options.output = :hilite
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
end
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
end
# parse! removes the recognised switches from ARGV; what remains is passed
# on to parse, which uses the last remaining argument as its input.
opts.parse!(ARGV)
parse options, ARGV

View file

@ -21,3 +21,53 @@ rescue LoadError
end
end
end
module HTML5lib
# Helpers shared by the HTML5lib test cases for reading the testcase
# data format and normalising tree dumps.
module TestSupport
  # True when string +b+ begins with string +a+.
  def self.startswith?(a, b)
    b[0... a.length] == a
  end

  # Splits one testcase in the "#data / #errors / #document" format.
  #
  # Returns [innerHTML, input, output, errors] where
  #   innerHTML:: the context element named on a "#document-fragment" line,
  #               or nil for whole-document tests;
  #   input::     the lines before "#errors", joined with newlines;
  #   output::    the expected tree, with the leading "| " of each line
  #               removed, joined with newlines;
  #   errors::    array of the lines between "#errors" and "#document".
  #
  # Raises ArgumentError when a "#document-fragment" line names no element.
  def self.parseTestcase(data)
    innerHTML = nil
    input = []
    output = []
    errors = []
    currentList = input
    data.split(/\n/).each do |line|
      if !line.empty? && !startswith?("#errors", line) &&
          !startswith?("#document", line) &&
          !startswith?("#data", line) &&
          !startswith?("#document-fragment", line)
        # Identity check, not ==: two empty arrays compare equal, which
        # would make "|"-prefixed *input* lines look like tree output and
        # strip their first two characters.
        if currentList.equal?(output) && startswith?("|", line)
          currentList.push(line[2..-1])
        else
          currentList.push(line)
        end
      elsif line == "#errors"
        currentList = errors
      elsif line == "#document" || startswith?("#document-fragment", line)
        if startswith?("#document-fragment", line)
          # Everything after "#document-fragment " is the context element.
          innerHTML = line[19..-1]
          raise ArgumentError, "#document-fragment line without an element name" unless innerHTML
        end
        currentList = output
      end
    end
    return innerHTML, input.join("\n"), output.join("\n"), errors
  end

  # convert the output of str(document) to the format used in the testcases
  # (drops the first line and strips the three-character prefix from lines
  # that start with "|").
  def convertTreeDump(treedump)
    treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
  end

  # Sorts each run of consecutive, equally indented attribute lines
  # ("name=value") so that attribute order does not affect comparisons.
  def sortattrs(output)
    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
      match.split("\n").sort.join("\n")
    end
  end
end
end

View file

@ -4,33 +4,33 @@ require 'html5lib/inputstream'
# Encoding-detection tests for HTML5lib::HTMLInputStream.
#
# Defines test_chardet when the UniversalDetector library can be loaded, and
# generates one test method per "#data" section found in the 'encoding' test
# files, comparing the stream's detected encoding with the value given after
# "#encoding".
#
# NOTE(review): several statements below appear twice in a row (apparently
# the same lines before and after re-indentation) -- confirm that only one
# copy of each is intended.
class Html5EncodingTestCase < Test::Unit::TestCase
begin
begin
require 'rubygems'
require 'UniversalDetector'
# Only defined when the requires above succeed.
def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
end
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
end
end
rescue LoadError
rescue LoadError
# A missing library is reported on stdout rather than failing the suite.
puts "chardet not found, skipping chardet tests"
end
end
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
# Each section is "<input>\n#encoding <name>"; only the first word of the
# encoding part is used.
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
end
end

View file

@ -14,53 +14,12 @@ end
$CHECK_PARSER_ERRORS = false
puts 'Testing: ' + $tree_types_to_test * ', '
puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
include HTML5lib
include TestSupport
html5lib_test_files('tree-construction').each do |test_file|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
innerHTML, input, expected_output, expected_errors =
TestSupport.parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
if innerHTML
parser.parseFragment(input, innerHTML)

View file

@ -2,9 +2,11 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/sanitizer'
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib

View file

@ -0,0 +1,52 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/serializer'
require 'html5lib/treewalkers'
#Run the serialize error checks
checkSerializeErrors = false
# Tree walker used by the serializer tests. It iterates over @tree, treating
# each entry as an array whose first element is a token type name
# ('StartTag', 'EndTag', ...) and whose remaining elements are that token's
# arguments, and yields the tokens produced by the TreeWalkers::Base helpers.
# NOTE(review): @tree is presumably stored by Base#initialize from the
# constructor argument -- confirm against the treewalkers library.
class JsonWalker < HTML5lib::TreeWalkers::Base
  # Yields one walker token per entry of @tree.
  #
  # Raises ArgumentError when an entry has an unrecognised type name.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield startTag(token[1], token[2])
      when 'EndTag'
        yield endTag(token[1])
      when 'EmptyTag'
        yield emptyTag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        # text may produce several tokens; forward each of them.
        text(token[1]) { |textToken| yield textToken }
      when 'Doctype'
        yield doctype(token[1])
      else
        # Report the offending type name; ValueError is not a Ruby class.
        raise ArgumentError, "Unknown token type: #{token[0]}"
      end
    end
  end
end
# Generates one test method per entry of the "tests" array in each
# 'serializer' test file. Every test serializes the entry's "input" token
# list through JsonWalker with the entry's "options" and compares the result
# with the "expected" list.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5lib_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file; a bare open(...).read leaves the handle
    # open until it is garbage collected.
    tests = JSON.parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
        if expected.length == 1
          # A single expected value gives the most readable failure diff.
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          # Several serializations are acceptable; any one of them passes.
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end
      end
    end
  end
end

54
vendor/plugins/HTML5lib/tests/test_stream.rb vendored Executable file
View file

@ -0,0 +1,54 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
# Unit tests for HTMLInputStream: encoding detection, byte-order-mark
# handling, character decoding and newline/position bookkeeping.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib

  def test_char_ascii
    stream = HTMLInputStream.new("'")
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is expected to come back as the bytes EF BF BD.
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Byte 0x91 is expected to be detected as windows-1252 and returned as
  # the bytes E2 80 98.
  def test_char_win1252
    stream = HTMLInputStream.new("\x91")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # The leading EF BB BF bytes must not be returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  def test_utf_16
    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
    # assert(value, message) only checks truthiness and would pass for any
    # detected encoding; compare the value explicitly instead.
    assert_equal('utf-16-le', stream.char_encoding)
    assert_equal(1025, stream.chars_until(' ',true).length)
  end

  # Checks that "\r\n" and "\r" come back as "\n" and that the stream's
  # @tell, position and @new_lines track the consumed input.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
    assert_equal([3,1], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
    assert_equal([4,5], stream.position)
    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
  end
end

View file

@ -0,0 +1,110 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
# Tree implementations exercised by the tree-walker tests: maps each tree
# name to the builder used to parse a document and the walker used to
# traverse the result. The hpricot entry is commented out, so it is not
# tested.
$tree_types_to_test = {
'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']},
'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']},
# 'hpricot' =>
# {:builder => HTML5lib::TreeBuilders['hpricot'],
# :walker => HTML5lib::TreeWalkers['hpricot']},
}
# Announce on stdout which tree types this run covers.
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
# Tree-walker tests. For every tree-construction test case and every entry of
# $tree_types_to_test, the input is parsed with that entry's builder, the
# resulting document is traversed with its walker, and the token stream is
# rendered back into the indented dump format used by the expected output.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5lib::TestSupport

  # Yields the tokens of +tokens+ in order, with each run of adjacent
  # :Characters / :SpaceCharacters tokens merged into a single :Characters
  # token whose :data is the concatenation of the run.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken
          # += builds a new string, so the source token's data is untouched.
          charactersToken[:data] += token[:data]
        else
          charactersToken = {:type => :Characters, :data => token[:data]}
        end
      else
        # A non-text token ends the current run: flush it first.
        if charactersToken
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken if charactersToken
  end

  # Renders a token stream as an indented, newline-joined dump: elements as
  # "<name>", attributes (sorted, 'xmlns' skipped) as name="value" one level
  # deeper, comments, doctypes and quoted text. Tokens of any other type are
  # ignored.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching EndTag to undo the indent.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    output.join("\n")
  end

  html5lib_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?
      innerHTML, input, expected_output, expected_errors =
        HTML5lib::TestSupport.parseTestcase(data)
      $tree_types_to_test.each do |tree_name, treeClass|
        define_method "test_#{test_name}_#{index}_#{tree_name}" do
          parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])
          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end
          document = parser.tree.getDocument
          begin
            output = sortattrs(convertTokens(treeClass[:walker].new(document)))
            expected = sortattrs(expected_output)
            errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n"
            assert_equal(expected, output, errorMsg)
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end

View file

@ -154,6 +154,21 @@ Example:
CSS: style.css math.css
=end
# Render to an HTML fragment as a REXML tree: the rendered children (and the
# footnotes, when the document has any) are collected in a wrapping <div>,
# which is then attached to a new whitespace-preserving REXML Document.
# The method's value is the result of appending the div to that document.
def to_html_tree
  wrapper = Element.new('div')
  children_to_html.each { |child| wrapper << child }
  # render footnotes after the main content
  wrapper << render_footnotes if @doc.footnotes_order.size > 0
  tree = Document.new(nil, {:respect_whitespace => :all})
  tree << wrapper
end
# Render to a complete HTML document (returns a REXML document tree)
def to_html_document_tree