REXML Trees
Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees.
This commit is contained in:
parent
4dd70af5ae
commit
bd8ba1f4b1
28 changed files with 1317 additions and 112 deletions
|
@ -53,9 +53,10 @@ module Engines
|
||||||
def mask
|
def mask
|
||||||
require_dependency 'maruku'
|
require_dependency 'maruku'
|
||||||
require_dependency 'maruku/ext/math'
|
require_dependency 'maruku/ext/math'
|
||||||
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
# html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||||
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
|
# {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
|
||||||
sanitize_xhtml(html.to_ncr)
|
html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||||
|
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -8,19 +8,36 @@ module Sanitize
|
||||||
#
|
#
|
||||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||||
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||||
|
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
||||||
|
|
||||||
|
|
||||||
require 'html5lib/sanitizer'
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5lib/html5parser'
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5lib/liberalxmlparser'
|
||||||
|
|
||||||
|
require 'html5lib/treewalkers'
|
||||||
|
require 'html5lib/serializer'
|
||||||
|
require 'string_utils'
|
||||||
|
require 'html5lib/sanitizer'
|
||||||
|
|
||||||
include HTML5lib
|
include HTML5lib
|
||||||
|
|
||||||
def sanitize_xhtml(html)
|
def sanitize_xhtml(html)
|
||||||
XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s
|
||||||
end
|
end
|
||||||
|
|
||||||
def sanitize_html(html)
|
def sanitize_html(html)
|
||||||
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Sanitizes a REXML tree and returns the result serialized as a string.
#
# The tree is first passed through +to_ncr+, then walked with the 'rexml'
# tree walker and handed to HTMLSerializer with sanitization switched on.
#
# The serializer flags are real booleans. In Ruby every String is truthy,
# so passing the string 'false' would have turned the option *on*.
def sanitize_rexml(tree)
  tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
  HTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
    :quote_attr_values => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus => true,
    :space_before_trailing_solidus => true,
    :omit_optional_tags => false,
    :inject_meta_charset => false,
    :sanitize => true})
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -2155,3 +2155,20 @@ class String
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
require 'rexml/element'
|
||||||
|
module REXML
  class Element
    # Rewrites, in place, every text node and every attribute value of the
    # elements matched by the XPath '//*', replacing each with the result of
    # String#to_ncr (defined elsewhere in this file; presumably it converts
    # entities to numeric character references -- confirm there).
    #
    # Returns self so the call can be chained.
    def to_ncr
      XPath.each(self, '//*') do |element|
        element.texts.each do |text_node|
          text_node.value = text_node.to_s.to_ncr
        end
        element.attributes.each do |attr_name, attr_value|
          element.attributes[attr_name] = attr_value.to_ncr
        end
      end
      self
    end
  end
end
|
||||||
|
|
|
@ -148,6 +148,26 @@ module HTML5lib
|
||||||
input
|
input
|
||||||
]
|
]
|
||||||
|
|
||||||
|
BOOLEAN_ATTRIBUTES = {
|
||||||
|
:global => %w[irrelevant],
|
||||||
|
'style' => %w[scoped],
|
||||||
|
'img' => %w[ismap],
|
||||||
|
'audio' => %w[autoplay controls],
|
||||||
|
'video' => %w[autoplay controls],
|
||||||
|
'script' => %w[defer async],
|
||||||
|
'details' => %w[open],
|
||||||
|
'datagrid' => %w[multiple disabled],
|
||||||
|
'command' => %w[hidden disabled checked default],
|
||||||
|
'menu' => %w[autosubmit],
|
||||||
|
'fieldset' => %w[disabled readonly],
|
||||||
|
'option' => %w[disabled readonly selected],
|
||||||
|
'optgroup' => %w[disabled readonly],
|
||||||
|
'button' => %w[disabled autofocus],
|
||||||
|
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||||
|
'select' => %w[disabled readonly autofocus multiple],
|
||||||
|
'output' => %w[disabled readonly]
|
||||||
|
}
|
||||||
|
|
||||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||||
ENTITIES_WINDOWS1252 = [
|
ENTITIES_WINDOWS1252 = [
|
||||||
8364, # 0x80 0x20AC EURO SIGN
|
8364, # 0x80 0x20AC EURO SIGN
|
||||||
|
|
|
@ -37,13 +37,13 @@ module HTML5lib
|
||||||
# :strict - raise an exception when a parse error is encountered
|
# :strict - raise an exception when a parse error is encountered
|
||||||
# :tree - a treebuilder class controlling the type of tree that will be
|
# :tree - a treebuilder class controlling the type of tree that will be
|
||||||
# returned. Built in treebuilders can be accessed through
|
# returned. Built in treebuilders can be accessed through
|
||||||
# html5lib.treebuilders.getTreeBuilder(treeType)
|
# HTML5lib::TreeBuilders[treeType]
|
||||||
def initialize(options = {})
|
def initialize(options = {})
|
||||||
@strict = false
|
@strict = false
|
||||||
@errors = []
|
@errors = []
|
||||||
|
|
||||||
@tokenizer = HTMLTokenizer
|
@tokenizer = HTMLTokenizer
|
||||||
@tree = TreeBuilders::REXMLTree::TreeBuilder
|
@tree = TreeBuilders::REXML::TreeBuilder
|
||||||
|
|
||||||
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
||||||
|
|
||||||
|
|
|
@ -107,4 +107,4 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -153,4 +153,4 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -58,7 +58,7 @@ module HTML5lib
|
||||||
unless @char_encoding == 'utf-8'
|
unless @char_encoding == 'utf-8'
|
||||||
begin
|
begin
|
||||||
require 'iconv'
|
require 'iconv'
|
||||||
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
|
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
|
||||||
rescue
|
rescue
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -95,11 +95,13 @@ module HTML5lib
|
||||||
#First look for a BOM
|
#First look for a BOM
|
||||||
#This will also read past the BOM if present
|
#This will also read past the BOM if present
|
||||||
encoding = detect_bom
|
encoding = detect_bom
|
||||||
|
|
||||||
#If there is no BOM need to look for meta elements with encoding
|
#If there is no BOM need to look for meta elements with encoding
|
||||||
#information
|
#information
|
||||||
if encoding.nil? and @parse_meta
|
if encoding.nil? and @parse_meta
|
||||||
encoding = detect_encoding_meta
|
encoding = detect_encoding_meta
|
||||||
end
|
end
|
||||||
|
|
||||||
#Guess with chardet, if available
|
#Guess with chardet, if available
|
||||||
if encoding.nil? and @chardet
|
if encoding.nil? and @chardet
|
||||||
begin
|
begin
|
||||||
|
@ -111,13 +113,14 @@ module HTML5lib
|
||||||
rescue LoadError
|
rescue LoadError
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# If all else fails use the default encoding
|
# If all else fails use the default encoding
|
||||||
if encoding.nil?
|
if encoding.nil?
|
||||||
encoding = @DEFAULT_ENCODING
|
encoding = @DEFAULT_ENCODING
|
||||||
end
|
end
|
||||||
|
|
||||||
#Substitute for equivalent encodings:
|
#Substitute for equivalent encodings:
|
||||||
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
|
encoding_sub = {'iso-8859-1' => 'windows-1252'}
|
||||||
|
|
||||||
if encoding_sub.has_key?(encoding.downcase)
|
if encoding_sub.has_key?(encoding.downcase)
|
||||||
encoding = encoding_sub[encoding.downcase]
|
encoding = encoding_sub[encoding.downcase]
|
||||||
|
@ -132,10 +135,10 @@ module HTML5lib
|
||||||
def detect_bom
|
def detect_bom
|
||||||
bom_dict = {
|
bom_dict = {
|
||||||
"\xef\xbb\xbf" => 'utf-8',
|
"\xef\xbb\xbf" => 'utf-8',
|
||||||
"\xff\xfe" => 'utf-16-le',
|
"\xff\xfe" => 'utf16le',
|
||||||
"\xfe\xff" => 'utf-16-be',
|
"\xfe\xff" => 'utf16be',
|
||||||
"\xff\xfe\x00\x00" => 'utf-32-le',
|
"\xff\xfe\x00\x00" => 'utf32le',
|
||||||
"\x00\x00\xfe\xff" => 'utf-32-be'
|
"\x00\x00\xfe\xff" => 'utf32be'
|
||||||
}
|
}
|
||||||
|
|
||||||
# Go to beginning of file and read in 4 bytes
|
# Go to beginning of file and read in 4 bytes
|
||||||
|
@ -205,7 +208,17 @@ module HTML5lib
|
||||||
else
|
else
|
||||||
begin
|
begin
|
||||||
@tell += 1
|
@tell += 1
|
||||||
return @data_stream[@tell - 1].chr
|
c = @data_stream[@tell - 1]
|
||||||
|
case c
|
||||||
|
when 0xC2 .. 0xDF
|
||||||
|
@tell += 1
|
||||||
|
c.chr + @data_stream[@tell-1].chr
|
||||||
|
when 0xE0 .. 0xF0
|
||||||
|
@tell += 2
|
||||||
|
c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
|
||||||
|
else
|
||||||
|
c.chr
|
||||||
|
end
|
||||||
rescue
|
rescue
|
||||||
return :EOF
|
return :EOF
|
||||||
end
|
end
|
||||||
|
@ -227,8 +240,8 @@ module HTML5lib
|
||||||
else
|
else
|
||||||
# Then the rest
|
# Then the rest
|
||||||
begin
|
begin
|
||||||
char_stack.push(@data_stream[@tell].chr)
|
|
||||||
@tell += 1
|
@tell += 1
|
||||||
|
char_stack.push(@data_stream[@tell-1].chr)
|
||||||
rescue
|
rescue
|
||||||
char_stack.push(:EOF)
|
char_stack.push(:EOF)
|
||||||
break
|
break
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
require 'html5lib/tokenizer'
|
|
||||||
require 'cgi'
|
require 'cgi'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5lib
|
||||||
|
@ -6,7 +5,7 @@ module HTML5lib
|
||||||
# This module provides sanitization of XHTML+MathML+SVG
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
# and of inline style attributes.
|
# and of inline style attributes.
|
||||||
|
|
||||||
class HTMLSanitizer < HTMLTokenizer
|
module HTMLSanitizeModule
|
||||||
|
|
||||||
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
|
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
|
||||||
button caption center cite code col colgroup dd del dfn dir div dl dt
|
button caption center cite code col colgroup dd del dfn dir div dl dt
|
||||||
|
@ -96,19 +95,7 @@ module HTML5lib
|
||||||
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
|
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
|
||||||
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
|
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
|
||||||
|
|
||||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
def process_token(token)
|
||||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
|
||||||
# attributes are parsed, and a restricted set, # specified by
|
|
||||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
|
||||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
|
||||||
# in ALLOWED_PROTOCOLS are allowed.
|
|
||||||
#
|
|
||||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
|
||||||
# => <script> do_nasty_stuff() </script>
|
|
||||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
|
||||||
# => <a>Click here for $100</a>
|
|
||||||
def each
|
|
||||||
super do |token|
|
|
||||||
case token[:type]
|
case token[:type]
|
||||||
when :StartTag, :EndTag, :EmptyTag
|
when :StartTag, :EndTag, :EmptyTag
|
||||||
if ALLOWED_ELEMENTS.include?(token[:name])
|
if ALLOWED_ELEMENTS.include?(token[:name])
|
||||||
|
@ -126,7 +113,7 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
token[:data] = attrs.map {|k,v| [k,v]}
|
token[:data] = attrs.map {|k,v| [k,v]}
|
||||||
end
|
end
|
||||||
yield token
|
return token
|
||||||
else
|
else
|
||||||
if token[:type] == :EndTag
|
if token[:type] == :EndTag
|
||||||
token[:data] = "</#{token[:name]}>"
|
token[:data] = "</#{token[:name]}>"
|
||||||
|
@ -139,12 +126,11 @@ module HTML5lib
|
||||||
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
|
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
|
||||||
token[:type] = :Characters
|
token[:type] = :Characters
|
||||||
token.delete(:name)
|
token.delete(:name)
|
||||||
yield token
|
return token
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
yield token
|
return token
|
||||||
end
|
end
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def sanitize_css(style)
|
def sanitize_css(style)
|
||||||
|
@ -174,4 +160,23 @@ module HTML5lib
|
||||||
style = clean.join(' ')
|
style = clean.join(' ')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Token-stream filter that sanitizes an already tokenized source: every
# token read from the wrapped @source is passed through process_token
# (mixed in from HTMLSanitizeModule) before being handed to the caller.
class HTMLSanitizeFilter < Filter
  include HTMLSanitizeModule

  # Yields the sanitized form of each token of the wrapped source.
  def each
    @source.each { |raw_token| yield process_token(raw_token) }
  end
end
|
||||||
|
|
||||||
|
# Tokenizer that sanitizes while tokenizing: each token produced by
# HTMLTokenizer#each is passed through process_token (mixed in from
# HTMLSanitizeModule) before being handed to the caller.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Yields the sanitized form of each token produced by the tokenizer.
  def each
    super { |raw_token| yield process_token(raw_token) }
  end
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
418
vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
vendored
Normal file
418
vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
vendored
Normal file
|
@ -0,0 +1,418 @@
|
||||||
|
require 'html5lib/constants'
|
||||||
|
require 'jcode'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# Base class for token-stream filters. It stores the wrapped token source
# in @source and mixes in Enumerable; subclasses supply #each.
class Filter
  include Enumerable

  # source -- the object whose tokens a subclass will iterate over.
  def initialize(source)
    @source = source
  end
end
|
||||||
|
|
||||||
|
class OptionalTagFilter < Filter
|
||||||
|
# Walks @source with a sliding three-token window, yielding
# (previous, current, following) once per token. +previous+ is nil for
# the first token and +following+ is nil for the last one.
#
# Nothing is yielded when @source is empty, so the block never receives a
# nil current token (the unconditional final yield used to hand
# (nil, nil, nil) to #each, which then failed on token[:type]).
def slider
  previous1 = previous2 = nil
  @source.each do |token|
    yield previous2, previous1, token unless previous1.nil?
    previous2 = previous1
    previous1 = token
  end
  # Flush the last token, but only if at least one token was seen.
  yield previous2, previous1, nil unless previous1.nil?
end
|
||||||
|
|
||||||
|
# Yields every token of the source except start and end tags that may be
# omitted: a start tag is dropped only when it carries no attributes and
# is_optional_start agrees; an end tag is dropped when is_optional_end
# agrees. All other token types pass through untouched.
def each
  slider do |previous, token, nexttok|
    keep =
      case token[:type]
      when :StartTag
        !(token[:data].empty? && is_optional_start(token[:name], previous, nexttok))
      when :EndTag
        !is_optional_end(token[:name], nexttok)
      else
        true
      end
    yield token if keep
  end
end
|
||||||
|
|
||||||
|
# Decides whether the start tag of +tagname+ may be left out.
#
# tagname  -- name of the element whose start tag is being considered
# previous -- token preceding the start tag (nil at the stream start)
# nexttok  -- token following the start tag (nil at the stream end)
#
# Returns true when the tag is omissible, false otherwise.
def is_optional_start(tagname, previous, nexttok)
  type = nexttok ? nexttok[:type] : nil
  space_or_comment = [:Comment, :SpaceCharacters].include?(type)

  case tagname
  when 'html'
    # An html element's start tag may be omitted if the first thing
    # inside the html element is not a space character or a comment.
    !space_or_comment
  when 'head'
    # A head element's start tag may be omitted if the first thing
    # inside the head element is an element.
    type == :StartTag
  when 'body'
    # A body element's start tag may be omitted if the first thing
    # inside the body element is not a space character or a comment,
    # except if the first thing inside the body element is a script
    # or style element and the node immediately preceding the body
    # element is a head element whose end tag has been omitted.
    if space_or_comment
      false
    elsif type == :StartTag
      # XXX: we do not look at the preceding event, so we never omit
      # the body element's start tag if it's followed by a script or
      # a style element.
      !%w[script style].include?(nexttok[:name])
    else
      true
    end
  when 'colgroup'
    # A colgroup element's start tag may be omitted if the first thing
    # inside the colgroup element is a col element, and if the element
    # is not immediately preceded by another colgroup element whose
    # end tag has been omitted.
    # XXX: we do not look at the preceding event, so instead we never
    # omit the colgroup element's end tag when it is immediately
    # followed by another colgroup element. See is_optional_end.
    type == :StartTag && nexttok[:name] == "col"
  when 'tbody'
    # A tbody element's start tag may be omitted if the first thing
    # inside the tbody element is a tr element, and if the element is
    # not immediately preceded by a tbody, thead, or tfoot element
    # whose end tag has been omitted.
    return false unless type == :StartTag

    # The end tags of thead, tbody and tfoot may have been omitted when a
    # tbody follows (see is_optional_end), so keep this start tag then.
    follows_section_end = previous && previous[:type] == :EndTag &&
      %w(tbody thead tfoot).include?(previous[:name])
    return false if follows_section_end

    nexttok[:name] == 'tr'
  else
    false
  end
end
|
||||||
|
|
||||||
|
# Decides whether the end tag of +tagname+ may be left out.
#
# tagname -- name of the element whose end tag is being considered
# nexttok -- token following the end tag (nil at the stream end)
#
# Returns true when the tag is omissible, false otherwise.
def is_optional_end(tagname, nexttok)
  type = nexttok ? nexttok[:type] : nil
  starts_element = (type == :StartTag)
  # "No more content in the parent element": an end tag or end of stream.
  closes_parent = (type == :EndTag || type == nil)
  space_or_comment = [:Comment, :SpaceCharacters].include?(type)

  case tagname
  when 'html', 'head', 'body'
    # An html element's end tag may be omitted if the html element
    # is not immediately followed by a space character or a comment.
    !space_or_comment
  when 'li', 'optgroup', 'option', 'tr'
    # The end tag of a li, optgroup, option or tr element may be omitted
    # if the element is immediately followed by another element of the
    # same kind, or if there is no more content in the parent element.
    starts_element ? nexttok[:name] == tagname : closes_parent
  when 'dt', 'dd'
    # A dt element's end tag may be omitted if the dt element is
    # immediately followed by another dt element or a dd element.
    # A dd element's end tag may be omitted if the dd element is
    # immediately followed by another dd element or a dt element,
    # or if there is no more content in the parent element.
    if starts_element
      %w(dt dd).include?(nexttok[:name])
    else
      tagname == 'dd' && closes_parent
    end
  when 'p'
    # A p element's end tag may be omitted if the p element is
    # immediately followed by an address, blockquote, dl, fieldset,
    # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
    # or ul element, or if there is no more content in the parent
    # element.
    if starts_element
      %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
         h6 hr menu ol p pre table ul).include?(nexttok[:name])
    else
      closes_parent
    end
  when 'colgroup'
    # A colgroup element's end tag may be omitted if the colgroup
    # element is not immediately followed by a space character or
    # a comment.
    if space_or_comment
      false
    elsif starts_element
      # XXX: we also look for an immediately following colgroup
      # element. See is_optional_start.
      nexttok[:name] != 'colgroup'
    else
      true
    end
  when 'thead', 'tbody'
    # A thead element's end tag may be omitted if the thead element
    # is immediately followed by a tbody or tfoot element.
    # A tbody element's end tag may be omitted if the tbody element
    # is immediately followed by a tbody or tfoot element, or if
    # there is no more content in the parent element.
    # XXX: we never omit the end tag when the following element is
    # a tbody. See is_optional_start.
    if starts_element
      %w(tbody tfoot).include?(nexttok[:name])
    else
      tagname == 'tbody' && closes_parent
    end
  when 'tfoot'
    # A tfoot element's end tag may be omitted if the tfoot element
    # is immediately followed by a tbody element, or if there is no
    # more content in the parent element.
    # XXX: we never omit the end tag when the following element is
    # a tbody. See is_optional_start.
    starts_element ? nexttok[:name] == 'tbody' : closes_parent
  when 'td', 'th'
    # The end tag of a td or th element may be omitted if the element is
    # immediately followed by a td or th element, or if there is no more
    # content in the parent element.
    starts_element ? %w(td th).include?(nexttok[:name]) : closes_parent
  else
    false
  end
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Serializes a stream of tokens (hashes with a :type key and, depending on
# the type, :name and :data) into an HTML/XHTML string.
#
# Behaviour is controlled by the options listed in OPTION_NAMES; see
# #initialize for their defaults.
class HTMLSerializer
  # Elements whose text content is written out verbatim (no escaping).
  CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]

  # Option names accepted by #initialize. Any other key is ignored.
  OPTION_NAMES = %w[quote_attr_values quote_char use_best_quote_char
                    minimize_boolean_attributes use_trailing_solidus
                    space_before_trailing_solidus omit_optional_tags sanitize
                    strip_whitespace inject_meta_charset]

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+ with it. No encoding is passed on, so the output is not
  # transcoded.
  def self.serialize(stream, options = {})
    new(options).serialize(stream)
  end

  # options -- Hash of serializer settings (String or Symbol keys). Values
  #            are tested for truthiness, so pass real booleans: the string
  #            'false' counts as true. Giving :quote_char turns
  #            use_best_quote_char off unless that option is also supplied
  #            later in the hash.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # Read by serializeError; not settable through +options+.
    @strict = false

    options.each do |name, value|
      next unless OPTION_NAMES.include?(name.to_s)
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes every token of +treewalker+ and returns the markup as one
  # String.
  #
  # treewalker -- any object whose #each yields token hashes
  # encoding   -- optional target encoding name. When given, every emitted
  #               piece is transcoded to it and, if inject_meta_charset is
  #               on, a <meta charset> element is added to the head.
  #
  # Problems that do not prevent output (such as "</" inside a CDATA
  # element) are recorded through serializeError.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []
    if encoding && @inject_meta_charset
      treewalker = filter_inject_meta_charset(treewalker, encoding)
    end
    treewalker = filter_whitespace(treewalker) if @strip_whitespace
    if @sanitize
      # Loaded lazily so the sanitizer is only required when it is used.
      require 'html5lib/sanitizer'
      treewalker = HTMLSanitizeFilter.new(treewalker)
    end
    # Optional-tag omission is deliberately left disabled:
    # if @omit_optional_tags
    #   treewalker = OptionalTagFilter.new(treewalker)
    # end

    result = []
    treewalker.each do |token|
      case token[:type]
      when :Doctype
        result << encode_output("<!DOCTYPE %s>" % token[:name], encoding)

      when :Characters, :SpaceCharacters
        if token[:type] == :SpaceCharacters || in_cdata
          if in_cdata && token[:data].include?("</")
            serializeError("Unexpected </ in CDATA")
          end
          # Whitespace and CDATA content are written verbatim.
          result << encode_output(token[:data], encoding)
        else
          result << encode_output(escape_text(token[:data]), encoding, true)
        end

      when :StartTag, :EmptyTag
        name = token[:name]
        if CDATA_ELEMENTS.include?(name)
          in_cdata = true
        elsif in_cdata
          serializeError("Unexpected child element of a CDATA element")
        end

        result << encode_output("<#{name}", encoding)
        # Attributes are written in name order so the output is stable.
        # to_a copes with a Hash, an array of pairs, and nil alike.
        token[:data].to_a.sort_by { |pair| pair[0].to_s }.each do |attr_name, attr_value|
          result << encode_output(" #{attr_name}", encoding)
          next if minimized_boolean?(name, attr_name)
          result << encode_output('=', encoding)
          result << encode_output(format_attribute_value(attr_value), encoding, true)
        end
        if VOID_ELEMENTS.include?(name) && @use_trailing_solidus
          result << encode_output(@space_before_trailing_solidus ? ' /' : '/', encoding)
        end
        result << encode_output('>', encoding)

      when :EndTag
        name = token[:name]
        if CDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serializeError("Unexpected child element of a CDATA element")
        end
        result << encode_output("</#{name}>", encoding)

      when :Comment
        data = token[:data]
        serializeError("Comment contains --") if data.index("--")
        result << encode_output("<!--#{data}-->", encoding, true)

      else
        serializeError(token[:data])
      end
    end
    result.join('')
  end

  # Same as #serialize; kept as a separate entry point for callers that
  # use this name. Returns the serialized String.
  def render(treewalker, encoding=nil)
    serialize(treewalker, encoding)
  end

  # Records +data+ in @errors and raises SerializeError when @strict is
  # set.
  def serializeError(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    raise SerializeError if @strict
  end

  # Returns the tokens of +treewalker+ as an Array, with one
  # <meta charset=ENCODING> empty-tag token inserted directly after the
  # first <head> start tag so that the element ends up inside the head.
  # The stream is returned unchanged (as an Array) when it has no head.
  def filter_inject_meta_charset(treewalker, encoding)
    done = false
    tokens = []
    treewalker.each do |token|
      tokens << token
      next if done
      if token[:type] == :StartTag && token[:name].to_s.downcase == "head"
        tokens << {:type => :EmptyTag, :name => "meta",
                   :data => {"charset" => encoding}}
        done = true
      end
    end
    tokens
  end

  # Whitespace stripping is not implemented; enabling strip_whitespace
  # therefore raises NotImplementedError.
  def filter_whitespace(treewalker)
    raise NotImplementedError
  end

  private

  # Escapes the characters that are significant in element content.
  def escape_text(text)
    text.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # True when the value of +attr_name+ on +element_name+ should be left
  # out because it is a boolean attribute and minimization is enabled.
  def minimized_boolean?(element_name, attr_name)
    return false unless @minimize_boolean_attributes
    (BOOLEAN_ATTRIBUTES[element_name] || []).include?(attr_name) ||
      BOOLEAN_ATTRIBUTES[:global].include?(attr_name)
  end

  # Returns +value+ escaped for use after "name=", wrapped in quotes when
  # quoting is forced, the value is empty, or it contains whitespace, an
  # angle bracket or a quote character.
  def format_attribute_value(value)
    value = value.to_s
    needs_quotes = @quote_attr_values || value.empty? ||
      (SPACE_CHARACTERS.join('') + %q(<>"')).each_char.any? { |char| value.include?(char) }

    escaped = value.gsub("&", "&amp;")
    return escaped unless needs_quotes

    quote = @quote_char
    if @use_best_quote_char
      # Prefer the quote character that does not occur in the value.
      if escaped.index("'") && !escaped.index('"')
        quote = '"'
      elsif escaped.index('"') && !escaped.index("'")
        quote = "'"
      end
    end
    escaped = if quote == "'"
                escaped.gsub("'", "&#39;")
              else
                escaped.gsub('"', "&quot;")
              end
    quote + escaped + quote
  end

  # Transcodes +text+ to +encoding+. With +lenient+ set, characters that
  # the target encoding cannot represent become hexadecimal character
  # references; otherwise String#encode raises for them. The text is
  # returned untouched when no encoding was requested or when this Ruby
  # has no String#encode.
  def encode_output(text, encoding, lenient = false)
    return text unless encoding && text.respond_to?(:encode)
    return text.encode(encoding) unless lenient
    text.encode(encoding, :fallback => lambda { |char| "&#x%X;" % char.ord })
  end
end
|
||||||
|
|
||||||
|
# Error in serialized tree
|
||||||
|
# Raised by HTMLSerializer#serializeError when the serializer is strict.
#
# Derives from StandardError rather than Exception so that a plain
# `rescue` (and `rescue => e`) catches it; handlers written as
# `rescue SerializeError` or `rescue Exception` continue to work.
class SerializeError < StandardError
end
|
||||||
|
end
|
|
@ -1,21 +1,24 @@
|
||||||
module HTML5lib
|
module HTML5lib
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
|
|
||||||
def self.getTreeBuilder(name)
|
class << self
|
||||||
case name.to_s.downcase
|
def [](name)
|
||||||
|
case name.to_s.downcase
|
||||||
when 'simpletree' then
|
when 'simpletree' then
|
||||||
require 'html5lib/treebuilders/simpletree'
|
require 'html5lib/treebuilders/simpletree'
|
||||||
SimpleTree::TreeBuilder
|
SimpleTree::TreeBuilder
|
||||||
when 'rexml' then
|
when 'rexml' then
|
||||||
require 'html5lib/treebuilders/rexml'
|
require 'html5lib/treebuilders/rexml'
|
||||||
REXMLTree::TreeBuilder
|
REXML::TreeBuilder
|
||||||
when 'hpricot' then
|
when 'hpricot' then
|
||||||
require 'html5lib/treebuilders/hpricot'
|
require 'html5lib/treebuilders/hpricot'
|
||||||
Hpricot::TreeBuilder
|
Hpricot::TreeBuilder
|
||||||
else
|
else
|
||||||
raise "Unknown TreeBuilder #{name}"
|
raise "Unknown TreeBuilder #{name}"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
|
||||||
|
|
||||||
|
alias :getTreeBuilder :[]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -144,7 +144,7 @@ module HTML5lib
|
||||||
# code. It should still do the same though.
|
# code. It should still do the same though.
|
||||||
|
|
||||||
# Step 1: stop the algorithm when there's nothing to do.
|
# Step 1: stop the algorithm when there's nothing to do.
|
||||||
return unless @activeFormattingElements
|
return if @activeFormattingElements.empty?
|
||||||
|
|
||||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||||
i = -1
|
i = -1
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
require 'html5lib/treebuilders/base'
|
require 'html5lib/treebuilders/base'
|
||||||
|
require 'rubygems'
|
||||||
require 'hpricot'
|
require 'hpricot'
|
||||||
require 'forwardable'
|
require 'forwardable'
|
||||||
|
|
||||||
|
@ -26,12 +27,14 @@ module HTML5lib
|
||||||
childNodes << node
|
childNodes << node
|
||||||
hpricot.children << node.hpricot
|
hpricot.children << node.hpricot
|
||||||
end
|
end
|
||||||
|
node.hpricot.parent = hpricot
|
||||||
node.parent = self
|
node.parent = self
|
||||||
end
|
end
|
||||||
|
|
||||||
def removeChild(node)
|
def removeChild(node)
|
||||||
childNodes.delete(node)
|
childNodes.delete(node)
|
||||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||||
|
node.hpricot.parent = nil
|
||||||
node.parent = nil
|
node.parent = nil
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -48,6 +51,7 @@ module HTML5lib
|
||||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||||
else
|
else
|
||||||
|
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
|
||||||
childNodes.insert(index, node)
|
childNodes.insert(index, node)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,7 +4,7 @@ require 'forwardable'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5lib
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
module REXMLTree
|
module REXML
|
||||||
|
|
||||||
class Node < Base::Node
|
class Node < Base::Node
|
||||||
extend Forwardable
|
extend Forwardable
|
||||||
|
@ -52,6 +52,7 @@ module HTML5lib
|
||||||
childNodes[index-1].rxobj.raw = true
|
childNodes[index-1].rxobj.raw = true
|
||||||
else
|
else
|
||||||
childNodes.insert index, node
|
childNodes.insert index, node
|
||||||
|
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -62,7 +63,7 @@ module HTML5lib
|
||||||
|
|
||||||
class Element < Node
|
class Element < Node
|
||||||
def self.rxclass
|
def self.rxclass
|
||||||
REXML::Element
|
::REXML::Element
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize name
|
def initialize name
|
||||||
|
@ -95,7 +96,7 @@ module HTML5lib
|
||||||
|
|
||||||
class Document < Node
|
class Document < Node
|
||||||
def self.rxclass
|
def self.rxclass
|
||||||
REXML::Document
|
::REXML::Document
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize
|
def initialize
|
||||||
|
@ -120,7 +121,7 @@ module HTML5lib
|
||||||
|
|
||||||
class DocumentType < Node
|
class DocumentType < Node
|
||||||
def self.rxclass
|
def self.rxclass
|
||||||
REXML::DocType
|
::REXML::DocType
|
||||||
end
|
end
|
||||||
|
|
||||||
def printTree indent=0
|
def printTree indent=0
|
||||||
|
@ -145,7 +146,7 @@ module HTML5lib
|
||||||
class TextNode < Node
|
class TextNode < Node
|
||||||
def initialize data
|
def initialize data
|
||||||
raw=data.gsub('&','&').gsub('<','<').gsub('>','>')
|
raw=data.gsub('&','&').gsub('<','<').gsub('>','>')
|
||||||
@rxobj = REXML::Text.new(raw, true, nil, true)
|
@rxobj = ::REXML::Text.new(raw, true, nil, true)
|
||||||
end
|
end
|
||||||
|
|
||||||
def printTree indent=0
|
def printTree indent=0
|
||||||
|
@ -155,7 +156,7 @@ module HTML5lib
|
||||||
|
|
||||||
class CommentNode < Node
|
class CommentNode < Node
|
||||||
def self.rxclass
|
def self.rxclass
|
||||||
REXML::Comment
|
::REXML::Comment
|
||||||
end
|
end
|
||||||
|
|
||||||
def printTree indent=0
|
def printTree indent=0
|
||||||
|
|
26
vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
vendored
Normal file
26
vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb
vendored
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
require 'html5lib/treewalkers/base'
|
||||||
|
|
||||||
|
module HTML5lib
  module TreeWalkers

    class << self
      # Returns the tree walker class registered under +name+.
      #
      # +name+ is converted with +to_s+ and matched case-insensitively
      # against 'simpletree', 'rexml' and 'hpricot'. The implementation file
      # is only required once its walker has been asked for, so an unused
      # backend is never loaded. The returned constants are resolved relative
      # to this module.
      #
      # Raises a RuntimeError ("Unknown TreeWalker <name>") for any other name.
      def [](name)
        case name.to_s.downcase
        when 'simpletree' then
          require 'html5lib/treewalkers/simpletree'
          SimpleTree::TreeWalker
        when 'rexml' then
          require 'html5lib/treewalkers/rexml'
          REXML::TreeWalker
        when 'hpricot' then
          require 'html5lib/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      # Long-form name for the same lookup.
      alias :getTreeWalker :[]
    end
  end
end
|
156
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
vendored
Normal file
156
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
vendored
Normal file
|
@ -0,0 +1,156 @@
|
||||||
|
require 'html5lib/constants'
|
||||||
|
module HTML5lib
  module TreeWalkers

    # Builders for the token hashes that tree walkers yield.
    # Every token is a Hash with a :type key plus :name and/or :data.
    module TokenConstructor
      # Builds an error token carrying +msg+.
      def error(msg)
        return {:type => "SerializeError", :data => msg}
      end

      # Converts an attribute collection into an array via +to_a+.
      def normalizeAttrs(attrs)
        attrs.to_a
      end

      # Builds an :EmptyTag token for a void element.
      #
      # NOTE(review): when +hasChildren+ is true an error token is built but
      # its value is discarded, so callers never see it. Reporting it would
      # require returning or yielding a second token, which would change this
      # method's contract, so the behaviour is left as it was.
      def emptyTag(name, attrs, hasChildren=false)
        error(_("Void element has children")) if hasChildren
        return({:type => :EmptyTag, :name => name, \
                :data => normalizeAttrs(attrs)})
      end

      # Builds a :StartTag token.
      def startTag(name, attrs)
        return {:type => :StartTag, :name => name, \
                :data => normalizeAttrs(attrs)}
      end

      # Builds an :EndTag token; its :data is always an empty array.
      def endTag(name)
        return {:type => :EndTag, :name => name, :data => []}
      end

      # Splits +data+ into up to three tokens and yields them in order:
      # leading whitespace (:SpaceCharacters), the remaining text
      # (:Characters) and trailing whitespace (:SpaceCharacters).
      # Whitespace means the characters listed in SPACE_CHARACTERS.
      #
      # The patterns are anchored with \A and \z. The per-line anchors ^ and $
      # would also match around newlines inside the text, which made the
      # slicing below cut the data at the wrong offsets.
      def text(data)
        space = SPACE_CHARACTERS.join('')

        if data =~ /\A([#{space}]+)/
          leading = $1
          yield({:type => :SpaceCharacters, :data => leading})
          data = data[leading.length .. -1]
          # The text was whitespace only; nothing else to emit.
          return if data.empty?
        end

        if data =~ /([#{space}]+)\z/
          trailing = $1
          yield({:type => :Characters, :data => data[0 ... -trailing.length]})
          yield({:type => :SpaceCharacters, :data => trailing})
        else
          yield({:type => :Characters, :data => data})
        end
      end

      # Builds a :Comment token.
      def comment(data)
        return {:type => :Comment, :data => data}
      end

      # Builds a :Doctype token. :data is true when the name is "html" in any
      # letter case.
      def doctype(name)
        return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
      end

      # Builds an error token for a node type the walker does not understand.
      def unknown(nodeType)
        return error(_("Unknown node type: ") + nodeType.to_s)
      end

      # Message hook; currently returns +str+ unchanged.
      def _(str)
        str
      end
    end

    # Abstract base class for tree walkers. Subclasses implement +each+,
    # which yields tokens for the tree given to the constructor.
    class Base
      include TokenConstructor

      def initialize(tree)
        @tree = tree
      end

      # Yields every token of the tree. Must be implemented by subclasses.
      def each
        raise NotImplementedError
      end

      # Same as +each+. This is a real method rather than `alias walk each`:
      # an alias is bound to the abstract +each+ above at definition time, so
      # it would keep raising NotImplementedError in subclasses that
      # override +each+.
      def walk(&block)
        each(&block)
      end
    end

    # Walks a tree iteratively, without recursion. Subclasses describe their
    # tree through the four hook methods below.
    class NonRecursiveTreeWalker < TreeWalkers::Base
      # Returns an array whose first item is the node kind (:DOCTYPE, :TEXT,
      # :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT, nil to ignore the
      # node, or anything else for an unknown node) followed by the details
      # consumed in +each+.
      def node_details(node)
        raise NotImplementedError
      end

      # Returns the first child of +node+, or nil.
      def first_child(node)
        raise NotImplementedError
      end

      # Returns the sibling following +node+, or nil.
      def next_sibling(node)
        raise NotImplementedError
      end

      # Returns the parent of +node+, or nil.
      def parent(node)
        raise NotImplementedError
      end

      # Yields the tokens for the tree in document order.
      #
      # Nil checks use +nil?+ instead of comparison with nil, so node classes
      # that overload == cannot be mistaken for "no node".
      def each
        currentNode = @tree
        until currentNode.nil?
          details = node_details(currentNode)
          hasChildren = false

          case details.shift
          when :DOCTYPE
            yield doctype(*details)

          when :TEXT
            text(*details) {|token| yield token}

          when :ELEMENT
            name, attributes, hasChildren = details
            if VOID_ELEMENTS.include?(name)
              yield emptyTag(name, attributes.to_a, hasChildren)
              # Never descend into a void element.
              hasChildren = false
            else
              yield startTag(name, attributes.to_a)
            end

          when :COMMENT
            yield comment(details[0])

          when :DOCUMENT, :DOCUMENT_FRAGMENT
            hasChildren = true

          when nil
            # ignore (REXML::XMLDecl is an example)

          else
            yield unknown(details[0])
          end

          firstChild = hasChildren ? first_child(currentNode) : nil
          if !firstChild.nil?
            currentNode = firstChild
          else
            # No children to visit: close this node, then climb towards the
            # root until a node with a following sibling is found.
            until currentNode.nil?
              details = node_details(currentNode)
              if details.shift == :ELEMENT
                name, attributes, hasChildren = details
                yield endTag(name) if !VOID_ELEMENTS.include?(name)
              end

              if @tree == currentNode
                # Back at the starting node: the walk is finished.
                currentNode = nil
              else
                nextSibling = next_sibling(currentNode)
                if !nextSibling.nil?
                  currentNode = nextSibling
                  break
                end

                currentNode = parent(currentNode)
              end
            end
          end
        end
      end
    end

  end
end
|
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
vendored
Normal file
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
require 'html5lib/treewalkers/base'
|
||||||
|
require 'rexml/document'
|
||||||
|
|
||||||
|
module HTML5lib
  module TreeWalkers
    module Hpricot
      # Tree walker for trees made of Hpricot nodes.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker

        # Maps an Hpricot node onto the [kind, *details] array expected by
        # NonRecursiveTreeWalker#each.
        def node_details(node)
          case node
          when ::Hpricot::Elem
            if !node.name
              # An element without a name is treated as a fragment root.
              [:DOCUMENT_FRAGMENT]
            else
              [:ELEMENT, node.name,
                node.attributes.map {|name,value| [name,value]},
                !node.empty?]
            end
          when ::Hpricot::Text
            [:TEXT, node.to_plain_text]
          when ::Hpricot::Comment
            [:COMMENT, node.content]
          when ::Hpricot::Doc
            [:DOCUMENT]
          when ::Hpricot::DocType
            [:DOCTYPE, node.target]
          when ::Hpricot::XMLDecl
            # A nil kind tells the walker to skip this node.
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+.
        def first_child(node)
          node.children.first
        end

        # Node following +node+ under the same parent.
        def next_sibling(node)
          node.next_node
        end

        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end
|
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
vendored
Normal file
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
require 'html5lib/treewalkers/base'
|
||||||
|
require 'rexml/document'
|
||||||
|
|
||||||
|
module HTML5lib
  module TreeWalkers
    module REXML
      # Tree walker for trees made of REXML nodes.
      class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker

        # Maps a REXML node onto the [kind, *details] array expected by
        # NonRecursiveTreeWalker#each.
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            if !node.name
              # An element without a name is treated as a fragment root.
              [:DOCUMENT_FRAGMENT]
            else
              # "Has children" is decided from the same child list that
              # first_child reads, so an element holding only comments is
              # still descended into. Checking has_elements?/has_text? alone
              # reported such elements as childless and skipped their content.
              [:ELEMENT, node.name,
                node.attributes.map {|name,value| [name,value]},
                !node.children.empty?]
            end
          when ::REXML::Text
            [:TEXT, node.value]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name]
          when ::REXML::XMLDecl
            # A nil kind tells the walker to skip this node.
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+.
        def first_child(node)
          node.children.first
        end

        # Node following +node+ under the same parent.
        def next_sibling(node)
          node.next_sibling
        end

        # Parent of +node+.
        def parent(node)
          node.parent
        end
      end
    end
  end
end
|
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
vendored
Normal file
48
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
require 'html5lib/treewalkers/base'
|
||||||
|
|
||||||
|
module HTML5lib
  module TreeWalkers
    module SimpleTree
      # Recursive tree walker for trees built by the simpletree builder.
      class TreeWalker < HTML5lib::TreeWalkers::Base
        include HTML5lib::TreeBuilders::SimpleTree

        # Yields the tokens for +node+ and, for non-void elements, for all of
        # its descendants. Document and fragment nodes produce no tokens here.
        def walk(node)
          case node
          when Document, DocumentFragment
            return

          when DocumentType
            yield doctype(node.name)

          when TextNode
            text(node.value) {|token| yield token}

          when Element
            if VOID_ELEMENTS.include?(node.name)
              yield emptyTag(node.name, node.attributes, node.hasContent())
            else
              yield startTag(node.name, node.attributes)
              node.childNodes.each do |child|
                walk(child) {|token| yield token}
              end
              yield endTag(node.name)
            end

          when CommentNode
            yield comment(node.value)

          else
            # Unknown nodes are reported through an error token only; the
            # stray debugging print to stdout has been removed.
            yield unknown(node.class)
          end
        end

        # Yields the tokens for every child of the tree passed to the
        # constructor.
        def each
          @tree.childNodes.each do |child|
            walk(child) {|node| yield node}
          end
        end
      end
    end
  end
end
|
137
vendor/plugins/HTML5lib/parse.rb
vendored
Executable file
137
vendor/plugins/HTML5lib/parse.rb
vendored
Executable file
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
#
|
||||||
|
# Parse a document to a simpletree tree, with optional profiling
|
||||||
|
|
||||||
|
# Make the script's own directory and its lib/ subdirectory loadable.
# The lib path is built from the script location, so it no longer depends on
# the directory the script is started from.
$:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
|
||||||
|
|
||||||
|
# Parses the document named by the last entry of +args+ and prints the result
# according to +opts+.
#
# opts - option object answering to treebuilder, output, profile and time
#        (plus whatever printOutput reads).
# args - remaining command-line arguments; the last one is the input.
#
# Writes a message to stderr and exits with status 1 when no input is given.
def parse(opts, args)
  f = args[-1]
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    require 'open-uri' if f[0..6] == 'http://'
    f = open(f)
  rescue
    # Deliberate best effort: when the argument cannot be opened, +f+ keeps
    # the raw argument string and that string is handed to the parser.
  end

  require 'html5lib/treebuilders'
  treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5lib/liberalxmlparser'
    parser = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
  else
    require 'html5lib/html5parser'
    parser = HTML5lib::HTMLParser.new(:tree=>treebuilder)
  end

  if opts.profile
    # Profiling run: only the profile is reported, not the parse result.
    require 'profiler'
    Profiler__::start_profile
    parser.parse(f)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time'
    t0 = Time.new
    document = parser.parse(f)
    t1 = Time.new
    printOutput(parser, document, opts)
    t2 = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
  else
    document = parser.parse(f)
    printOutput(parser, document, opts)
  end
end
|
||||||
|
|
||||||
|
# Prints the parse result in the format selected by opts.output
# (:xml, :html, :hilite or :tree; anything else prints no document).
#
# parser   - the parser that produced +document+.
# document - the parsed document.
# opts     - option object answering to encoding, output, treebuilder, error.
#
# With opts.encoding set, the detected character encoding is printed first.
# With opts.error set, the parser's errors are written to stderr afterwards.
def printOutput(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5lib/treewalkers'
    tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
    require 'html5lib/serializer'
    print HTML5lib::HTMLSerializer.serialize(tokens, :encoding=>'utf-8')
  when :hilite
    print document.hilite
  when :tree
    print parser.tree.testSerializer(document)
  end

  if opts.error
    # Each error is a [position, message] pair; position feeds the two %i.
    errList = parser.errors.map do |pos, message|
      ("Line %i Col %i" % pos) + " " + message
    end
    $stderr.write("\nParse errors:\n" + errList.join("\n")+"\n")
  end
end
|
||||||
|
|
||||||
|
require 'ostruct'
# Defaults for a run without any command-line switches.
options = OpenStruct.new
options.profile = false
options.time = false
options.output = :tree
options.treebuilder = 'simpletree'
options.error = false
options.encoding = false

require 'optparse'
# The block parameter has its own name so it does not shadow the +opts+
# variable being assigned.
opts = OptionParser.new do |o|
  o.on("-p", "--[no-]profile", "Profile the run") do |profile|
    options.profile = profile
  end

  o.on("-t", "--[no-]time", "Time the run") do |time|
    options.time = time
  end

  o.on("--[no-]tree", "Do not print output tree") do |tree|
    # --tree selects the tree dump, --no-tree suppresses document output.
    options.output = tree ? :tree : nil
  end

  o.on("-b", "--treebuilder NAME") do |treebuilder|
    options.treebuilder = treebuilder
  end

  o.on("-e", "--error", "Print a list of parse errors") do |error|
    options.error = error
  end

  o.on("-x", "--xml", "output as xml") do |_xml|
    # XML output also switches the tree builder to rexml.
    options.output = :xml
    options.treebuilder = "rexml"
  end

  o.on("--html", "Output as html") do |_html|
    options.output = :html
  end

  o.on("--hilite", "Output as formatted highlighted code.") do |_hilite|
    options.output = :hilite
  end

  o.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
    options.encoding = encoding
  end

  o.on_tail("-h", "--help", "Show this message") do
    puts o
    exit
  end
end

# parse! removes the recognised switches, leaving the input name in ARGV.
opts.parse!(ARGV)
parse options, ARGV
|
50
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
50
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
|
@ -21,3 +21,53 @@ rescue LoadError
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
module HTML5lib
  # Helpers shared by the test cases that read the tree-construction
  # test data.
  module TestSupport
    # True when string +b+ begins with string +a+.
    def self.startswith?(a, b)
      b[0... a.length] == a
    end

    # Splits one test case into its sections.
    #
    # Lines before "#errors" are the input, lines after "#errors" the
    # expected errors, and lines after "#document" or
    # "#document-fragment <context>" the expected output; the leading "| "
    # of output lines is removed. Empty lines and other lines starting with
    # "#errors", "#document" or "#data" are skipped.
    #
    # Returns [innerHTML, input, output, errors]: innerHTML is the fragment
    # context or nil, input and output are newline-joined strings, errors is
    # an array of lines.
    #
    # Raises ArgumentError when "#document-fragment" carries no context.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input
      data.split(/\n/).each do |line|
        directive = ["#errors", "#document", "#data"].any? do |prefix|
          startswith?(prefix, line)
        end

        if !line.empty? && !directive
          # Identity check: comparing with == would also be true while the
          # current section and the output section are both still empty, and
          # input or error lines starting with "|" would lose their first
          # two characters.
          if currentList.equal?(output) && startswith?("|", line)
            currentList.push(line[2..-1])
          else
            currentList.push(line)
          end
        elsif line == "#errors"
          currentList = errors
        elsif line == "#document" || startswith?("#document-fragment", line)
          if startswith?("#document-fragment", line)
            # Everything after "#document-fragment " is the context element.
            innerHTML = line[19..-1]
            unless innerHTML
              raise ArgumentError, "#document-fragment without a context element"
            end
          end
          currentList = output
        end
      end
      return innerHTML, input.join("\n"), output.join("\n"), errors
    end

    # convert the output of str(document) to the format used in the testcases
    #
    # Drops the first line and removes the first three characters of every
    # remaining line that starts with "|" and is longer than two characters.
    def convertTreeDump(treedump)
      treedump.split(/\n/)[1..-1].map do |line|
        # line[0, 1] is a one-character string on every Ruby version.
        (line.length > 2 && line[0, 1] == '|') ? line[3..-1] : line
      end.join("\n")
    end

    # Sorts each run of consecutive, equally indented name=value lines so
    # that attribute order does not matter when outputs are compared.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

  end
end
|
||||||
|
|
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
|
@ -4,33 +4,33 @@ require 'html5lib/inputstream'
|
||||||
|
|
||||||
class Html5EncodingTestCase < Test::Unit::TestCase
|
class Html5EncodingTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
begin
|
begin
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
require 'UniversalDetector'
|
require 'UniversalDetector'
|
||||||
|
|
||||||
def test_chardet
|
def test_chardet
|
||||||
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
|
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
|
||||||
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
||||||
assert_equal 'big5', stream.char_encoding.downcase
|
assert_equal 'big5', stream.char_encoding.downcase
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
rescue LoadError
|
rescue LoadError
|
||||||
puts "chardet not found, skipping chardet tests"
|
puts "chardet not found, skipping chardet tests"
|
||||||
end
|
end
|
||||||
|
|
||||||
html5lib_test_files('encoding').each do |test_file|
|
html5lib_test_files('encoding').each do |test_file|
|
||||||
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||||
|
|
||||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||||
next if data.empty?
|
next if data.empty?
|
||||||
input, encoding = data.split(/\n#encoding\s+/, 2)
|
input, encoding = data.split(/\n#encoding\s+/, 2)
|
||||||
encoding = encoding.split[0]
|
encoding = encoding.split[0]
|
||||||
|
|
||||||
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||||
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
||||||
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
||||||
end
|
end
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
52
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
52
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
|
@ -14,53 +14,12 @@ end
|
||||||
|
|
||||||
$CHECK_PARSER_ERRORS = false
|
$CHECK_PARSER_ERRORS = false
|
||||||
|
|
||||||
puts 'Testing: ' + $tree_types_to_test * ', '
|
puts 'Testing tree builders: ' + $tree_types_to_test * ', '
|
||||||
|
|
||||||
|
|
||||||
class Html5ParserTestCase < Test::Unit::TestCase
|
class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
|
include HTML5lib
|
||||||
def self.startswith?(a, b)
|
include TestSupport
|
||||||
b[0... a.length] == a
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.parseTestcase(data)
|
|
||||||
innerHTML = nil
|
|
||||||
input = []
|
|
||||||
output = []
|
|
||||||
errors = []
|
|
||||||
currentList = input
|
|
||||||
data.split(/\n/).each do |line|
|
|
||||||
if !line.empty? and !startswith?("#errors", line) and
|
|
||||||
!startswith?("#document", line) and
|
|
||||||
!startswith?("#data", line) and
|
|
||||||
!startswith?("#document-fragment", line)
|
|
||||||
|
|
||||||
if currentList == output and startswith?("|", line)
|
|
||||||
currentList.push(line[2..-1])
|
|
||||||
else
|
|
||||||
currentList.push(line)
|
|
||||||
end
|
|
||||||
elsif line == "#errors"
|
|
||||||
currentList = errors
|
|
||||||
elsif line == "#document" or startswith?("#document-fragment", line)
|
|
||||||
if startswith?("#document-fragment", line)
|
|
||||||
innerHTML = line[19..-1]
|
|
||||||
raise AssertionError unless innerHTML
|
|
||||||
end
|
|
||||||
currentList = output
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return innerHTML, input.join("\n"), output.join("\n"), errors
|
|
||||||
end
|
|
||||||
|
|
||||||
# convert the output of str(document) to the format used in the testcases
|
|
||||||
def convertTreeDump(treedump)
|
|
||||||
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
|
|
||||||
end
|
|
||||||
|
|
||||||
def sortattrs(output)
|
|
||||||
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
|
|
||||||
end
|
|
||||||
|
|
||||||
html5lib_test_files('tree-construction').each do |test_file|
|
html5lib_test_files('tree-construction').each do |test_file|
|
||||||
|
|
||||||
|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||||
next if data.empty?
|
next if data.empty?
|
||||||
|
|
||||||
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
|
innerHTML, input, expected_output, expected_errors =
|
||||||
|
TestSupport.parseTestcase(data)
|
||||||
|
|
||||||
$tree_types_to_test.each do |tree_name|
|
$tree_types_to_test.each do |tree_name|
|
||||||
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
||||||
|
|
||||||
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
|
parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
|
||||||
|
|
||||||
if innerHTML
|
if innerHTML
|
||||||
parser.parseFragment(input, innerHTML)
|
parser.parseFragment(input, innerHTML)
|
||||||
|
|
|
@ -2,9 +2,11 @@
|
||||||
|
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/sanitizer'
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5lib/html5parser'
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5lib/liberalxmlparser'
|
||||||
|
require 'html5lib/treewalkers'
|
||||||
|
require 'html5lib/serializer'
|
||||||
|
require 'html5lib/sanitizer'
|
||||||
|
|
||||||
class SanitizeTest < Test::Unit::TestCase
|
class SanitizeTest < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5lib
|
||||||
|
|
52
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
Normal file
52
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
require 'html5lib/serializer'
|
||||||
|
require 'html5lib/treewalkers'
|
||||||
|
|
||||||
|
#Run the serialize error checks
|
||||||
|
checkSerializeErrors = false
|
||||||
|
|
||||||
|
class JsonWalker < HTML5lib::TreeWalkers::Base
|
||||||
|
# Yields one token per entry of @tree. Each entry is an array whose first
# item names the token type and whose remaining items are its arguments
# (name and attributes, or data).
#
# Raises ArgumentError for an entry with an unrecognised type.
def each
  @tree.each do |token|
    case token[0]
    when 'StartTag'
      yield startTag(token[1], token[2])
    when 'EndTag'
      yield endTag(token[1])
    when 'EmptyTag'
      yield emptyTag(token[1], token[2])
    when 'Comment'
      yield comment(token[1])
    when 'Characters', 'SpaceCharacters'
      # text may produce several tokens, so it yields them itself.
      text(token[1]) {|textToken| yield textToken}
    when 'Doctype'
      yield doctype(token[1])
    else
      raise ArgumentError, "Unknown token type: #{token[0]}"
    end
  end
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Defines one test method per entry of every serializer test file.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5lib_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again; open(filename).read left the handle
    # open until garbage collection.
    tests = JSON::parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
        if expected.length == 1
          assert_equal(expected[0], result, test["description"])
        elsif !expected.include?(result)
          # Several serializations are acceptable; any one of them passes.
          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
        end
      end

    end
  end
end
|
54
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
Executable file
54
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
Executable file
|
@ -0,0 +1,54 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/inputstream'
|
||||||
|
|
||||||
|
# Tests for HTMLInputStream: encoding detection, BOM handling, replacement
# of NUL bytes and newline normalisation.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib

  def test_char_ascii
    stream = HTMLInputStream.new("'")
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  def test_char_win1252
    stream = HTMLInputStream.new("\x91")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  def test_utf_16
    stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
    # assert(value, message) only checks that value is truthy, so the
    # encoding name was never compared; assert_equal performs the comparison.
    # NOTE(review): presumes the stream reports exactly 'utf-16-le' - confirm
    # against HTMLInputStream.
    assert_equal('utf-16-le', stream.char_encoding)
    assert_equal(1025, stream.chars_until(' ',true).length)
  end

  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
    assert_equal([3,1], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
    assert_equal([4,5], stream.position)
    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
  end
end
|
110
vendor/plugins/HTML5lib/tests/test_treewalkers.rb
vendored
Normal file
110
vendor/plugins/HTML5lib/tests/test_treewalkers.rb
vendored
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
require 'html5lib/treewalkers'
|
||||||
|
require 'html5lib/treebuilders'
|
||||||
|
|
||||||
|
$tree_types_to_test = {
|
||||||
|
'simpletree' =>
|
||||||
|
{:builder => HTML5lib::TreeBuilders['simpletree'],
|
||||||
|
:walker => HTML5lib::TreeWalkers['simpletree']},
|
||||||
|
'rexml' =>
|
||||||
|
{:builder => HTML5lib::TreeBuilders['rexml'],
|
||||||
|
:walker => HTML5lib::TreeWalkers['rexml']},
|
||||||
|
# 'hpricot' =>
|
||||||
|
# {:builder => HTML5lib::TreeBuilders['hpricot'],
|
||||||
|
# :walker => HTML5lib::TreeWalkers['hpricot']},
|
||||||
|
}
|
||||||
|
|
||||||
|
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
|
||||||
|
|
||||||
|
class TestTreeWalkers < Test::Unit::TestCase
|
||||||
|
include HTML5lib::TestSupport
|
||||||
|
|
||||||
|
# Yields the tokens of +tokens+ with every run of consecutive :Characters
# and :SpaceCharacters tokens merged into a single :Characters token.
# All other tokens are yielded unchanged and in their original order.
def concatenateCharacterTokens(tokens)
  pending = nil
  tokens.each do |token|
    if [:Characters, :SpaceCharacters].include?(token[:type])
      if pending.nil?
        pending = {:type => :Characters, :data => token[:data]}
      else
        # += builds a new string, so the data of the source tokens is never
        # modified in place.
        pending[:data] += token[:data]
      end
    else
      unless pending.nil?
        yield pending
        pending = nil
      end
      yield token
    end
  end
  # Flush a run that reaches the end of the stream.
  yield pending unless pending.nil?
end
|
||||||
|
|
||||||
|
# Renders a token stream as indented, line-oriented text (one output line per
# tag, attribute, comment, doctype or text run), joined with newlines.
#
# tokens - token stream handed to concatenateCharacterTokens, so adjacent
#          character tokens are rendered as a single quoted text line.
#
# Start/empty tags print as <name>, followed by their attributes (sorted,
# 'xmlns' skipped) one level deeper. A start tag keeps the deeper indent
# until its matching :EndTag; an empty tag restores it immediately.
# Token types not handled below produce no output.
#
# Returns the rendered String.
def convertTokens(tokens)
  lines = []
  depth = 0
  concatenateCharacterTokens(tokens) do |tok|
    kind = tok[:type]
    pad = ' ' * depth
    if kind == :StartTag || kind == :EmptyTag
      lines << "#{pad}<#{tok[:name]}>"
      depth += 2
      tok[:data].to_a.sort.each do |attr_name, attr_value|
        next if attr_name == 'xmlns'
        lines << "#{' ' * depth}#{attr_name}=\"#{attr_value}\""
      end
      depth -= 2 if kind == :EmptyTag
    elsif kind == :EndTag
      depth -= 2
    elsif kind == :Comment
      lines << "#{pad}<!-- #{tok[:data]} -->"
    elsif kind == :Doctype
      lines << "#{pad}<!DOCTYPE #{tok[:name]}>"
    elsif kind == :Characters || kind == :SpaceCharacters
      lines << "#{pad}\"#{tok[:data]}\""
    else
      # TODO: what to do with errors?
    end
  end
  lines.join("\n")
end
|
||||||
|
|
||||||
|
# Generates one test method per (test case, tree type) pair from the
# 'tree-construction' test data files. Each generated test parses the input
# with the tree type's builder, walks the resulting document with the
# matching walker, and compares the rendered token stream with the expected
# output from the data file.
html5lib_test_files('tree-construction').each do |test_file|
  test_name = File.basename(test_file).sub('.dat', '')

  # a data file holds many cases, each one introduced by a "#data" line
  File.read(test_file).split("#data\n").each_with_index do |data, index|
    next if data.empty?

    # expected_errors is parsed but not checked by these tests
    innerHTML, input, expected_output, expected_errors =
      HTML5lib::TestSupport::parseTestcase(data)

    $tree_types_to_test.each do |tree_name, treeClass|
      define_method "test_#{test_name}_#{index}_#{tree_name}" do
        parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder])

        # a case that supplies innerHTML is parsed as a fragment
        if innerHTML
          parser.parseFragment(input, innerHTML)
        else
          parser.parse(input)
        end

        document = parser.tree.getDocument

        begin
          output = sortattrs(convertTokens(treeClass[:walker].new(document)))
          expected = sortattrs(expected_output)
          errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n"
          assert_equal(expected, output, errorMsg)
        rescue NotImplementedError
          # Amnesty for those that confess...
        end
      end
    end
  end
end
|
||||||
|
end
|
|
@ -154,6 +154,21 @@ Example:
|
||||||
CSS: style.css math.css
|
CSS: style.css math.css
|
||||||
|
|
||||||
=end
|
=end
|
||||||
|
# Render to an HTML fragment: a 'div' element wrapping the rendered children
# (plus the rendered footnotes when the document has any), attached to a new
# Document created with :respect_whitespace => :all.
# The value returned is the result of `document << wrapper`
# (Element/Document are presumably the REXML classes -- confirm in context).
def to_html_tree
  wrapper = Element.new 'div'
  children_to_html.each { |child| wrapper << child }

  # render footnotes
  wrapper << render_footnotes if @doc.footnotes_order.size > 0

  document = Document.new(nil, {:respect_whitespace => :all})
  document << wrapper
end
|
||||||
|
|
||||||
# Render to a complete HTML document (returns a REXML document tree)
|
# Render to a complete HTML document (returns a REXML document tree)
|
||||||
def to_html_document_tree
|
def to_html_document_tree
|
||||||
|
|
Loading…
Add table
Reference in a new issue