Sync with latest HTML5lib
This commit is contained in:
parent
3a3cfeaa9b
commit
55fdc9fff4
18 changed files with 266 additions and 124 deletions
16
vendor/plugins/HTML5lib/lib/html5/html5parser.rb
vendored
16
vendor/plugins/HTML5lib/lib/html5/html5parser.rb
vendored
|
@ -69,15 +69,15 @@ module HTML5
|
|||
|
||||
if inner_html
|
||||
case @inner_html = container.downcase
|
||||
when 'title', 'textarea'
|
||||
@tokenizer.content_model_flag = :RCDATA
|
||||
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
|
||||
@tokenizer.content_model_flag = :CDATA
|
||||
when 'plaintext'
|
||||
@tokenizer.content_model_flag = :PLAINTEXT
|
||||
else
|
||||
when 'title', 'textarea'
|
||||
@tokenizer.content_model_flag = :RCDATA
|
||||
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
|
||||
@tokenizer.content_model_flag = :CDATA
|
||||
when 'plaintext'
|
||||
@tokenizer.content_model_flag = :PLAINTEXT
|
||||
else
|
||||
# content_model_flag already is PCDATA
|
||||
#@tokenizer.content_model_flag = :PCDATA
|
||||
@tokenizer.content_model_flag = :PCDATA
|
||||
end
|
||||
|
||||
@phase = @phases[:rootElement]
|
||||
|
|
|
@ -6,45 +6,45 @@ module HTML5
|
|||
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
||||
|
||||
handle_start 'html'
|
||||
handle_start %w( base link meta script style ) => 'ProcessInHead'
|
||||
handle_start %w(base link meta script style) => 'ProcessInHead'
|
||||
handle_start 'title'
|
||||
|
||||
handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
|
||||
|
||||
handle_start 'input', 'textarea', 'select', 'isindex', %w( marquee object )
|
||||
handle_start 'input', 'textarea', 'select', 'isindex', %w(marquee object)
|
||||
|
||||
handle_start %w( li dd dt ) => 'ListItem'
|
||||
|
||||
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
|
||||
handle_start %w(li dd dt) => 'ListItem'
|
||||
|
||||
handle_start %w( b big em font i s small strike strong tt u ) => 'Formatting'
|
||||
handle_start %w(address blockquote center dir div dl fieldset listing menu ol p pre ul) => 'CloseP'
|
||||
|
||||
handle_start %w(b big em font i s small strike strong tt u) => 'Formatting'
|
||||
handle_start 'nobr'
|
||||
|
||||
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
|
||||
handle_start %w(area basefont bgsound br embed img param spacer wbr) => 'VoidFormatting'
|
||||
|
||||
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
|
||||
handle_start %w(iframe noembed noframes noscript) => 'Cdata', HEADING_ELEMENTS => 'Heading'
|
||||
|
||||
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
|
||||
handle_start %w(caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr) => 'Misplaced'
|
||||
|
||||
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'
|
||||
handle_start %w(event-source section nav article aside header footer datagrid command) => 'New'
|
||||
|
||||
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
|
||||
handle_end 'p', 'body', 'html', 'form', %w(button marquee object), %w(dd dt li) => 'ListItem'
|
||||
|
||||
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
|
||||
handle_end %w(address blockquote center div dl fieldset listing menu ol pre ul) => 'Block'
|
||||
|
||||
handle_end HEADING_ELEMENTS => 'Heading'
|
||||
|
||||
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
|
||||
handle_end %w(a b big em font i nobr s small strike strong tt u) => 'Formatting'
|
||||
|
||||
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
|
||||
handle_end %w(head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th) => 'Misplaced'
|
||||
|
||||
handle_end 'br'
|
||||
|
||||
handle_end %w( area basefont bgsound embed hr image img input isindex param spacer wbr frame ) => 'None'
|
||||
handle_end %w(area basefont bgsound embed hr image img input isindex param spacer wbr frame) => 'None'
|
||||
|
||||
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
|
||||
handle_end %w(noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
|
||||
|
||||
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
|
||||
handle_end %w(event-source section nav article aside header footer datagrid command) => 'New'
|
||||
|
||||
def initialize(parser, tree)
|
||||
super(parser, tree)
|
||||
|
@ -107,7 +107,7 @@ module HTML5
|
|||
def startTagBody(name, attributes)
|
||||
parse_error("unexpected-start-tag", {"name" => "body"})
|
||||
|
||||
if (@tree.open_elements.length == 1 || @tree.open_elements[1].name != 'body')
|
||||
if @tree.open_elements.length == 1 || @tree.open_elements[1].name != 'body'
|
||||
assert @parser.inner_html
|
||||
else
|
||||
attributes.each do |attr, value|
|
||||
|
@ -126,11 +126,11 @@ module HTML5
|
|||
|
||||
def startTagForm(name, attributes)
|
||||
if @tree.formPointer
|
||||
parse_error("Unexpected start tag (form). Ignored.")
|
||||
parse_error("unexpected-start-tag", {"name" => name})
|
||||
else
|
||||
endTagP('p') if in_scope?('p')
|
||||
@tree.insert_element(name, attributes)
|
||||
@tree.formPointer = @tree.open_elements[-1]
|
||||
@tree.formPointer = @tree.open_elements.last
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -69,8 +69,7 @@ module HTML5
|
|||
end
|
||||
|
||||
def endTagTableElements(name)
|
||||
parse_error("unexpected-end-tag-in-select",
|
||||
{"name" => name})
|
||||
parse_error("unexpected-end-tag-in-select", {"name" => name})
|
||||
|
||||
if in_scope?(name, true)
|
||||
endTagSelect('select')
|
||||
|
@ -79,7 +78,7 @@ module HTML5
|
|||
end
|
||||
|
||||
def endTagOther(name)
|
||||
parse_error(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
|
||||
parse_error("unexpected-end-tag-in-select", {"name" => name})
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -7,7 +7,7 @@ module HTML5
|
|||
|
||||
handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'
|
||||
|
||||
handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ingore'
|
||||
handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ignore'
|
||||
|
||||
def processCharacters(data)
|
||||
@parser.phases[:inTable].processCharacters(data)
|
||||
|
|
|
@ -33,10 +33,9 @@ module HTML5
|
|||
|
||||
def insert_html_element
|
||||
element = @tree.createElement('html', {})
|
||||
@tree.open_elements.push(element)
|
||||
@tree.open_elements << element
|
||||
@tree.document.appendChild(element)
|
||||
@parser.phase = @parser.phases[:beforeHead]
|
||||
end
|
||||
|
||||
end
|
||||
end
|
52
vendor/plugins/HTML5lib/lib/html5/inputstream.rb
vendored
52
vendor/plugins/HTML5lib/lib/html5/inputstream.rb
vendored
|
@ -60,15 +60,11 @@ module HTML5
|
|||
if @char_encoding == 'windows-1252'
|
||||
@win1252 = true
|
||||
elsif @char_encoding != 'utf-8'
|
||||
require 'iconv'
|
||||
begin
|
||||
require 'iconv'
|
||||
begin
|
||||
@buffer << @raw_stream.read unless @raw_stream.eof?
|
||||
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
||||
rescue
|
||||
@win1252 = true
|
||||
end
|
||||
rescue LoadError
|
||||
@buffer << @raw_stream.read unless @raw_stream.eof?
|
||||
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
||||
rescue
|
||||
@win1252 = true
|
||||
end
|
||||
end
|
||||
|
@ -88,12 +84,11 @@ module HTML5
|
|||
def open_stream(source)
|
||||
# Already an IO-like object
|
||||
if source.respond_to?(:read)
|
||||
@stream = source
|
||||
source
|
||||
else
|
||||
# Treat source as a string and wrap in StringIO
|
||||
@stream = StringIO.new(source)
|
||||
StringIO.new(source)
|
||||
end
|
||||
return @stream
|
||||
end
|
||||
|
||||
def detect_encoding
|
||||
|
@ -138,14 +133,12 @@ module HTML5
|
|||
encoding = @DEFAULT_ENCODING
|
||||
end
|
||||
|
||||
#Substitute for equivalent encodings
|
||||
encoding_sub = {'iso-8859-1' => 'windows-1252'}
|
||||
|
||||
if encoding_sub.has_key?(encoding.downcase)
|
||||
encoding = encoding_sub[encoding.downcase]
|
||||
#Substitute for equivalent encoding
|
||||
if 'iso-8859-1' == encoding.downcase
|
||||
encoding = 'windows-1252'
|
||||
end
|
||||
|
||||
return encoding
|
||||
encoding
|
||||
end
|
||||
|
||||
# Attempts to detect a BOM at the start of the stream. If
|
||||
|
@ -153,9 +146,9 @@ module HTML5
|
|||
# encoding otherwise return nil
|
||||
def detect_bom
|
||||
bom_dict = {
|
||||
"\xef\xbb\xbf" => 'utf-8',
|
||||
"\xff\xfe" => 'utf-16le',
|
||||
"\xfe\xff" => 'utf-16be',
|
||||
"\xef\xbb\xbf" => 'utf-8',
|
||||
"\xff\xfe" => 'utf-16le',
|
||||
"\xfe\xff" => 'utf-16be',
|
||||
"\xff\xfe\x00\x00" => 'utf-32le',
|
||||
"\x00\x00\xfe\xff" => 'utf-32be'
|
||||
}
|
||||
|
@ -200,7 +193,7 @@ module HTML5
|
|||
|
||||
#TODO: huh?
|
||||
require 'delegate'
|
||||
# @raw_stream = SimpleDelegator.new(@raw_stream)
|
||||
@raw_stream = SimpleDelegator.new(@raw_stream)
|
||||
|
||||
class << @raw_stream
|
||||
def read(chars=-1)
|
||||
|
@ -251,7 +244,7 @@ module HTML5
|
|||
col -= 1
|
||||
end
|
||||
end
|
||||
return [line+1, col]
|
||||
return [line + 1, col]
|
||||
end
|
||||
|
||||
# Read one character from the stream or queue if available. Return
|
||||
|
@ -260,9 +253,9 @@ module HTML5
|
|||
unless @queue.empty?
|
||||
return @queue.shift
|
||||
else
|
||||
if @tell + 3 > @buffer.length and !@raw_stream.eof?
|
||||
if @tell + 3 > @buffer.length && !@raw_stream.eof?
|
||||
# read next block
|
||||
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
||||
@buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
||||
@tell = 0
|
||||
end
|
||||
|
||||
|
@ -270,7 +263,7 @@ module HTML5
|
|||
@tell += 1
|
||||
|
||||
case c
|
||||
when 0x01 .. 0x7F
|
||||
when 0x01..0x7F
|
||||
if c == 0x0D
|
||||
# normalize newlines
|
||||
@tell += 1 if @buffer[@tell] == 0x0A
|
||||
|
@ -288,7 +281,7 @@ module HTML5
|
|||
|
||||
c.chr
|
||||
|
||||
when 0x80 .. 0xBF
|
||||
when 0x80..0xBF
|
||||
if !@win1252
|
||||
[0xFFFD].pack('U') # invalid utf-8
|
||||
elsif c <= 0x9f
|
||||
|
@ -297,10 +290,11 @@ module HTML5
|
|||
"\xC2" + c.chr # convert to utf-8
|
||||
end
|
||||
|
||||
when 0xC0 .. 0xFF
|
||||
when 0xC0..0xFF
|
||||
if instance_variables.include?("@win1252") && @win1252
|
||||
"\xC3" + (c-64).chr # convert to utf-8
|
||||
elsif @buffer[@tell-1 .. @tell+3] =~ /^
|
||||
"\xC3" + (c - 64).chr # convert to utf-8
|
||||
# from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
|
||||
elsif @buffer[@tell - 1..@tell + 3] =~ /^
|
||||
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||
|
|
12
vendor/plugins/HTML5lib/lib/html5/sanitizer.rb
vendored
12
vendor/plugins/HTML5lib/lib/html5/sanitizer.rb
vendored
|
@ -110,13 +110,13 @@ module HTML5
|
|||
def sanitize_token(token)
|
||||
case token[:type]
|
||||
when :StartTag, :EndTag, :EmptyTag
|
||||
if ALLOWED_ELEMENTS.include?(token[:name])
|
||||
if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
|
||||
if token.has_key? :data
|
||||
attrs = Hash[*token[:data].flatten]
|
||||
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
||||
attrs.delete_if { |attr,v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
|
||||
ATTR_VAL_IS_URI.each do |attr|
|
||||
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
|
||||
attrs.delete attr
|
||||
end
|
||||
end
|
||||
|
@ -160,14 +160,14 @@ module HTML5
|
|||
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
|
||||
next if val.empty?
|
||||
prop.downcase!
|
||||
if ALLOWED_CSS_PROPERTIES.include?(prop)
|
||||
if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
|
||||
clean << "#{prop}: #{val};"
|
||||
elsif %w[background border margin padding].include?(prop.split('-')[0])
|
||||
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
|
||||
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
|
||||
!self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
|
||||
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
|
||||
end
|
||||
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
|
||||
elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
|
||||
clean << "#{prop}: #{val};"
|
||||
end
|
||||
end
|
||||
|
|
|
@ -73,7 +73,7 @@ module HTML5
|
|||
elsif [:Characters, :SpaceCharacters].include? type
|
||||
if type == :SpaceCharacters or in_cdata
|
||||
if in_cdata and token[:data].include?("</")
|
||||
serialize_error(_("Unexpected </ in CDATA"))
|
||||
serialize_error("Unexpected </ in CDATA")
|
||||
end
|
||||
result << token[:data]
|
||||
else
|
||||
|
|
|
@ -99,12 +99,13 @@ module HTML5
|
|||
super nil
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? Element and node.name == 'html'
|
||||
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
|
||||
end
|
||||
super node
|
||||
end
|
||||
# ryansking: not sure why this was here. removing it doesn't cause any tests to fail
|
||||
# def appendChild node
|
||||
# if node.kind_of? Element and node.name == 'html'
|
||||
# node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
|
||||
# end
|
||||
# super node
|
||||
# end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "#document"
|
||||
|
|
|
@ -176,7 +176,7 @@ module HTML5
|
|||
|
||||
def get_fragment
|
||||
@document = super
|
||||
@document.childNodes
|
||||
@document
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -68,6 +68,14 @@ class Base
|
|||
end
|
||||
|
||||
alias walk each
|
||||
|
||||
def to_ary
|
||||
a = []
|
||||
each do |i|
|
||||
a << i
|
||||
end
|
||||
a
|
||||
end
|
||||
end
|
||||
|
||||
class NonRecursiveTreeWalker < TreeWalkers::Base
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue