More fixes, sync with HTML5lib
Do a better job with the wrapper <div>s added by xhtmldiff and Maruku's to_html_tree method. More tests fixed.
This commit is contained in:
parent
3ca33e52b5
commit
3de374d6c1
20 changed files with 541 additions and 118 deletions
|
@ -148,6 +148,18 @@ module HTML5lib
|
|||
input
|
||||
]
|
||||
|
||||
CDATA_ELEMENTS = %w[title textarea]
|
||||
|
||||
RCDATA_ELEMENTS = %w[
|
||||
style
|
||||
script
|
||||
xmp
|
||||
iframe
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
]
|
||||
|
||||
BOOLEAN_ATTRIBUTES = {
|
||||
:global => %w[irrelevant],
|
||||
'style' => %w[scoped],
|
||||
|
|
1
vendor/plugins/HTML5lib/lib/html5lib/filters.rb
vendored
Normal file
1
vendor/plugins/HTML5lib/lib/html5lib/filters.rb
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
require 'html5lib/filters/optionaltags'
|
10
vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb
vendored
Normal file
10
vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
require 'delegate'
|
||||
require 'enumerator'
|
||||
|
||||
module HTML5lib
  module Filters
    # Common superclass for token-stream filters.
    #
    # A filter wraps a token source through SimpleDelegator, so the wrapped
    # object stays reachable via +__getobj__+ and any method the filter does
    # not define itself is forwarded to it. Enumerable is mixed in so that
    # the usual iteration helpers (+map+, +select+, +to_a+, ...) are built on
    # whatever +each+ resolves to for the filter.
    class Base < SimpleDelegator
      include Enumerable
    end
  end
end
|
62
vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb
vendored
Normal file
62
vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb
vendored
Normal file
|
@ -0,0 +1,62 @@
|
|||
require 'html5lib/filters/base'
|
||||
|
||||
module HTML5lib
  module Filters
    # Token-stream filter that makes the document's <head> declare the
    # output encoding through a <meta charset="..."> element.
    #
    # * An existing <meta> carrying a +charset+ attribute has that attribute
    #   rewritten to the configured encoding.
    # * If the head has no such <meta>, one is inserted right after the
    #   <head> start tag (or inside a synthesized <head>...</head> when the
    #   head arrives as a single :EmptyTag token).
    #
    # Tokens seen between the <head> start tag and its end tag are buffered,
    # because whether a <meta> must be injected is only known once the whole
    # head has been seen.
    class InjectMetaCharset < Base
      # source::   token stream to wrap; iterated with +each+
      # encoding:: encoding name to advertise; +nil+ turns the filter into a
      #            pass-through (nothing is injected or rewritten)
      def initialize(source, encoding)
        super(source)
        @encoding = encoding
      end

      # Yields every token of the wrapped stream, with the charset <meta>
      # rewritten or injected as described on the class.
      def each
        state = :pre_head
        # With no encoding there is nothing to inject, so pretend it is done.
        meta_found = @encoding.nil?
        pending = []

        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            state = :in_head if token[:name].downcase == "head"

          when :EmptyTag
            tag = token[:name].downcase
            if tag == "meta"
              # Guarding on @encoding keeps a nil encoding from overwriting
              # an existing charset attribute with nil.
              if @encoding && token[:data].any? { |attr, _| attr == 'charset' }
                # replace charset with actual encoding
                attrs = Hash[*token[:data].flatten]
                attrs['charset'] = @encoding
                token[:data] = attrs.to_a.sort
                meta_found = true
              end

            elsif tag == "head" && !meta_found
              # insert meta into empty head
              yield({:type => :StartTag, :name => "head", :data => {}})
              yield charset_meta
              yield({:type => :EndTag, :name => "head"})
              meta_found = true
              next
            end

          when :EndTag
            if token[:name].downcase == "head" && pending.any?
              # insert meta into head (if necessary) and flush pending queue
              flush_head(pending, meta_found) { |buffered| yield buffered }
              meta_found = true
              state = :post_head
            end
          end

          if state == :in_head
            pending << token
          else
            yield token
          end
        end

        # The stream ended before </head> was seen: emit what was buffered
        # instead of silently dropping it.
        flush_head(pending, meta_found) { |buffered| yield buffered } if pending.any?
      end

      private

      # Builds the <meta charset="..."> token that gets injected.
      def charset_meta
        {:type => :EmptyTag, :name => "meta", :data => {"charset" => @encoding}}
      end

      # Drains +pending+ into the given block: first the buffered <head>
      # start tag, then the injected <meta> unless one was already found,
      # then the remaining buffered tokens in their original order.
      def flush_head(pending, meta_found)
        yield pending.shift
        yield charset_meta unless meta_found
        yield pending.shift while pending.any?
      end
    end
  end
end
|
199
vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb
vendored
Normal file
199
vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb
vendored
Normal file
|
@ -0,0 +1,199 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
|
||||
module HTML5lib
  module Filters

    # Token-stream filter that drops start and end tags which the HTML5
    # syntax allows to be omitted. Decisions are made from the tag name plus
    # the token immediately before and immediately after it in the wrapped
    # (unfiltered) stream.
    class OptionalTagFilter < Base
      # Iterates the wrapped stream yielding (previous, current, next)
      # triples. +previous+ is nil for the first token and +next+ is nil for
      # the last one. Nothing is yielded for an empty stream.
      def slider
        previous1 = previous2 = nil
        __getobj__.each do |token|
          yield previous2, previous1, token unless previous1.nil?
          previous2 = previous1
          previous1 = token
        end
        # Emit the final token, which has no successor. The guard keeps an
        # empty stream from producing a (nil, nil, nil) triple that callers
        # would then try to index as a token.
        yield previous2, previous1, nil unless previous1.nil?
      end

      # Yields every token except omissible start/end tags. A start tag is
      # only ever dropped when it carries no attributes.
      def each
        slider do |previous, token, nexttok|
          case token[:type]
          when :StartTag
            omit = token[:data].empty? &&
                   is_optional_start(token[:name], previous, nexttok)
            yield token unless omit
          when :EndTag
            yield token unless is_optional_end(token[:name], nexttok)
          else
            yield token
          end
        end
      end

      # Returns true when the start tag of +tagname+ may be omitted.
      #
      # tagname::  name of the element whose start tag is being considered
      # previous:: token preceding the start tag, or nil
      # nexttok::  token following the start tag, or nil
      def is_optional_start(tagname, previous, nexttok)
        type = nexttok ? nexttok[:type] : nil

        case tagname
        when 'html'
          # An html element's start tag may be omitted if the first thing
          # inside the html element is not a space character or a comment.
          ![:Comment, :SpaceCharacters].include?(type)

        when 'head'
          # A head element's start tag may be omitted if the first thing
          # inside the head element is an element.
          type == :StartTag

        when 'body'
          # A body element's start tag may be omitted if the first thing
          # inside the body element is not a space character or a comment,
          # except if the first thing inside the body element is a script
          # or style element and the node immediately preceding the body
          # element is a head element whose end tag has been omitted.
          if [:Comment, :SpaceCharacters].include?(type)
            false
          elsif type == :StartTag
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            !%w[script style].include?(nexttok[:name])
          else
            true
          end

        when 'colgroup'
          # A colgroup element's start tag may be omitted if the first thing
          # inside the colgroup element is a col element, and if the element
          # is not immediately preceded by another colgroup element whose
          # end tag has been omitted.
          #
          # XXX: we do not look at the preceding event, so instead we never
          # omit the colgroup element's end tag when it is immediately
          # followed by another colgroup element. See is_optional_end.
          type == :StartTag && nexttok[:name] == "col"

        when 'tbody'
          # A tbody element's start tag may be omitted if the first thing
          # inside the tbody element is a tr element, and if the element is
          # not immediately preceded by a tbody, thead, or tfoot element
          # whose end tag has been omitted.
          return false unless type == :StartTag

          # is_optional_end may drop a thead/tbody/tfoot end tag that is
          # directly followed by a tbody, so the start tag must be kept
          # whenever such an end tag immediately precedes it.
          if previous && previous[:type] == :EndTag &&
             %w(tbody thead tfoot).include?(previous[:name])
            return false
          end

          nexttok[:name] == 'tr'

        else
          false
        end
      end

      # Returns true when the end tag of +tagname+ may be omitted.
      #
      # tagname:: name of the element whose end tag is being considered
      # nexttok:: token following the end tag, or nil at end of stream
      def is_optional_end(tagname, nexttok)
        type = nexttok ? nexttok[:type] : nil
        # "No more content in the parent element": the next thing is the
        # parent's end tag, or the stream is over.
        parent_done = (type == :EndTag || type.nil?)

        case tagname
        when 'html', 'head', 'body'
          # The end tag of an html, head or body element may be omitted if
          # the element is not immediately followed by a space character or
          # a comment.
          ![:Comment, :SpaceCharacters].include?(type)

        when 'li', 'optgroup', 'option', 'tr'
          # The end tag of an li, optgroup, option or tr element may be
          # omitted if the element is immediately followed by another
          # element of the same name, or if there is no more content in the
          # parent element.
          if type == :StartTag
            nexttok[:name] == tagname
          else
            parent_done
          end

        when 'dt', 'dd'
          # A dt element's end tag may be omitted if the dt element is
          # immediately followed by another dt element or a dd element.
          # A dd element's end tag may be omitted if the dd element is
          # immediately followed by another dd element or a dt element,
          # or if there is no more content in the parent element.
          if type == :StartTag
            %w(dt dd).include?(nexttok[:name])
          elsif tagname == 'dd'
            parent_done
          else
            false
          end

        when 'p'
          # A p element's end tag may be omitted if the p element is
          # immediately followed by an address, blockquote, dl, fieldset,
          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
          # or ul element, or if there is no more content in the parent
          # element.
          if type == :StartTag
            %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
               h6 hr menu ol p pre table ul).include?(nexttok[:name])
          else
            parent_done
          end

        when 'colgroup'
          # A colgroup element's end tag may be omitted if the colgroup
          # element is not immediately followed by a space character or
          # a comment.
          if [:Comment, :SpaceCharacters].include?(type)
            false
          elsif type == :StartTag
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            nexttok[:name] != 'colgroup'
          else
            true
          end

        when 'thead', 'tbody'
          # A thead element's end tag may be omitted if the thead element
          # is immediately followed by a tbody or tfoot element.
          # A tbody element's end tag may be omitted if the tbody element
          # is immediately followed by a tbody or tfoot element, or if
          # there is no more content in the parent element.
          # When the follower is a tbody, is_optional_start keeps that
          # tbody's start tag, so the two omissions never combine.
          if type == :StartTag
            %w(tbody tfoot).include?(nexttok[:name])
          elsif tagname == 'tbody'
            parent_done
          else
            false
          end

        when 'tfoot'
          # A tfoot element's end tag may be omitted if the tfoot element
          # is immediately followed by a tbody element, or if there is no
          # more content in the parent element.
          # The following tbody keeps its start tag; see is_optional_start.
          if type == :StartTag
            nexttok[:name] == 'tbody'
          else
            parent_done
          end

        when 'td', 'th'
          # The end tag of a td or th element may be omitted if the element
          # is immediately followed by a td or th element, or if there is
          # no more content in the parent element.
          if type == :StartTag
            %w(td th).include?(nexttok[:name])
          else
            parent_done
          end

        else
          false
        end
      end
    end
  end
end
|
15
vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb
vendored
Normal file
15
vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
require 'html5lib/filters/base'
|
||||
require 'html5lib/sanitizer'
|
||||
|
||||
module HTML5lib
  module Filters
    # Token-stream filter that runs every token of the wrapped stream
    # through +sanitize_token+ before handing it on.
    #
    # NOTE(review): +sanitize_token+ is not defined in this file; it is
    # presumably supplied by HTMLSanitizeModule (html5lib/sanitizer) --
    # confirm there what it does to each token.
    class HTMLSanitizeFilter < Base
      include HTMLSanitizeModule

      # Yields the result of +sanitize_token+ for each wrapped token, in
      # stream order.
      def each
        __getobj__.each do |token|
          yield(sanitize_token(token))
        end
      end
    end
  end
end
|
36
vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb
vendored
Normal file
36
vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
|
||||
module HTML5lib
  module Filters
    # Token-stream filter that collapses whitespace outside of elements
    # whose content must be kept verbatim.
    #
    # Outside such elements, :SpaceCharacters tokens are dropped and every
    # run of space characters inside a :Characters token becomes a single
    # space. Inside them, tokens pass through untouched.
    class WhitespaceFilter < Base

      # Elements inside which whitespace is left exactly as it is.
      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
      # One or more consecutive space characters.
      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m

      # Yields the wrapped tokens with whitespace collapsed as described on
      # the class.
      def each
        # Nesting depth counted from the outermost whitespace-preserving
        # element; zero means whitespace may be collapsed.
        preserve = 0

        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            # Once inside a preserving element every nested start tag is
            # counted too, so that the matching end tags bring the depth
            # back to zero at the right place.
            if preserve > 0 || SPACE_PRESERVE_ELEMENTS.include?(token[:name])
              preserve += 1
            end

          when :EndTag
            preserve -= 1 if preserve > 0

          when :SpaceCharacters
            next if preserve == 0

          when :Characters
            # gsub, not sub: every whitespace run in the text has to be
            # collapsed, not just the first one.
            token[:data] = token[:data].gsub(SPACES, ' ') if preserve == 0
          end

          yield token
        end
      end
    end
  end
end
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
require 'cgi'
|
||||
require 'html5lib/filters'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
|
@ -176,15 +175,6 @@ module HTML5lib
|
|||
end
|
||||
end
|
||||
|
||||
class HTMLSanitizeFilter < Filters::Base
|
||||
include HTMLSanitizeModule
|
||||
def each
|
||||
__getobj__.each do |token|
|
||||
yield(sanitize_token(token))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class HTMLSanitizer < HTMLTokenizer
|
||||
include HTMLSanitizeModule
|
||||
def each
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
|
@ -7,7 +6,7 @@ module HTML5lib
|
|||
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
|
||||
|
||||
def self.serialize(stream, options = {})
|
||||
new(options).serialize(stream)
|
||||
new(options).serialize(stream, options[:encoding])
|
||||
end
|
||||
|
||||
def initialize(options={})
|
||||
|
@ -40,20 +39,25 @@ module HTML5lib
|
|||
|
||||
def serialize(treewalker, encoding=nil)
|
||||
in_cdata = false
|
||||
@errors = []
|
||||
|
||||
|
||||
@errors = []
|
||||
if encoding and @inject_meta_charset
|
||||
treewalker = filter_inject_meta_charset(treewalker, encoding)
|
||||
require 'html5lib/filters/inject_meta_charset'
|
||||
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
||||
end
|
||||
|
||||
if @strip_whitespace
|
||||
treewalker = filter_whitespace(treewalker)
|
||||
require 'html5lib/filters/whitespace'
|
||||
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @sanitize
|
||||
require 'html5lib/sanitizer'
|
||||
treewalker = HTMLSanitizeFilter.new(treewalker)
|
||||
require 'html5lib/filters/sanitizer'
|
||||
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @omit_optional_tags
|
||||
require 'html5lib/filters/optionaltags'
|
||||
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
||||
end
|
||||
|
||||
|
@ -62,25 +66,14 @@ module HTML5lib
|
|||
type = token[:type]
|
||||
if type == :Doctype
|
||||
doctype = "<!DOCTYPE %s>" % token[:name]
|
||||
if encoding
|
||||
result << doctype.encode(encoding)
|
||||
else
|
||||
result << doctype
|
||||
end
|
||||
result << doctype
|
||||
|
||||
elsif [:Characters, :SpaceCharacters].include? type
|
||||
if type == :SpaceCharacters or in_cdata
|
||||
if in_cdata and token[:data].include?("</")
|
||||
serializeError(_("Unexpected </ in CDATA"))
|
||||
end
|
||||
if encoding
|
||||
result << token[:data].encode(encoding, errors || "strict")
|
||||
else
|
||||
result << token[:data]
|
||||
end
|
||||
elsif encoding
|
||||
result << token[:data].replace("&", "&").
|
||||
encode(encoding, unicode_encode_errors)
|
||||
result << token[:data]
|
||||
else
|
||||
result << token[:data].
|
||||
gsub("&", "&").
|
||||
|
@ -97,7 +90,6 @@ module HTML5lib
|
|||
end
|
||||
attributes = []
|
||||
for k,v in attrs = token[:data].to_a.sort
|
||||
k = k.encode(encoding) if encoding
|
||||
attributes << ' '
|
||||
|
||||
attributes << k
|
||||
|
@ -111,9 +103,6 @@ module HTML5lib
|
|||
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
|
||||
end
|
||||
v = v.gsub("&", "&")
|
||||
if encoding
|
||||
v = v.encode(encoding, unicode_encode_errors)
|
||||
end
|
||||
if quote_attr
|
||||
quote_char = @quote_char
|
||||
if @use_best_quote_char
|
||||
|
@ -141,11 +130,7 @@ module HTML5lib
|
|||
attributes << "/"
|
||||
end
|
||||
end
|
||||
if encoding
|
||||
result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
|
||||
else
|
||||
result << "<%s%s>" % [name, attributes.join('')]
|
||||
end
|
||||
result << "<%s%s>" % [name, attributes.join('')]
|
||||
|
||||
elsif type == :EndTag
|
||||
name = token[:name]
|
||||
|
@ -155,33 +140,29 @@ module HTML5lib
|
|||
serializeError(_("Unexpected child element of a CDATA element"))
|
||||
end
|
||||
end_tag = "</#{name}>"
|
||||
end_tag = end_tag.encode(encoding) if encoding
|
||||
result << end_tag
|
||||
|
||||
elsif type == :Comment
|
||||
data = token[:data]
|
||||
serializeError(_("Comment contains --")) if data.index("--")
|
||||
comment = "<!--%s-->" % token[:data]
|
||||
if encoding
|
||||
comment = comment.encode(encoding, unicode_encode_errors)
|
||||
end
|
||||
result << comment
|
||||
|
||||
else
|
||||
serializeError(token[:data])
|
||||
end
|
||||
end
|
||||
result.join('')
|
||||
end
|
||||
|
||||
def render(treewalker, encoding=nil)
|
||||
if encoding
|
||||
return "".join(list(serialize(treewalker, encoding)))
|
||||
if encoding and encoding != 'utf-8'
|
||||
require 'iconv'
|
||||
Iconv.iconv(encoding, 'utf-8', result.join('')).first
|
||||
else
|
||||
return "".join(list(serialize(treewalker)))
|
||||
result.join('')
|
||||
end
|
||||
end
|
||||
|
||||
alias :render :serialize
|
||||
|
||||
def serializeError(data="XXX ERROR MESSAGE NEEDED")
|
||||
# XXX The idea is to make data mandatory.
|
||||
@errors.push(data)
|
||||
|
@ -189,22 +170,6 @@ module HTML5lib
|
|||
raise SerializeError
|
||||
end
|
||||
end
|
||||
|
||||
def filter_inject_meta_charset(treewalker, encoding)
|
||||
done = false
|
||||
for token in treewalker
|
||||
if not done and token[:type] == :StartTag \
|
||||
and token[:name].lower() == "head"
|
||||
yield({:type => :EmptyTag, :name => "meta", \
|
||||
:data => {"charset" => encoding}})
|
||||
end
|
||||
yield token
|
||||
end
|
||||
end
|
||||
|
||||
def filter_whitespace(treewalker)
|
||||
raise NotImplementedError
|
||||
end
|
||||
end
|
||||
|
||||
# Error in serialized tree
|
||||
|
|
|
@ -27,13 +27,13 @@ module TokenConstructor
|
|||
end
|
||||
|
||||
def text(data)
|
||||
if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
|
||||
if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
|
||||
yield({:type => :SpaceCharacters, :data => $1})
|
||||
data = data[$1.length .. -1]
|
||||
return if data.empty?
|
||||
end
|
||||
|
||||
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
|
||||
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
|
||||
yield({:type => :Characters, :data => data[0 ... -$1.length]})
|
||||
yield({:type => :SpaceCharacters, :data => $1})
|
||||
else
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue