More fixes, sync with HTML5lib
Do a better job with the wrapper <div>s added by xhtmldiff and Maruku's to_html_tree method. More tests fixed.
This commit is contained in:
parent
3ca33e52b5
commit
3de374d6c1
20 changed files with 541 additions and 118 deletions
|
@ -1,5 +1,4 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
|
@ -7,7 +6,7 @@ module HTML5lib
|
|||
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
|
||||
|
||||
def self.serialize(stream, options = {})
|
||||
new(options).serialize(stream)
|
||||
new(options).serialize(stream, options[:encoding])
|
||||
end
|
||||
|
||||
def initialize(options={})
|
||||
|
@ -40,20 +39,25 @@ module HTML5lib
|
|||
|
||||
def serialize(treewalker, encoding=nil)
|
||||
in_cdata = false
|
||||
@errors = []
|
||||
|
||||
|
||||
@errors = []
|
||||
if encoding and @inject_meta_charset
|
||||
treewalker = filter_inject_meta_charset(treewalker, encoding)
|
||||
require 'html5lib/filters/inject_meta_charset'
|
||||
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
||||
end
|
||||
|
||||
if @strip_whitespace
|
||||
treewalker = filter_whitespace(treewalker)
|
||||
require 'html5lib/filters/whitespace'
|
||||
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @sanitize
|
||||
require 'html5lib/sanitizer'
|
||||
treewalker = HTMLSanitizeFilter.new(treewalker)
|
||||
require 'html5lib/filters/sanitizer'
|
||||
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @omit_optional_tags
|
||||
require 'html5lib/filters/optionaltags'
|
||||
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
||||
end
|
||||
|
||||
|
@ -62,25 +66,14 @@ module HTML5lib
|
|||
type = token[:type]
|
||||
if type == :Doctype
|
||||
doctype = "<!DOCTYPE %s>" % token[:name]
|
||||
if encoding
|
||||
result << doctype.encode(encoding)
|
||||
else
|
||||
result << doctype
|
||||
end
|
||||
result << doctype
|
||||
|
||||
elsif [:Characters, :SpaceCharacters].include? type
|
||||
if type == :SpaceCharacters or in_cdata
|
||||
if in_cdata and token[:data].include?("</")
|
||||
serializeError(_("Unexpected </ in CDATA"))
|
||||
end
|
||||
if encoding
|
||||
result << token[:data].encode(encoding, errors || "strict")
|
||||
else
|
||||
result << token[:data]
|
||||
end
|
||||
elsif encoding
|
||||
result << token[:data].replace("&", "&amp;").
|
||||
encode(encoding, unicode_encode_errors)
|
||||
result << token[:data]
|
||||
else
|
||||
result << token[:data].
|
||||
gsub("&", "&amp;").
|
||||
|
@ -97,7 +90,6 @@ module HTML5lib
|
|||
end
|
||||
attributes = []
|
||||
for k,v in attrs = token[:data].to_a.sort
|
||||
k = k.encode(encoding) if encoding
|
||||
attributes << ' '
|
||||
|
||||
attributes << k
|
||||
|
@ -111,9 +103,6 @@ module HTML5lib
|
|||
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
|
||||
end
|
||||
v = v.gsub("&", "&amp;")
|
||||
if encoding
|
||||
v = v.encode(encoding, unicode_encode_errors)
|
||||
end
|
||||
if quote_attr
|
||||
quote_char = @quote_char
|
||||
if @use_best_quote_char
|
||||
|
@ -141,11 +130,7 @@ module HTML5lib
|
|||
attributes << "/"
|
||||
end
|
||||
end
|
||||
if encoding
|
||||
result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
|
||||
else
|
||||
result << "<%s%s>" % [name, attributes.join('')]
|
||||
end
|
||||
result << "<%s%s>" % [name, attributes.join('')]
|
||||
|
||||
elsif type == :EndTag
|
||||
name = token[:name]
|
||||
|
@ -155,33 +140,29 @@ module HTML5lib
|
|||
serializeError(_("Unexpected child element of a CDATA element"))
|
||||
end
|
||||
end_tag = "</#{name}>"
|
||||
end_tag = end_tag.encode(encoding) if encoding
|
||||
result << end_tag
|
||||
|
||||
elsif type == :Comment
|
||||
data = token[:data]
|
||||
serializeError(_("Comment contains --")) if data.index("--")
|
||||
comment = "<!--%s-->" % token[:data]
|
||||
if encoding
|
||||
comment = comment.encode(encoding, unicode_encode_errors)
|
||||
end
|
||||
result << comment
|
||||
|
||||
else
|
||||
serializeError(token[:data])
|
||||
end
|
||||
end
|
||||
result.join('')
|
||||
end
|
||||
|
||||
def render(treewalker, encoding=nil)
|
||||
if encoding
|
||||
return "".join(list(serialize(treewalker, encoding)))
|
||||
if encoding and encoding != 'utf-8'
|
||||
require 'iconv'
|
||||
Iconv.iconv(encoding, 'utf-8', result.join('')).first
|
||||
else
|
||||
return "".join(list(serialize(treewalker)))
|
||||
result.join('')
|
||||
end
|
||||
end
|
||||
|
||||
alias :render :serialize
|
||||
|
||||
def serializeError(data="XXX ERROR MESSAGE NEEDED")
|
||||
# XXX The idea is to make data mandatory.
|
||||
@errors.push(data)
|
||||
|
@ -189,22 +170,6 @@ module HTML5lib
|
|||
raise SerializeError
|
||||
end
|
||||
end
|
||||
|
||||
def filter_inject_meta_charset(treewalker, encoding)
|
||||
done = false
|
||||
for token in treewalker
|
||||
if not done and token[:type] == :StartTag \
|
||||
and token[:name].lower() == "head"
|
||||
yield({:type => :EmptyTag, :name => "meta", \
|
||||
:data => {"charset" => encoding}})
|
||||
end
|
||||
yield token
|
||||
end
|
||||
end
|
||||
|
||||
def filter_whitespace(treewalker)
|
||||
raise NotImplementedError
|
||||
end
|
||||
end
|
||||
|
||||
# Error in serialized tree
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue