More fixes, sync with HTML5lib

Do a better job of handling the wrapper <div>s added by xhtmldiff and by Maruku's to_html_tree method.
More tests fixed.
This commit is contained in:
Jacques Distler 2007-06-13 23:05:15 -05:00
parent 3ca33e52b5
commit 3de374d6c1
20 changed files with 541 additions and 118 deletions

View file

@ -1,5 +1,4 @@
require 'html5lib/constants'
require 'html5lib/filters'
module HTML5lib
@ -7,7 +6,7 @@ module HTML5lib
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
def self.serialize(stream, options = {})
new(options).serialize(stream)
new(options).serialize(stream, options[:encoding])
end
def initialize(options={})
@ -40,20 +39,25 @@ module HTML5lib
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
@errors = []
if encoding and @inject_meta_charset
treewalker = filter_inject_meta_charset(treewalker, encoding)
require 'html5lib/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
treewalker = filter_whitespace(treewalker)
require 'html5lib/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/sanitizer'
treewalker = HTMLSanitizeFilter.new(treewalker)
require 'html5lib/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
@ -62,25 +66,14 @@ module HTML5lib
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
if encoding
result << doctype.encode(encoding)
else
result << doctype
end
result << doctype
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
end
if encoding
result << token[:data].encode(encoding, errors || "strict")
else
result << token[:data]
end
elsif encoding
result << token[:data].replace("&", "&amp;").
encode(encoding, unicode_encode_errors)
result << token[:data]
else
result << token[:data].
gsub("&", "&amp;").
@ -97,7 +90,6 @@ module HTML5lib
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
k = k.encode(encoding) if encoding
attributes << ' '
attributes << k
@ -111,9 +103,6 @@ module HTML5lib
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
if encoding
v = v.encode(encoding, unicode_encode_errors)
end
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
@ -141,11 +130,7 @@ module HTML5lib
attributes << "/"
end
end
if encoding
result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
else
result << "<%s%s>" % [name, attributes.join('')]
end
result << "<%s%s>" % [name, attributes.join('')]
elsif type == :EndTag
name = token[:name]
@ -155,33 +140,29 @@ module HTML5lib
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
end_tag = end_tag.encode(encoding) if encoding
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
if encoding
comment = comment.encode(encoding, unicode_encode_errors)
end
result << comment
else
serializeError(token[:data])
end
end
result.join('')
end
def render(treewalker, encoding=nil)
if encoding
return "".join(list(serialize(treewalker, encoding)))
if encoding and encoding != 'utf-8'
require 'iconv'
Iconv.iconv(encoding, 'utf-8', result.join('')).first
else
return "".join(list(serialize(treewalker)))
result.join('')
end
end
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
@ -189,22 +170,6 @@ module HTML5lib
raise SerializeError
end
end
# Passes every token of +treewalker+ through unchanged and injects a single
# <meta charset="..."> EmptyTag token immediately after the first <head>
# start tag (tag name compared case-insensitively).
#
# treewalker - any object responding to #each that yields token hashes
#              carrying at least a :type key (and :name for tags).
# encoding   - value stored under the "charset" key of the injected token.
#
# With a block, each resulting token is yielded in order. Without a block
# an Enumerator over the same token sequence is returned, so the result can
# be assigned and iterated later like the original treewalker.
def filter_inject_meta_charset(treewalker, encoding)
  unless block_given?
    return enum_for(:filter_inject_meta_charset, treewalker, encoding)
  end

  done = false
  treewalker.each do |token|
    yield token
    next if done
    next unless token[:type] == :StartTag
    next unless token[:name].to_s.downcase == "head"

    # The meta element has to live inside <head>, so it is emitted after
    # the start tag, and only once even if several head tags appear.
    yield({:type => :EmptyTag, :name => "meta",
           :data => {"charset" => encoding}})
    done = true
  end
end
# Placeholder for a whitespace filter: the argument is not inspected and
# every call raises NotImplementedError.
def filter_whitespace(treewalker)
  fail NotImplementedError
end
end
# Error in serialized tree