More fixes, sync with HTML5lib

Do a better job with the wrapper <div>s added by xhtmldiff and Maruku's to_html_tree method.
More tests fixed.
This commit is contained in:
Jacques Distler 2007-06-13 23:05:15 -05:00
parent 3ca33e52b5
commit 3de374d6c1
20 changed files with 541 additions and 118 deletions

View file

@ -148,6 +148,18 @@ module HTML5lib
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],

View file

@ -0,0 +1 @@
require 'html5lib/filters/optionaltags'

View file

@ -0,0 +1,10 @@
require 'delegate'
require 'enumerator'
module HTML5lib
module Filters
class Base < SimpleDelegator
include Enumerable
end
end
end

View file

@ -0,0 +1,62 @@
require 'html5lib/filters/base'
module HTML5lib
module Filters
class InjectMetaCharset < Base
def initialize(source, encoding)
super(source)
@encoding = encoding
end
def each
state = :pre_head
meta_found = @encoding.nil?
pending = []
__getobj__.each do |token|
case token[:type]
when :StartTag
state = :in_head if token[:name].downcase == "head"
when :EmptyTag
if token[:name].downcase == "meta"
if token[:data].any? {|name,value| name=='charset'}
# replace charset with actual encoding
attrs=Hash[*token[:data].flatten]
attrs['charset'] = @encoding
token[:data] = attrs.to_a.sort
meta_found = true
end
elsif token[:name].downcase == "head" and not meta_found
# insert meta into empty head
yield({:type => :StartTag, :name => "head", :data => {}})
yield({:type => :EmptyTag, :name => "meta",
:data => {"charset" => @encoding}})
yield({:type => :EndTag, :name => "head"})
meta_found = true
next
end
when :EndTag
if token[:name].downcase == "head" and pending.any?
# insert meta into head (if necessary) and flush pending queue
yield pending.shift
yield({:type => :EmptyTag, :name => "meta",
:data => {"charset" => @encoding}}) if not meta_found
yield pending.shift while pending.any?
meta_found = true
state = :post_head
end
end
if state == :in_head
pending << token
else
yield token
end
end
end
end
end
end

View file

@ -0,0 +1,199 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
module HTML5lib
module Filters
class OptionalTagFilter < Base
def slider
previous1 = previous2 = nil
__getobj__.each do |token|
yield previous2, previous1, token if previous1 != nil
previous2 = previous1
previous1 = token
end
yield previous2, previous1, nil
end
def each
slider do |previous, token, nexttok|
type = token[:type]
if type == :StartTag
yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
elsif type == :EndTag
yield token unless is_optional_end(token[:name], nexttok)
else
yield token
end
end
end
def is_optional_start(tagname, previous, nexttok)
type = nexttok ? nexttok[:type] : nil
if tagname == 'html'
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif tagname == 'head'
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == :StartTag
elsif tagname == 'body'
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return !%w[script style].include?(nexttok[:name])
else
return true
end
elsif tagname == 'colgroup'
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type == :StartTag
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return nexttok[:name] == "col"
else
return false
end
elsif tagname == 'tbody'
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == :StartTag
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \
%w(tbody thead tfoot).include?(previous[:name])
return false
end
return nexttok[:name] == 'tr'
else
return false
end
end
return false
end
def is_optional_end(tagname, nexttok)
type = nexttok ? nexttok[:type] : nil
if %w[html head body].include?(tagname)
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif %w[li optgroup option tr].include?(tagname)
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == :StartTag
return nexttok[:name] == tagname
else
return type == :EndTag || type == nil
end
elsif %w(dt dd).include?(tagname)
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == :StartTag
return %w(dt dd).include?(nexttok[:name])
elsif tagname == 'dd'
return type == :EndTag || type == nil
else
return false
end
elsif tagname == 'p'
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# element.
if type == :StartTag
return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
h6 hr menu ol p pre table ul).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
elsif tagname == 'colgroup'
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return nexttok[:name] != 'colgroup'
else
return true
end
elsif %w(thead tbody).include? tagname
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return %w(tbody tfoot).include?(nexttok[:name])
elsif tagname == 'tbody'
return (type == :EndTag or type == nil)
else
return false
end
elsif tagname == 'tfoot'
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return nexttok[:name] == 'tbody'
else
return type == :EndTag || type == nil
end
elsif %w(td th).include? tagname
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == :StartTag
return %w(td th).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
end
return false
end
end
end
end

View file

@ -0,0 +1,15 @@
require 'html5lib/filters/base'
require 'html5lib/sanitizer'
module HTML5lib
module Filters
class HTMLSanitizeFilter < Base
include HTMLSanitizeModule
def each
__getobj__.each do |token|
yield(sanitize_token(token))
end
end
end
end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
module HTML5lib
module Filters
class WhitespaceFilter < Base
SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
def each
preserve = 0
__getobj__.each do |token|
case token[:type]
when :StartTag
if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
preserve += 1
end
when :EndTag
preserve -= 1 if preserve > 0
when :SpaceCharacters
next if preserve == 0
when :Characters
token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
end
yield token
end
end
end
end
end

View file

@ -1,5 +1,4 @@
require 'cgi'
require 'html5lib/filters'
module HTML5lib
@ -176,15 +175,6 @@ module HTML5lib
end
end
class HTMLSanitizeFilter < Filters::Base
include HTMLSanitizeModule
def each
__getobj__.each do |token|
yield(sanitize_token(token))
end
end
end
class HTMLSanitizer < HTMLTokenizer
include HTMLSanitizeModule
def each

View file

@ -1,5 +1,4 @@
require 'html5lib/constants'
require 'html5lib/filters'
module HTML5lib
@ -7,7 +6,7 @@ module HTML5lib
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
def self.serialize(stream, options = {})
new(options).serialize(stream)
new(options).serialize(stream, options[:encoding])
end
def initialize(options={})
@ -40,20 +39,25 @@ module HTML5lib
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
@errors = []
if encoding and @inject_meta_charset
treewalker = filter_inject_meta_charset(treewalker, encoding)
require 'html5lib/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
treewalker = filter_whitespace(treewalker)
require 'html5lib/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/sanitizer'
treewalker = HTMLSanitizeFilter.new(treewalker)
require 'html5lib/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
@ -62,25 +66,14 @@ module HTML5lib
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
if encoding
result << doctype.encode(encoding)
else
result << doctype
end
result << doctype
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
end
if encoding
result << token[:data].encode(encoding, errors || "strict")
else
result << token[:data]
end
elsif encoding
result << token[:data].replace("&", "&amp;").
encode(encoding, unicode_encode_errors)
result << token[:data]
else
result << token[:data].
gsub("&", "&amp;").
@ -97,7 +90,6 @@ module HTML5lib
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
k = k.encode(encoding) if encoding
attributes << ' '
attributes << k
@ -111,9 +103,6 @@ module HTML5lib
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
if encoding
v = v.encode(encoding, unicode_encode_errors)
end
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
@ -141,11 +130,7 @@ module HTML5lib
attributes << "/"
end
end
if encoding
result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
else
result << "<%s%s>" % [name, attributes.join('')]
end
result << "<%s%s>" % [name, attributes.join('')]
elsif type == :EndTag
name = token[:name]
@ -155,33 +140,29 @@ module HTML5lib
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
end_tag = end_tag.encode(encoding) if encoding
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
if encoding
comment = comment.encode(encoding, unicode_encode_errors)
end
result << comment
else
serializeError(token[:data])
end
end
result.join('')
end
def render(treewalker, encoding=nil)
if encoding
return "".join(list(serialize(treewalker, encoding)))
if encoding and encoding != 'utf-8'
require 'iconv'
Iconv.iconv(encoding, 'utf-8', result.join('')).first
else
return "".join(list(serialize(treewalker)))
result.join('')
end
end
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
@ -189,22 +170,6 @@ module HTML5lib
raise SerializeError
end
end
def filter_inject_meta_charset(treewalker, encoding)
done = false
for token in treewalker
if not done and token[:type] == :StartTag \
and token[:name].lower() == "head"
yield({:type => :EmptyTag, :name => "meta", \
:data => {"charset" => encoding}})
end
yield token
end
end
def filter_whitespace(treewalker)
raise NotImplementedError
end
end
# Error in serialized tree

View file

@ -27,13 +27,13 @@ module TokenConstructor
end
def text(data)
if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
yield({:type => :SpaceCharacters, :data => $1})
data = data[$1.length .. -1]
return if data.empty?
end
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
yield({:type => :Characters, :data => data[0 ... -$1.length]})
yield({:type => :SpaceCharacters, :data => $1})
else

View file

@ -59,7 +59,7 @@ def printOutput(parser, document, opts)
require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer'
print HTML5lib::HTMLSerializer.serialize(tokens, :encoding=>'utf-8')
print HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite
print document.hilite
when :tree
@ -80,11 +80,16 @@ require 'ostruct'
options = OpenStruct.new
options.profile = false
options.time = false
options.output = :tree
options.output = :html
options.treebuilder = 'simpletree'
options.error = false
options.encoding = false
options.parsemethod = :parse
options.serializer = {
:encoding => 'utf-8',
:omit_optional_tags => false,
:inject_meta_charset => false
}
require 'optparse'
opts = OptionParser.new do |opts|
@ -96,14 +101,6 @@ opts = OptionParser.new do |opts|
options.time = time
end
opts.on("--[no-]tree", "Do not print output tree") do |tree|
if tree
options.output = :tree
else
options.output = nil
end
end
opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder
end
@ -116,13 +113,17 @@ opts = OptionParser.new do |opts|
options.parsemethod = :parseFragment
end
opts.on("--tree", "output as debug tree") do |tree|
options.output = :tree
end
opts.on("-x", "--xml", "output as xml") do |xml|
options.output = :xml
options.treebuilder = "rexml"
end
opts.on("--html", "Output as html") do |html|
options.output = :html
opts.on("--[no-]html", "Output as html") do |html|
options.output = (html ? :html : nil)
end
opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
@ -133,6 +134,22 @@ opts = OptionParser.new do |opts|
options.encoding = encoding
end
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
options.serializer[:omit_optional_tags] = omit
end
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit

View file

@ -0,0 +1,39 @@
{"tests": [
{"description": "no encoding",
"options": {"inject_meta_charset": true},
"input": [["EmptyTag", "head", {}]],
"expected": ["<head>"]
},
{"description": "empytag head",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["EmptyTag", "head", {}]],
"expected": ["<head><meta charset=utf-8>"]
},
{"description": "head w/title",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["StartTag","title",{}], ["Characters", "foo"],["EndTag", "title"], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><title>foo</title>"]
},
{"description": "head w/meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8>"]
},
{"description": "head w/robots",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><meta content=noindex name=robots>"]
},
{"description": "head w/robots & charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=noindex name=robots><meta charset=utf-8>"]
}
]}

View file

@ -24,7 +24,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
when 'Doctype'
yield doctype(token[1])
else
raise ValueError("Unknown token type: " + type)
raise "Unknown token type: " + token[0]
end
end
end
@ -37,7 +37,10 @@ class Html5SerializeTestcase < Test::Unit::TestCase
tests['tests'].each_with_index do |test, index|
define_method "test_#{test_name}_#{index+1}" do
next if test_name == 'whitespace' #TODO
if test["options"] and test["options"]["encoding"]
test["options"][:encoding] = test["options"]["encoding"]
end
result = HTML5lib::HTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["expected"]