Fix Unicode bug

Fix Diego Restrepo's bug (see Rev 184).
Update to latest HTML5lib.
This commit is contained in:
Jacques Distler 2007-12-17 03:17:43 -06:00
parent 18da1a1d71
commit 0f6889e09f
29 changed files with 380 additions and 498 deletions

231
vendor/plugins/HTML5lib/lib/html5/cli.rb vendored Normal file
View file

@ -0,0 +1,231 @@
$:.unshift File.dirname(__FILE__), 'lib'
require 'html5'
require 'core_ext/string'
require 'ostruct'
require 'optparse'
module HTML5::CLI
# Turns the command-line arguments in +argv+ into an options object.
#
# Recognised switches are consumed from +argv+ in place (OptionParser#parse!),
# so only the positional arguments are left in it afterwards.
#
# Returns an OpenStruct holding the defaults listed below, overridden by
# whatever switches were supplied. Serializer-related switches are collected
# in the +serializer+ hash. -h/--help prints the usage text and exits.
def self.parse_opts(argv)
  settings = OpenStruct.new(
    :profile     => false,
    :time        => false,
    :output      => :html,
    :treebuilder => 'simpletree',
    :error       => false,
    :encoding    => false,
    :parsemethod => :parse,
    :serializer  => {
      :encoding            => 'utf-8',
      :omit_optional_tags  => false,
      :inject_meta_charset => false
    }
  )

  cli = OptionParser.new

  # Registers a switch whose value is stored verbatim under +key+ in the
  # serializer options hash.
  serializer_flag = lambda do |switch, text, key|
    cli.on(switch, text) { |value| settings.serializer[key] = value }
  end

  cli.separator ""
  cli.separator "Parse Options:"
  cli.on("-b", "--treebuilder NAME") { |name| settings.treebuilder = name }
  cli.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
    settings.parsemethod = :parse_fragment
    settings.container = container if container
  end

  cli.separator ""
  cli.separator "Filter Options:"
  serializer_flag.call("--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset)
  serializer_flag.call("--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace)
  serializer_flag.call("--[no-]sanitize", "escape unsafe tags", :sanitize)

  cli.separator ""
  cli.separator "Output Options:"
  cli.on("--tree", "output as debug tree") { |_| settings.output = :tree }
  cli.on("-x", "--xml", "output as xml") do |_|
    # XML output also switches the tree builder to rexml.
    settings.output = :xml
    settings.treebuilder = "rexml"
  end
  cli.on("--[no-]html", "Output as html") do |wanted|
    # --no-html leaves no output format selected at all.
    settings.output = (wanted ? :html : nil)
  end
  cli.on("--hilite", "Output as formatted highlighted code.") { |_| settings.output = :hilite }
  cli.on("-e", "--error", "Print a list of parse errors") { |flag| settings.error = flag }

  cli.separator ""
  cli.separator "Serialization Options:"
  serializer_flag.call("--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags)
  serializer_flag.call("--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values)
  serializer_flag.call("--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char)
  serializer_flag.call("--quote-char C", "Use specified quote character", :quote_char)
  serializer_flag.call("--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes)
  serializer_flag.call("--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus)
  serializer_flag.call("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs)
  serializer_flag.call("--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata)

  cli.separator ""
  cli.separator "Other Options:"
  cli.on("-p", "--[no-]profile", "Profile the run") { |flag| settings.profile = flag }
  cli.on("-t", "--[no-]time", "Time the run") { |flag| settings.time = flag }
  cli.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| settings.encoding = flag }

  cli.on_tail("-h", "--help", "Show this message") do
    puts cli
    exit
  end

  cli.parse!(argv)
  settings
end
# Resolves the command-line input argument +f+ into something to parse.
#
# * an http:// or https:// URL is fetched through open-uri;
# * "-" selects standard input;
# * anything else is opened as a path on the local file system.
#
# Returns the opened IO-like object. If opening raises a StandardError the
# error is swallowed and +f+ itself is returned unchanged, so the caller
# ends up handing the raw argument to the parser.
# NOTE(review): this fallback looks intentional (treat the argument as
# literal markup) -- confirm before turning it into a hard failure.
#
# When +f+ is nil/false a usage hint is written to stderr and the process
# exits with status 1.
def self.open_input(f)
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    if f.start_with?('http://', 'https://')
      require 'open-uri'
      URI.parse(f).open
    elsif f == '-'
      $stdin
    else
      # File.open rather than Kernel#open: the latter would spawn a
      # subprocess for an argument beginning with "|".
      File.open(f)
    end
  rescue StandardError
    f
  end
end
# Parses the input named by the last element of +args+ as directed by +opts+
# (the object produced by parse_opts).
#
# The tree builder is looked up by +opts.treebuilder+; XML output uses
# HTML5::XMLParser, everything else HTML5::HTMLParser. The parser method
# invoked is +opts.parsemethod+; for anything other than :parse the
# container name (+opts.container+, defaulting to 'div') is passed as well.
#
# * with +opts.profile+ the parse runs under the profiler and only the
#   profile is printed (to stderr) -- the document itself is not output;
# * with +opts.time+ the document is printed, followed by the elapsed times;
# * otherwise the document is simply printed via print_output.
def self.parse(opts, args)
  input = open_input(args.last)
  # No encoding override is ever determined here, so nil is always passed on.
  charset = nil

  require 'html5/treebuilders'
  builder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    parser = HTML5::XMLParser.new(:tree => builder)
  else
    require 'html5/html5parser'
    parser = HTML5::HTMLParser.new(:tree => builder)
  end

  call_args = [input, charset]
  # Fragment-style parse methods take the container between input and encoding.
  call_args.insert(1, opts.container || 'div') unless opts.parsemethod == :parse

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts format("\n\nRun took: %fs (plus %fs to print the output)", parsed - started, printed - parsed)
  else
    document = parser.send(opts.parsemethod, *call_args)
    print_output(parser, document, opts)
  end
end
# Writes the parse result to standard output in the format chosen by
# +opts.output+, optionally preceded by the detected character encoding
# (+opts.encoding+) and followed by the list of parse errors (+opts.error+).
#
# +parser+ is the parser that produced +document+. Formats:
# * :xml    -- the document is printed as-is;
# * :html   -- the document is walked and run through the HTML serializer
#              with +opts.serializer+ as its options;
# * :hilite -- the result of +document.hilite+ is printed;
# * :tree   -- each fragment is printed via the tree builder's testSerializer
#              (a single document is treated as a one-element list).
# Any other value prints no document at all.
def self.print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  format_wanted = opts.output
  if format_wanted == :xml
    print document
  elsif format_wanted == :html
    require 'html5/treewalkers'
    walker = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(walker, opts.serializer)
  elsif format_wanted == :hilite
    print document.hilite
  elsif format_wanted == :tree
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a (position, code, data) triple; unknown codes get a
  # generic message instead of a lookup in HTML5::E.
  report = parser.errors.map do |pos, errorcode, datavars|
    template = HTML5::E[errorcode] || "Unknown error \"#{errorcode}\""
    "Line #{pos[0]} Col #{pos[1]} " + template % datavars
  end
  $stdout.write("\nParse errors:\n" + report.join("\n") + "\n")
end
# Command-line entry point: reads the switches out of ARGV, then parses the
# input named by what is left in ARGV (parse uses its last element).
def self.run
  # parse_opts strips the switches from ARGV before it is passed on.
  parse(parse_opts(ARGV), ARGV)
end
end

View file

@ -908,7 +908,7 @@ module HTML5
"eof-in-bogus-doctype" =>
_("Unexpected end of file in bogus doctype."),
"eof-in-innerhtml" =>
_("XXX innerHTML EOF"),
_("Unexpected EOF in inner html mode."),
"unexpected-doctype" =>
_("Unexpected DOCTYPE. Ignored."),
"non-html-root" =>
@ -1040,7 +1040,8 @@ module HTML5
_("Unexpected end tag (%(name))" +
". Expected end of file."),
"unexpected-end-table-in-caption" =>
_("Unexpected end table tag in caption. Generates implied end caption.")
_("Unexpected end table tag in caption. Generates implied end caption."),
"end-html-in-innerhtml" => _("Unexpected html end tag in inner html mode.")
}
end

View file

@ -1 +0,0 @@
require 'html5/filters/optionaltags'

View file

@ -25,7 +25,7 @@ module HTML5
def endTagHtml(name)
if @parser.inner_html
parse_error
parse_error "end-html-in-innerhtml"
else
# XXX: This may need to be done, not sure
# Don't set last_phase to the current phase but to the inBody phase

View file

@ -51,34 +51,22 @@ module HTML5
super(parser, tree)
# for special handling of whitespace in <pre>
if $-w
$-w = false
class << self; alias processSpaceCharactersNonPre processSpaceCharacters; end
$-w = true
else
class << self; alias processSpaceCharactersNonPre processSpaceCharacters; end
silence do
class << self
alias processSpaceCharactersNonPre processSpaceCharacters
end
end
end
def processSpaceCharactersDropNewline(data)
# #Sometimes (start of <pre> blocks) we want to drop leading newlines
if $-w
$-w = false
class << self
silence do
alias processSpaceCharacters processSpaceCharactersNonPre
end
end
$-w = true
else
class << self
silence do
alias processSpaceCharacters processSpaceCharactersNonPre
end
class << self
silence do
alias processSpaceCharacters processSpaceCharactersNonPre
end
end
if (data.length > 0 and data[0] == ?\n &&
%w[pre textarea].include?(@tree.open_elements.last.name) && !@tree.open_elements.last.hasContent)
data = data[1..-1]
@ -376,16 +364,6 @@ module HTML5
end
def endTagBlock(name)
#Put us back in the right whitespace handling mode
if name == 'pre'
class << self;
silence do
alias processSpaceCharacters processSpaceCharactersNonPre;
end
end
end
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.open_elements.last.name == name

View file

@ -144,7 +144,7 @@ module HTML5
def remove_open_elements_until(name=nil)
finished = false
until finished
until finished || @tree.open_elements.length == 0
element = @tree.open_elements.pop
finished = name.nil? ? yield(element) : element.name == name
end