Sync with latest HTML5lib

This commit is contained in:
Jacques Distler 2007-06-22 03:12:08 -05:00
parent bf572e295f
commit 8e92e4a3ab
41 changed files with 1334 additions and 564 deletions

View file

@ -5,12 +5,20 @@
$:.unshift File.dirname(__FILE__),'lib'
def parse(opts, args)
encoding = nil
f = args[-1]
if f
begin
require 'open-uri' if f[0..6] == 'http://'
f = open(f)
if f[0..6] == 'http://'
require 'open-uri'
f = URI.parse(f).open
encoding = f.charset
elsif f == '-'
f = $stdin
else
f = open(f)
end
rescue
end
else
@ -29,22 +37,28 @@ def parse(opts, args)
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
end
if opts.parsemethod == :parse
args = [f, encoding]
else
args = [f, 'div', encoding]
end
if opts.profile
require 'profiler'
Profiler__::start_profile
p.send(opts.parsemethod,f)
p.send(opts.parsemethod, *args)
Profiler__::stop_profile
Profiler__::print_profile($stderr)
elsif opts.time
require 'time'
t0 = Time.new
document = p.send(opts.parsemethod,f)
document = p.send(opts.parsemethod, *args)
t1 = Time.new
printOutput(p, document, opts)
t2 = Time.new
puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
else
document = p.send(opts.parsemethod,f)
document = p.send(opts.parsemethod, *args)
printOutput(p, document, opts)
end
end
@ -59,7 +73,7 @@ def printOutput(parser, document, opts)
require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer'
print HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite
print document.hilite
when :tree
@ -93,26 +107,35 @@ options.serializer = {
require 'optparse'
opts = OptionParser.new do |opts|
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.separator ""
opts.separator "Parse Options:"
opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder
end
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
options.parsemethod = :parseFragment
end
opts.separator ""
opts.separator "Filter Options:"
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.separator ""
opts.separator "Output Options:"
opts.on("--tree", "output as debug tree") do |tree|
options.output = :tree
end
@ -130,26 +153,56 @@ opts = OptionParser.new do |opts|
options.output = :hilite
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.separator ""
opts.separator "Serialization Options:"
opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
options.serializer[:omit_optional_tags] = omit
end
opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
options.serializer[:quote_attr_values] = quote
end
opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
options.serializer[:use_best_quote_char] = best
end
opts.on("--quote-char C", "Use specified quote character") do |c|
options.serializer[:quote_char] = c
end
opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
options.serializer[:minimize_boolean_attributes] = min
end
opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
options.serializer[:use_trailing_solidus] = slash
end
opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.separator ""
opts.separator "Other Options:"
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
end
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit