1085168bbf
Sync with the latest html5lib. Having the Maruku unit tests on-hand may be useful for debugging; so let's include them.
248 lines
6.8 KiB
Ruby
248 lines
6.8 KiB
Ruby
$:.unshift File.dirname(__FILE__), 'lib'
|
|
require 'html5'
|
|
require 'ostruct'
|
|
require 'optparse'
|
|
|
|
module HTML5::CLI
|
|
|
|
# Parse command-line switches into an options object.
#
# argv - Array of command-line strings. It is modified in place: every
#        recognised switch (and its argument) is removed by
#        OptionParser#parse!, leaving only the positional arguments.
#
# Returns an OpenStruct with these fields:
#   profile, time, error, encoding - booleans, default false
#   output      - :html (default), :xml, :tree, :hilite, or nil (--no-html)
#   treebuilder - tree builder name, default 'simpletree' (-x forces "rexml")
#   parsemethod - :parse, or :parse_fragment when -f/--fragment is given
#   container   - fragment container name; only set by -f/--fragment
#   serializer  - Hash of serializer settings keyed by Symbol
#
# -h/--help prints the option summary and exits. Unknown or malformed
# switches raise the usual OptionParser errors.
def self.parse_opts(argv)
  options = OpenStruct.new
  options.profile     = false
  options.time        = false
  options.output      = :html
  options.treebuilder = 'simpletree'
  options.error       = false
  options.encoding    = false
  options.parsemethod = :parse
  options.serializer  = {
    :encoding => 'utf-8',
    :omit_optional_tags => false,
    :inject_meta_charset => false
  }

  parser = OptionParser.new do |o|
    # Registers a switch whose parsed value is stored as-is in the
    # serializer settings under +key+.
    serializer_switch = lambda do |switch, description, key|
      o.on(switch, description) { |value| options.serializer[key] = value }
    end

    # The default banner omits the positional input argument.
    o.banner = "Usage: #{o.program_name} [options] FILE|URL|-"

    o.separator ""
    o.separator "Parse Options:"

    o.on("-b", "--treebuilder NAME") do |name|
      options.treebuilder = name
    end

    o.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
      options.parsemethod = :parse_fragment
      options.container = container if container
    end

    o.separator ""
    o.separator "Filter Options:"

    serializer_switch.call("--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset)
    serializer_switch.call("--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace)
    serializer_switch.call("--[no-]sanitize", "escape unsafe tags", :sanitize)

    o.separator ""
    o.separator "Output Options:"

    o.on("--tree", "output as debug tree") do
      options.output = :tree
    end

    o.on("-x", "--xml", "output as xml") do
      options.output = :xml
      # XML output always goes through the rexml tree builder.
      options.treebuilder = "rexml"
    end

    o.on("--[no-]html", "Output as html") do |html|
      # --no-html selects no output format at all.
      options.output = (html ? :html : nil)
    end

    o.on("--hilite", "Output as formatted highlighted code.") do
      options.output = :hilite
    end

    o.on("-e", "--error", "Print a list of parse errors") do |error|
      options.error = error
    end

    o.separator ""
    o.separator "Serialization Options:"

    serializer_switch.call("--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags)
    serializer_switch.call("--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values)
    serializer_switch.call("--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char)
    serializer_switch.call("--quote-char C", "Use specified quote character", :quote_char)
    serializer_switch.call("--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes)
    serializer_switch.call("--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus)
    serializer_switch.call("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs)
    serializer_switch.call("--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata)

    o.separator ""
    o.separator "Other Options:"

    o.on("-p", "--[no-]profile", "Profile the run") do |profile|
      options.profile = profile
    end

    o.on("-t", "--[no-]time", "Time the run") do |time|
      options.time = time
    end

    o.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
      options.encoding = encoding
    end

    o.on_tail("-h", "--help", "Show this message") do
      puts o
      exit
    end
  end

  parser.parse!(argv)
  options
end
|
|
|
|
# Turn the positional command-line argument into an input for the parser.
#
# f - nil, '-', an http:// or https:// URL, or a filesystem path.
#
# Returns $stdin for '-', or the handle obtained by opening the URL or
# file. When the argument cannot be opened, the original string is
# returned unchanged (after a warning on $stderr) and the caller passes
# it to the parser as-is.
# Writes a hint to $stderr and exits with status 1 when f is nil/false.
def self.open_input f
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  return $stdin if f == '-'

  begin
    if f.start_with?('http://', 'https://')
      require 'open-uri'
      URI.parse(f).open
    else
      # File.open rather than Kernel#open: the latter can run a
      # subprocess when the argument begins with "|".
      File.open(f)
    end
  rescue StandardError => e
    # Best-effort fallback kept from the original behaviour, but no
    # longer silent.
    $stderr.write("Unable to open #{f.inspect} (#{e.message}); passing the argument to the parser as-is\n")
    f
  end
end
|
|
|
|
# Parse the input named by the last positional argument and print the
# result.
#
# opts - options object as produced by parse_opts (reads treebuilder,
#        output, parsemethod, container, profile and time).
# args - remaining command-line arguments; only the last one is used.
#
# With opts.profile the parse is profiled and the profile is written to
# $stderr; the parsed document is not printed in that mode. With
# opts.time the parse and print durations are reported after the output.
def self.parse(opts, args)
  input = open_input args.last

  require 'html5/treebuilders'
  builder = HTML5::TreeBuilders[opts.treebuilder]

  # XML output goes through the liberal XML parser, everything else
  # through the HTML parser.
  parser =
    if opts.output == :xml
      require 'html5/liberalxmlparser'
      HTML5::XMLParser.new(:tree=>builder)
    else
      require 'html5/html5parser'
      HTML5::HTMLParser.new(:tree=>builder)
    end

  # No encoding override is ever supplied from the command line.
  encoding = nil
  call_args =
    if opts.parsemethod == :parse
      [input, encoding]
    else
      [input, (opts.container || 'div'), encoding]
    end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: #{parsed-started}s (plus #{printed-parsed}s to print the output)"
  else
    print_output(parser, parser.send(opts.parsemethod, *call_args), opts)
  end
end
|
|
|
|
# Write the parsed document, and optionally the parse errors, to stdout.
#
# parser   - the parser that produced +document+; its tree, errors and
#            tokenizer stream are consulted.
# document - the parse result (a single node or a collection of fragments).
# opts     - options object; reads encoding, output, treebuilder,
#            serializer and error.
#
# opts.output selects the format (:xml, :html, :hilite or :tree); any
# other value prints no document at all.
def self.print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    # A fragment parse may yield several nodes; wrap a lone node so both
    # cases are handled uniformly.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  messages = parser.errors.map do |pos, errorcode, datavars|
    template = HTML5::E[errorcode]
    message =
      if template
        # Pass a copy so the shared HTML5::E entry can never be modified
        # by template expansion.
        PythonicTemplate.new(template.dup).to_s(datavars)
      else
        # The error code is not part of datavars, so format it directly
        # instead of going through a template that could not see it.
        "Unknown error \"#{errorcode}\""
      end
    "Line #{pos[0]} Col #{pos[1]} #{message}"
  end

  $stdout.write("\nParse errors:\n" + messages.join("\n") + "\n")
end
|
|
|
|
# Expands Python-style "%(name)" placeholders in a message template.
#
# The template is treated purely as text: nothing in it is evaluated as
# Ruby code, and the string passed to the constructor is never modified.
class PythonicTemplate
  # Matches a "%(name)" placeholder and captures the name.
  PLACEHOLDER = /%\((\w+)\)/

  # format - template String containing zero or more "%(name)" placeholders.
  def initialize format
    # Work on a private copy; callers pass shared (possibly frozen)
    # message strings.
    @format = format.dup
    @values = {}
  end

  # Substitute the placeholders and return the resulting String.
  #
  # vars - optional collection of name/value pairs (typically a Hash).
  #        Names may be Strings or Symbols; values are converted with
  #        to_s. Values supplied in earlier calls are remembered.
  #
  # A placeholder without a matching value expands to the empty string.
  def to_s(vars=nil)
    vars.each { |var, value| @values[var.to_s] = value } if vars
    @format.gsub(PLACEHOLDER) { @values[Regexp.last_match(1)].to_s }
  end
end
|
|
|
|
# Command-line entry point: read the switches from ARGV, then parse the
# input named by whatever positional arguments are left in ARGV.
def self.run
  # parse_opts is evaluated first and strips the switches out of ARGV,
  # so the second argument only carries the positional arguments.
  parse(parse_opts(ARGV), ARGV)
end
|
|
end
|