a6429f8c22
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
248 lines
6.8 KiB
Ruby
248 lines
6.8 KiB
Ruby
$:.unshift File.dirname(__FILE__), 'lib'
|
|
require 'html5'
|
|
require 'ostruct'
|
|
require 'optparse'
|
|
|
|
module HTML5::CLI
|
|
|
|
# Parse command-line switches into an options object.
#
# argv - Array of command-line arguments (typically ARGV). It is parsed
#        destructively with OptionParser#parse!, so recognised switches are
#        removed and only the remaining arguments stay in the array.
#
# Returns an OpenStruct with the fields profile, time, output, treebuilder,
# error, encoding, parsemethod and serializer (a Hash of serializer
# options), plus container when --fragment is given.
#
# -h/--help prints the usage text and exits the process. Errors raised by
# OptionParser for bad switches are not rescued here.
def self.parse_opts(argv)
  options = OpenStruct.new
  options.profile     = false
  options.time        = false
  options.output      = :html
  options.treebuilder = 'simpletree'
  options.error       = false
  options.encoding    = false
  options.parsemethod = :parse
  options.serializer  = {
    :encoding            => 'utf-8',
    :omit_optional_tags  => false,
    :inject_meta_charset => false
  }

  # The block parameter has its own name so that it does not shadow the
  # local that receives the finished parser.
  option_parser = OptionParser.new do |o|
    o.separator ""
    o.separator "Parse Options:"

    o.on("-b", "--treebuilder NAME") do |treebuilder|
      options.treebuilder = treebuilder
    end

    o.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
      options.parsemethod = :parse_fragment
      options.container = container if container
    end

    o.separator ""
    o.separator "Filter Options:"

    o.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
      options.serializer[:inject_meta_charset] = inject
    end

    o.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
      options.serializer[:strip_whitespace] = strip
    end

    o.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
      options.serializer[:sanitize] = sanitize
    end

    o.separator ""
    o.separator "Output Options:"

    o.on("--tree", "output as debug tree") do |_tree|
      options.output = :tree
    end

    o.on("-x", "--xml", "output as xml") do |_xml|
      # XML output also switches the tree builder to rexml.
      options.output = :xml
      options.treebuilder = "rexml"
    end

    o.on("--[no-]html", "Output as html") do |html|
      # --no-html leaves output as nil, which selects no output format.
      options.output = (html ? :html : nil)
    end

    o.on("--hilite", "Output as formatted highlighted code.") do |_hilite|
      options.output = :hilite
    end

    o.on("-e", "--error", "Print a list of parse errors") do |error|
      options.error = error
    end

    o.separator ""
    o.separator "Serialization Options:"

    o.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
      options.serializer[:omit_optional_tags] = omit
    end

    o.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
      options.serializer[:quote_attr_values] = quote
    end

    o.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
      options.serializer[:use_best_quote_char] = best
    end

    o.on("--quote-char C", "Use specified quote character") do |c|
      options.serializer[:quote_char] = c
    end

    o.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
      options.serializer[:minimize_boolean_attributes] = min
    end

    o.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
      options.serializer[:use_trailing_solidus] = slash
    end

    o.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
      options.serializer[:escape_lt_in_attrs] = lt
    end

    o.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
      options.serializer[:escape_rcdata] = rcdata
    end

    o.separator ""
    o.separator "Other Options:"

    o.on("-p", "--[no-]profile", "Profile the run") do |profile|
      options.profile = profile
    end

    o.on("-t", "--[no-]time", "Time the run") do |time|
      options.time = time
    end

    o.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
      options.encoding = encoding
    end

    o.on_tail("-h", "--help", "Show this message") do
      puts o
      exit
    end
  end

  option_parser.parse!(argv)
  options
end
|
|
|
|
# Turn the command-line input argument into something the parser can read.
#
# f - the input argument:
#     * an "http://" or "https://" URL is fetched through open-uri and the
#       open stream is returned;
#     * "-" returns $stdin;
#     * anything else is treated as a file path and opened for reading.
#
# If opening fails with a StandardError, the argument itself is returned
# unchanged. This keeps the existing best-effort behaviour rather than
# aborting.
#
# When f is nil or false, a message is written to $stderr and the process
# exits with status 1.
def self.open_input(f)
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    if f =~ %r{\Ahttps?://}
      require 'open-uri'
      URI.parse(f).open
    elsif f == '-'
      $stdin
    else
      # File.open rather than Kernel#open: the latter would run a subprocess
      # for an argument starting with "|".
      File.open(f)
    end
  rescue StandardError
    # Best-effort fallback: return the argument itself.
    f
  end
end
|
|
|
|
# Parse the input named by the last element of args and report the result.
#
# opts - the options object produced by parse_opts; this method reads
#        treebuilder, output, parsemethod, container, profile and time.
# args - Array of remaining command-line arguments; only args.last is used
#        and is handed to open_input.
#
# With opts.profile set, the parse is run under the profiler and the
# profile is printed to $stderr; the document itself is not printed.
# With opts.time set, the parse and the printing are timed separately and
# the timings are printed after the output. Otherwise the document is
# parsed and passed to print_output.
def self.parse(opts, args)
  encoding = nil

  input = open_input(args.last)

  require 'html5/treebuilders'
  tree_class = HTML5::TreeBuilders[opts.treebuilder]

  parser =
    if opts.output == :xml
      require 'html5/liberalxmlparser'
      HTML5::XMLParser.new(:tree => tree_class)
    else
      require 'html5/html5parser'
      HTML5::HTMLParser.new(:tree => tree_class)
    end

  # Fragment parsing takes the container element name as an extra argument.
  call_args =
    if opts.parsemethod == :parse
      [input, encoding]
    else
      [input, (opts.container || 'div'), encoding]
    end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    return Profiler__::print_profile($stderr)
  end

  if opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    return puts("\n\nRun took: #{parsed-started}s (plus #{printed-parsed}s to print the output)")
  end

  document = parser.send(opts.parsemethod, *call_args)
  print_output(parser, document, opts)
end
|
|
|
|
# Print the parsed document in the requested format, followed by the list
# of parse errors if requested.
#
# parser   - the parser that produced document; read here for
#            tokenizer.stream.char_encoding, tree.testSerializer and errors.
# document - the parse result (for :tree output it may be a single node or
#            anything responding to each).
# opts     - options object; this method reads encoding, output,
#            treebuilder, serializer and error.
#
# opts.output selects the format: :xml and :hilite print the document (or
# its hilite form) directly, :html walks the tree and serializes it, :tree
# prints the debug serialization of each fragment. Any other value prints
# no document.
def self.print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    document = [document] unless document.respond_to?(:each)
    document.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  messages = parser.errors.map do |pos, errorcode, datavars|
    template = HTML5::E[errorcode]
    text =
      if template
        # Pass a copy so the shared message table can never be modified by
        # the template renderer.
        PythonicTemplate.new(template.dup).to_s(datavars)
      else
        # datavars does not carry the error code, so interpolate it here.
        "Unknown error \"#{errorcode}\""
      end
    "Line #{pos[0]} Col #{pos[1]} " + text
  end

  $stdout.write("\nParse errors:\n" + messages.join("\n") + "\n")
end
|
|
|
|
# Renders format strings containing Python-style "%(name)" placeholders.
#
# Placeholders are replaced by plain text substitution. The format string
# is never evaluated as Ruby code and the caller's string is never
# modified, so shared or frozen message constants can be passed in safely.
class PythonicTemplate
  # Matches one "%(name)" placeholder and captures the name.
  PLACEHOLDER = /%\((\w+)\)/

  # format - String containing zero or more "%(name)" placeholders.
  def initialize(format)
    @format = format.dup
    @values = {}
  end

  # Render the template.
  #
  # vars - optional collection of name/value pairs (for example a Hash);
  #        names may be Strings or Symbols. Values supplied in earlier
  #        calls on the same instance are remembered.
  #
  # Returns the format string with every placeholder replaced by the
  # string form of its value; placeholders without a value become "".
  def to_s(vars = nil)
    vars.each { |var, value| @values[var.to_s] = value } if vars
    @format.gsub(PLACEHOLDER) { @values[Regexp.last_match(1)].to_s }
  end
end
|
|
|
|
# Command-line entry point: parse the switches out of ARGV, then parse the
# input named by what is left in ARGV and print the result.
def self.run
  parse(parse_opts(ARGV), ARGV)
end
|
|
end
|