$:.unshift File.dirname(__FILE__), 'lib'

require 'html5'
require 'ostruct'
require 'optparse'

# Command-line front end for the HTML5 parser: reads a document (file, URL,
# stdin or literal markup), parses it with the selected tree builder and
# prints it as HTML, XML, a debug tree or highlighted code.
module HTML5::CLI
  # Parses command-line switches.
  #
  # argv:: array of command-line arguments; recognised switches are removed
  #        from it in place (OptionParser#parse!), leaving the positional
  #        arguments behind.
  #
  # Returns an OpenStruct holding the parse, output and serializer settings.
  # Prints the usage text and exits when -h/--help is given.
  def self.parse_opts(argv)
    options = OpenStruct.new
    options.profile = false
    options.time = false
    options.output = :html
    options.treebuilder = 'simpletree'
    options.error = false
    options.encoding = false
    options.parsemethod = :parse
    options.serializer = {
      :encoding => 'utf-8',
      :omit_optional_tags => false,
      :inject_meta_charset => false
    }

    option_parser = OptionParser.new do |o|
      o.separator ""
      o.separator "Parse Options:"

      o.on("-b", "--treebuilder NAME") do |treebuilder|
        options.treebuilder = treebuilder
      end

      o.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
        options.parsemethod = :parse_fragment
        options.container = container if container
      end

      o.separator ""
      o.separator "Filter Options:"

      o.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
        options.serializer[:inject_meta_charset] = inject
      end

      o.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
        options.serializer[:strip_whitespace] = strip
      end

      o.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
        options.serializer[:sanitize] = sanitize
      end

      o.separator ""
      o.separator "Output Options:"

      o.on("--tree", "output as debug tree") do
        options.output = :tree
      end

      # XML output is produced by printing the tree itself, so it forces the
      # rexml tree builder.
      o.on("-x", "--xml", "output as xml") do
        options.output = :xml
        options.treebuilder = "rexml"
      end

      # --no-html switches output off entirely (no `case` branch matches nil).
      o.on("--[no-]html", "Output as html") do |html|
        options.output = (html ? :html : nil)
      end

      o.on("--hilite", "Output as formatted highlighted code.") do
        options.output = :hilite
      end

      o.on("-e", "--error", "Print a list of parse errors") do |error|
        options.error = error
      end

      o.separator ""
      o.separator "Serialization Options:"

      o.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
        options.serializer[:omit_optional_tags] = omit
      end

      o.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
        options.serializer[:quote_attr_values] = quote
      end

      o.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
        options.serializer[:use_best_quote_char] = best
      end

      o.on("--quote-char C", "Use specified quote character") do |c|
        options.serializer[:quote_char] = c
      end

      o.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
        options.serializer[:minimize_boolean_attributes] = min
      end

      o.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
        options.serializer[:use_trailing_solidus] = slash
      end

      o.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
        options.serializer[:escape_lt_in_attrs] = lt
      end

      o.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
        options.serializer[:escape_rcdata] = rcdata
      end

      o.separator ""
      o.separator "Other Options:"

      o.on("-p", "--[no-]profile", "Profile the run") do |profile|
        options.profile = profile
      end

      o.on("-t", "--[no-]time", "Time the run") do |time|
        options.time = time
      end

      o.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
        options.encoding = encoding
      end

      o.on_tail("-h", "--help", "Show this message") do
        puts o
        exit
      end
    end

    option_parser.parse!(argv)
    options
  end

  # Resolves the input argument to something the parser can read.
  #
  # f:: an http(s) URL, '-' for standard input, or a file name.
  #
  # Returns the opened stream. When the argument cannot be opened the
  # argument itself is returned unchanged, so the caller ends up parsing the
  # literal text; this best-effort fallback is intentional and kept.
  # Writes a hint to stderr and exits with status 1 when f is nil.
  def self.open_input(f)
    unless f
      $stderr.write("No filename provided. Use -h for help\n")
      exit(1)
    end

    begin
      if f.start_with?('http://', 'https://')
        require 'open-uri'
        URI.parse(f).open
      elsif f == '-'
        $stdin
      else
        # File.open rather than Kernel#open: the latter would run a
        # subprocess for an argument beginning with "|".
        File.open(f)
      end
    rescue StandardError
      f
    end
  end

  # Parses the input named by the last positional argument and prints the
  # result according to opts.
  #
  # opts:: the OpenStruct produced by parse_opts.
  # args:: remaining command-line arguments; only the last one is used.
  def self.parse(opts, args)
    input = open_input(args.last)

    # Resources fetched through open-uri expose the charset declared in the
    # Content-Type header. The block form yields nil instead of a guessed
    # default when the server declared none.
    encoding = input.charset { nil } if input.respond_to?(:charset)

    require 'html5/treebuilders'
    treebuilder = HTML5::TreeBuilders[opts.treebuilder]

    if opts.output == :xml
      require 'html5/liberalxmlparser'
      parser = HTML5::XMLParser.new(:tree => treebuilder)
    else
      require 'html5/html5parser'
      parser = HTML5::HTMLParser.new(:tree => treebuilder)
    end

    parse_args =
      if opts.parsemethod == :parse
        [input, encoding]
      else
        [input, (opts.container || 'div'), encoding]
      end

    if opts.profile
      # Profiling only reports timings; the parsed document is not printed.
      require 'profiler'
      Profiler__::start_profile
      parser.send(opts.parsemethod, *parse_args)
      Profiler__::stop_profile
      Profiler__::print_profile($stderr)
    elsif opts.time
      # TODO: switch to benchmark
      started = Time.now
      document = parser.send(opts.parsemethod, *parse_args)
      parsed = Time.now
      print_output(parser, document, opts)
      printed = Time.now
      puts "\n\nRun took: #{parsed - started}s (plus #{printed - parsed}s to print the output)"
    else
      document = parser.send(opts.parsemethod, *parse_args)
      print_output(parser, document, opts)
    end
  end

  # Writes the parsed document to stdout in the format chosen by opts.output,
  # preceded by the detected encoding (opts.encoding) and followed by the
  # list of parse errors (opts.error) when those were requested.
  #
  # parser::   the parser that produced document.
  # document:: the parse result (a tree, or a collection of fragments).
  # opts::     the OpenStruct produced by parse_opts.
  def self.print_output(parser, document, opts)
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

    case opts.output
    when :xml
      print document
    when :html
      require 'html5/treewalkers'
      tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
      require 'html5/serializer'
      puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
    when :hilite
      print document.hilite
    when :tree
      document = [document] unless document.respond_to?(:each)
      document.each { |fragment| puts parser.tree.testSerializer(fragment) }
    end

    return unless opts.error

    error_lines = parser.errors.map do |pos, errorcode, datavars|
      format = HTML5::E[errorcode]
      vars = datavars
      unless format
        # The fallback message refers to the code itself, which is not part
        # of the error's own variables.
        format = 'Unknown error "%(errorcode)"'
        vars = { 'errorcode' => errorcode.to_s }
      end
      "Line #{pos[0]} Col #{pos[1]} " + PythonicTemplate.new(format).to_s(vars)
    end
    $stdout.write("\nParse errors:\n" + error_lines.join("\n") + "\n")
  end

  # Renders Python-style "%(name)" placeholders in a message format.
  class PythonicTemplate
    PLACEHOLDER = /%\((\w+)\)/

    # format:: the message template. It is copied, never modified, so shared
    #          message tables can be reused for every error.
    def initialize(format)
      @format = format.dup
      @values = {}
    end

    # Substitutes every "%(name)" placeholder with the matching value.
    #
    # vars:: optional name => value pairs (names may be strings or symbols).
    #        Values given in earlier calls on the same instance remain
    #        available. Unknown names render as an empty string.
    #
    # Returns the rendered string.
    def to_s(vars = nil)
      vars.each { |name, value| @values[name.to_s] = value } if vars
      @format.gsub(PLACEHOLDER) { @values[$1].to_s }
    end
  end

  # Entry point: parses ARGV for options, then processes the remaining
  # arguments.
  def self.run
    options = parse_opts ARGV
    parse options, ARGV
  end
end