#!/usr/bin/env ruby

require 'core_ext/string'
$:.unshift File.dirname(__FILE__), 'lib'

def parse(opts, args)
  encoding = nil

  f = args[-1]
  if f
    begin
      if f[0..6] == 'http://'
        require 'open-uri'
        f = URI.parse(f).open
        encoding = f.charset
      elsif f == '-'
        f = $stdin
      else
        f = open(f)
      end
    rescue
    end
  else
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  require 'html5/treebuilders'
  treebuilder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    p = HTML5::XMLParser.new(:tree=>treebuilder)
  else
    require 'html5/html5parser'
    p = HTML5::HTMLParser.new(:tree=>treebuilder)
  end

  if opts.parsemethod == :parse
    args = [f, encoding]
  else
    args = [f, 'div', encoding]
  end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    p.send(opts.parsemethod, *args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    t0 = Time.new
    document = p.send(opts.parsemethod, *args)
    t1 = Time.new
    print_output(p, document, opts)
    t2 = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
  else
    document = p.send(opts.parsemethod, *args)
    print_output(p, document, opts)
  end
end

def print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    document = [document] unless document.respond_to?(:each)
    document.each {|fragment| puts parser.tree.testSerializer(fragment)}
  end

  if opts.error
    errList=[]
    for pos, errorcode, datavars in parser.errors
      errList << "Line #{pos[0]} Col #{pos[1]} " + (HTML5::E[errorcode] || "Unknown error \"#{errorcode}\"") % datavars
    end
    $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n")
  end
end

require 'ostruct'
options = OpenStruct.new
options.profile = false
options.time = false
options.output = :html
options.treebuilder = 'simpletree'
options.error = false
options.encoding = false
options.parsemethod = :parse
options.serializer = {
  :encoding => 'utf-8',
  :omit_optional_tags => false,
  :inject_meta_charset => false
}

require 'optparse'
opts = OptionParser.new do |opts|
  opts.separator ""
  opts.separator "Parse Options:"

  opts.on("-b", "--treebuilder NAME") do |treebuilder|
    options.treebuilder = treebuilder
  end

  opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
    options.parsemethod = :parse_fragment
  end

  opts.separator ""
  opts.separator "Filter Options:"

  opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
    options.serializer[:inject_meta_charset] = inject
  end

  opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
    options.serializer[:strip_whitespace] = strip
  end

  opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
    options.serializer[:sanitize] = sanitize
  end

  opts.separator ""
  opts.separator "Output Options:"

  opts.on("--tree", "output as debug tree") do |tree|
    options.output = :tree
  end
  
  opts.on("-x", "--xml", "output as xml") do |xml|
    options.output = :xml
    options.treebuilder = "rexml"
  end
  
  opts.on("--[no-]html", "Output as html") do |html|
    options.output = (html ? :html : nil)
  end
  
  opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
    options.output = :hilite
  end
  
  opts.on("-e", "--error", "Print a list of parse errors") do |error|
    options.error = error
  end

  opts.separator ""
  opts.separator "Serialization Options:"

  opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
    options.serializer[:omit_optional_tags] = omit
  end

  opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
    options.serializer[:quote_attr_values] = quote
  end

  opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
    options.serializer[:use_best_quote_char] = best
  end

  opts.on("--quote-char C", "Use specified quote character") do |c|
    options.serializer[:quote_char] = c
  end

  opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
    options.serializer[:minimize_boolean_attributes] = min
  end

  opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
    options.serializer[:use_trailing_solidus] = slash
  end

  opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
    options.serializer[:escape_lt_in_attrs] = lt
  end

  opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
    options.serializer[:escape_rcdata] = rcdata
  end

  opts.separator ""
  opts.separator "Other Options:"

  opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
    options.profile = profile
  end
    
  opts.on("-t", "--[no-]time", "Time the run") do |time|
    options.time = time
  end
    
  opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
    options.encoding = encoding
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

opts.parse!(ARGV)
parse options, ARGV