Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer.
Fixed the string-handling to work in both
Ruby 1.8.x and 1.9.2. There are still,
inexplicably, two functional tests that
fail. But the rest seems to work quite well.
This commit is contained in:
Jacques Distler 2009-11-30 16:28:18 -06:00
parent 79c8572053
commit a6429f8c22
142 changed files with 519 additions and 843 deletions

View file

@ -0,0 +1,248 @@
$:.unshift File.dirname(__FILE__), 'lib'
require 'html5'
require 'ostruct'
require 'optparse'
module HTML5::CLI
# Turn a command-line argument list into a settings object.
#
# The result starts from the built-in defaults and is then updated by every
# switch found in +argv+. Parsing uses OptionParser#parse!, so recognised
# switches are removed from +argv+ in place and only the non-option
# arguments are left behind for the caller.
#
# argv - Array of argument strings (modified in place).
#
# Returns an OpenStruct with the fields profile, time, output, treebuilder,
# error, encoding, parsemethod and serializer (a Hash of serializer
# options); +container+ is only set when -f/--fragment was given.
# "-h"/"--help" prints the usage text and exits the process.
def self.parse_opts(argv)
  settings = OpenStruct.new(
    :profile     => false,
    :time        => false,
    :output      => :html,
    :treebuilder => 'simpletree',
    :error       => false,
    :encoding    => false,
    :parsemethod => :parse,
    :serializer  => {
      :encoding => 'utf-8',
      :omit_optional_tags => false,
      :inject_meta_charset => false
    }
  )

  # Switches whose only effect is to store their value under a key of the
  # serializer options hash: [switch, description, key].
  filter_switches = [
    ["--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset],
    ["--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace],
    ["--[no-]sanitize", "escape unsafe tags", :sanitize]
  ]
  serialization_switches = [
    ["--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags],
    ["--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values],
    ["--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char],
    ["--quote-char C", "Use specified quote character", :quote_char],
    ["--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes],
    ["--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus],
    ["--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs],
    ["--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata]
  ]

  cli = OptionParser.new do |o|
    o.separator ""
    o.separator "Parse Options:"
    o.on("-b", "--treebuilder NAME") { |name| settings.treebuilder = name }
    o.on("-f", "--fragment CONTAINER", "Parse as a fragment") do |container|
      settings.parsemethod = :parse_fragment
      settings.container = container if container
    end

    o.separator ""
    o.separator "Filter Options:"
    filter_switches.each do |switch, description, key|
      o.on(switch, description) { |value| settings.serializer[key] = value }
    end

    o.separator ""
    o.separator "Output Options:"
    o.on("--tree", "output as debug tree") { settings.output = :tree }
    o.on("-x", "--xml", "output as xml") do
      # XML output also forces the REXML tree builder.
      settings.output = :xml
      settings.treebuilder = "rexml"
    end
    o.on("--[no-]html", "Output as html") do |html|
      settings.output = html ? :html : nil
    end
    o.on("--hilite", "Output as formatted highlighted code.") { settings.output = :hilite }
    o.on("-e", "--error", "Print a list of parse errors") { |flag| settings.error = flag }

    o.separator ""
    o.separator "Serialization Options:"
    serialization_switches.each do |switch, description, key|
      o.on(switch, description) { |value| settings.serializer[key] = value }
    end

    o.separator ""
    o.separator "Other Options:"
    o.on("-p", "--[no-]profile", "Profile the run") { |flag| settings.profile = flag }
    o.on("-t", "--[no-]time", "Time the run") { |flag| settings.time = flag }
    o.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| settings.encoding = flag }
    o.on_tail("-h", "--help", "Show this message") do
      puts o
      exit
    end
  end

  cli.parse!(argv)
  settings
end
# Resolve the input argument given on the command line to a readable source.
#
# f - the input argument:
#     * an "http://" or "https://" URL, fetched through open-uri;
#     * "-" for standard input;
#     * anything else is treated as a path and opened with File.open.
#     When +f+ is nil a hint is written to $stderr and the process exits
#     with status 1.
#
# Returns the opened IO-like object. If the argument cannot be opened for
# any reason (missing file, network failure, malformed URL, ...) the
# argument itself is returned unchanged; this best-effort fallback is
# intentional and lets the caller decide what to do with the raw string.
#
# NOTE: the charset reported by an HTTP response is not returned from here;
# callers that need it must read it from the returned object themselves.
def self.open_input(f)
  unless f
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    if f =~ %r{\Ahttps?://}
      require 'open-uri'
      URI.parse(f).open
    elsif f == '-'
      $stdin
    else
      # File.open, not Kernel#open: the latter runs a subprocess when the
      # argument starts with "|", which must never happen for a file name.
      File.open(f)
    end
  rescue StandardError
    # Deliberate best effort: hand back the argument as given.
    f
  end
end
# Parse the input named by the last command-line argument and report on it.
#
# opts - settings object as produced by parse_opts (reads treebuilder,
#        output, parsemethod, container, profile and time).
# args - remaining command-line arguments; only the last one is used, as
#        the input to hand to open_input.
#
# An XML parser is used when the requested output is :xml, an HTML parser
# otherwise. Depending on the settings the parse is profiled (profile
# printed to $stderr, no document output), timed (document printed, then
# the timings), or simply run and printed via print_output.
def self.parse(opts, args)
  # No encoding override is determined here, so nil is always passed on.
  encoding = nil
  source = open_input(args.last)

  require 'html5/treebuilders'
  builder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    parser_class = HTML5::XMLParser
  else
    require 'html5/html5parser'
    parser_class = HTML5::HTMLParser
  end
  parser = parser_class.new(:tree => builder)

  # parse takes (input, encoding); parse_fragment takes
  # (input, container, encoding) with 'div' as the default container.
  call_args = [source]
  call_args << (opts.container || 'div') unless opts.parsemethod == :parse
  call_args << encoding

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: #{parsed - started}s (plus #{printed - parsed}s to print the output)"
  else
    print_output(parser, parser.send(opts.parsemethod, *call_args), opts)
  end
end
# Write the result of a parse to standard output.
#
# parser   - the parser that produced +document+; consulted for the
#            detected encoding, the tree's test serializer and the list of
#            parse errors.
# document - the parse result (a single node, or a collection of fragments).
# opts     - settings object; reads encoding, output, treebuilder,
#            serializer and error.
#
# opts.output selects the rendering: :xml and :hilite print the document
# (or its highlighted form) directly, :html serializes it through a tree
# walker, :tree prints the debug tree of every fragment, and any other
# value prints nothing. When opts.error is set, one "Line L Col C message"
# entry per parse error follows.
def self.print_output(parser, document, opts)
  puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    # A lone node is wrapped so both cases can be iterated the same way.
    document = [document] unless document.respond_to?(:each)
    document.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  error_lines = parser.errors.map do |pos, errorcode, datavars|
    formatstring = HTML5::E[errorcode]
    if formatstring
      vars = datavars
    else
      # The fallback message refers to %(errorcode), which is not one of
      # the error's own variables, so it has to be supplied explicitly.
      formatstring = 'Unknown error "%(errorcode)"'
      vars = { 'errorcode' => errorcode.to_s }
      datavars.each { |name, value| vars[name] = value } if datavars
    end
    # Format a private copy so the shared HTML5::E message strings can
    # never be modified as a side effect of formatting.
    message = PythonicTemplate.new(formatstring.dup).to_s(vars)
    "Line #{pos[0]} Col #{pos[1]} " + message
  end
  $stdout.write("\nParse errors:\n" + error_lines.join("\n") + "\n")
end
# Minimal formatter for Python-style named placeholders.
#
# A template such as 'Unexpected "%(name)" here' is rendered by replacing
# every %(word) with the value registered for that word; placeholders with
# no registered value render as an empty string.
class PythonicTemplate
  PLACEHOLDER = /%\((\w+)\)/

  # format - the template String. It is copied, never modified, so shared
  #          or frozen strings are safe to pass in.
  def initialize(format)
    @format = format.dup
    @values = {}
  end

  # Render the template.
  #
  # vars - optional collection of name/value pairs (typically a Hash with
  #        String or Symbol keys). Values are remembered by the instance,
  #        so later calls without +vars+ reuse them.
  #
  # Returns the formatted String. The template text is taken literally:
  # nothing in it is evaluated as Ruby code, and values are converted with
  # to_s.
  def to_s(vars = nil)
    vars.each { |name, value| @values[name.to_s] = value } if vars
    @format.gsub(PLACEHOLDER) { @values[$1].to_s }
  end
end
# Command-line entry point: read the switches from ARGV, then parse the
# input named by what is left of it.
#
# parse_opts is evaluated first and strips the recognised switches out of
# ARGV, so parse receives only the remaining arguments.
def self.run
  parse(parse_opts(ARGV), ARGV)
end
end