require 'syntax'

module Syntax

  # A tokenizer for the Ruby language. It recognizes all common syntax
  # (and some less common syntax) but because it is not a true lexer, it
  # will make mistakes on some ambiguous cases.
  class Ruby < Tokenizer

    # The list of all identifiers recognized as keywords.
    KEYWORDS = %w{if then elsif else end begin do rescue ensure while for
                  class module def yield raise until unless and or not when
                  case super undef break next redo retry in return alias
                  defined?}

    # Perform ruby-specific setup: reset the per-run tokenizer state.
    #
    # * @selector is consulted in +step+ to classify words such as +self+ or
    #   +nil+ as plain identifiers instead of constants.
    # * @allow_operator is set to +true+ after tokens that complete a value
    #   (symbols, chars, numbers, delimited regions, closing brackets).
    # * @heredocs starts out empty; nothing in the surviving text of +step+
    #   fills or drains it (presumably the lost part of +step+ did).
    def setup
      @selector = false
      @allow_operator = false
      @heredocs = []
    end

    # Step through a single iteration of the tokenization process.
    #
    # Each call consumes at least one character: the final +else+ branch
    # appends a single character so the scan position always advances.
    def step
      case
        # Block comments (=begin ... =end) are only recognized at the start
        # of a line.
        when bol? && check( /=begin/ )
          start_group( :comment, scan_until( /^=end#{EOL}/ ) )
        # Everything after __END__ is emitted as one comment group.
        when bol? && check( /__END__#{EOL}/ )
          start_group( :comment, scan_until( /\Z/ ) )
        else
          case
            # def/class/module: emit the keyword (with trailing whitespace),
            # then the name that follows as its own group.
            when check( /def\s+/ )
              start_group :keyword, scan( /def\s+/ )
              start_group :method, scan_until( /(?=[;(\s]|#{EOL})/ )
            when check( /class\s+/ )
              start_group :keyword, scan( /class\s+/ )
              start_group :class, scan_until( /(?=[;\s<]|#{EOL})/ )
            when check( /module\s+/ )
              start_group :keyword, scan( /module\s+/ )
              start_group :module, scan_until( /(?=[;\s]|#{EOL})/ )
            # "::" is tested before the symbol patterns so that it is not
            # read as the start of a symbol.
            when check( /::/ )
              start_group :punct, scan(/::/)
            # Quoted symbols: the ":" is emitted first, then the quoted part
            # is scanned as a delimited region (interpolating for :"...").
            when check( /:"/ )
              start_group :symbol, scan(/:/)
              scan_delimited_region :symbol, :symbol, "", true
              @allow_operator = true
            when check( /:'/ )
              start_group :symbol, scan(/:/)
              scan_delimited_region :symbol, :symbol, "", false
              @allow_operator = true
            when scan( /:[_a-zA-Z@$][$@\w]*[=!?]?/ )
              start_group :symbol, matched
              @allow_operator = true
            # Character literals such as ?a or ?\n
            when scan( /\?(\\[^\n\r]|[^\\\n\r\s])/ )
              start_group :char, matched
              @allow_operator = true
            # Pseudo-constants. With a trailing ? or !, or when @selector is
            # set, the word is emitted as an ordinary identifier instead.
            when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
              if @selector || matched[-1] == ?? || matched[-1] == ?!
                start_group :ident, scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
              else
                start_group :constant, scan(/(__FILE__|__LINE__|true|false|nil|self)/)
              end
              @selector = false
              @allow_operator = true
            # Integer literals with an explicit radix prefix (0b, 0o, 0d, 0x)
            when scan(/0([bB][01]+|[oO][0-7]+|[dD][0-9]+|[xX][0-9a-fA-F]+)/)
              start_group :number, matched
              @allow_operator = true
            else
              # Dispatch on the next two characters: %-literals and "<<".
              case peek(2)
                when "%r"
                  scan_delimited_region :punct, :regex, scan( /../ ), true
                  @allow_operator = true
                when "%w", "%q"
                  scan_delimited_region :punct, :string, scan( /../ ), false
                  @allow_operator = true
                when "%s"
                  scan_delimited_region :punct, :symbol, scan( /../ ), false
                  @allow_operator = true
                when "%W", "%Q", "%x"
                  scan_delimited_region :punct, :string, scan( /../ ), true
                  @allow_operator = true
                # A bare "%" followed by a non-alphanumeric, non-space
                # character: only the "%" is consumed as the starter.
                when /%[^\sa-zA-Z0-9]/
                  scan_delimited_region :punct, :string, scan( /./ ), true
                  @allow_operator = true
                when "<<"
                  saw_word = ( chunk[-1,1] =~ /[\w!?]/ )
                  # NOTE(review): the source is damaged at this point. The
                  # regexp literal below starts as the "<<" scan and ends as
                  # the tail of a punctuation character class, so everything
                  # in between has been lost: the rest of this branch and,
                  # judging by the five +end+s that close +step+, the opening
                  # of one more nested +case+. Presumably that is also where
                  # @heredocs, scan_heredoc and @selector are driven from.
                  # TODO: restore from a pristine copy; the tokens below are
                  # left exactly as found and will not parse as they stand.
                  start_group :punct, scan( /<(\[\{}:;,&|%]/
                      start_group :punct, scan(/./)
                      @allow_operator = false
                    when /[)\]]/
                      start_group :punct, scan(/./)
                      @allow_operator = true
                    else
                      # all else just falls through this, to prevent
                      # infinite loops...
                      append getch
                  end
              end
          end
      end
    end

    private

      # Scan a delimited region of text. This handles the simple cases (strings
      # delimited with quotes) as well as the more complex cases of %-strings
      # and here-documents.
      #
      # * +delim_group+ is the group to use to classify the delimiters of the
      #   region
      # * +inner_group+ is the group to use to classify the contents of the
      #   region
      # * +starter+ is the text to use as the starting delimiter
      # * +exprs+ is a boolean flag indicating whether the region is an
      #   interpolated string or not
      # * +delim+ is the text to use as the delimiter of the region. If +nil+,
      #   the next character will be treated as the delimiter.
      # * +heredoc+ is either +false+, meaning the region is not a heredoc, or
      #   :flush (meaning the delimiter must be flushed left), or
      #   :float (meaning the delimiter doesn't have to be flush left).
# Tokenize one delimited region (quoted string, %-literal, regex, symbol or
# here-document body) starting at the current scan position.
#
# * +delim_group+ - group used for the opening and closing delimiters
# * +inner_group+ - group (and region) used for the contents
# * +starter+ - text introducing the region (e.g. "%w"); it is only emitted
#   when +delim+ is +nil+
# * +exprs+ - truthy when the region interpolates (<tt>#{..}</tt>,
#   <tt>#$x</tt>, <tt>#@x</tt>); it also selects which backslash handling
#   is applied
# * +delim+ - the closing delimiter. When +nil+, the next character is
#   consumed as the opening delimiter and its bracket partner (or the
#   character itself) closes the region.
# * +heredoc+ - +false+, :flush (terminator must start its line) or :float
#   (terminator may be preceded by whitespace)
#
# If the input ends before the closing delimiter is found, the remaining
# text is emitted as +inner_group+ and the region is left open. Raises
# RuntimeError if the boundary pattern matches something unexpected.
def scan_delimited_region( delim_group, inner_group, starter, exprs,
                           delim=nil, heredoc=false )
  unless delim
    start_group delim_group, starter
    opener = scan( /./ )
    # Nothing usable follows the starter (end of input or a line break):
    # stop here instead of building a pattern from nil.
    return if opener.nil?
    append opener
    delim = closing_delimiter_for( opener )
  end

  start_region inner_group
  boundary = region_boundary_pattern( delim, heredoc, exprs )

  loop do
    found = scan_until( boundary )
    if found.nil?
      # Unterminated region: whatever is left belongs to it.
      start_group inner_group, scan_until( /\Z/ )
      break
    end

    # +found+ runs from the previous scan position through the end of the
    # match, so dropping the match leaves the plain content. Slicing the
    # result (rather than indexing pre_match with pos) stays correct for
    # multibyte text, assuming the scanner reports pos in bytes the way
    # StringScanner does.
    text = found[0, found.length - matched.length]
    start_group inner_group, text unless text.empty?

    case matched.strip
    when "\\"
      if exprs
        scan_interpolating_escape
      else
        scan_literal_escape( inner_group )
      end
    when delim
      end_region inner_group
      start_group delim_group, matched
      break
    when /^#/
      scan_embedded_expression
    else
      raise "unexpected match on #{matched}"
    end
  end
end

# Map an opening bracket to its closing partner; any other character closes
# itself.
def closing_delimiter_for( opener )
  case opener
  when '{' then '}'
  when '(' then ')'
  when '[' then ']'
  when '<' then '>'
  else opener
  end
end

# Build the regexp that finds the next point of interest inside a region: a
# backslash, the terminator, or (when +exprs+) the start of an interpolation.
def region_boundary_pattern( delim, heredoc, exprs )
  alternatives = [ "\\\\" ]
  if heredoc
    indent = ( heredoc == :float ) ? '\s*' : ""
    # NB: "\s" inside a double-quoted string is a literal space, so only
    # spaces are tolerated between the terminator and the end of the line.
    alternatives << "(^#{indent}#{Regexp.escape(delim)}\s*?)#{EOL}"
  else
    alternatives << Regexp.escape(delim)
  end
  alternatives << "#(\\$|@@?|\\{)" if exprs
  Regexp.new( alternatives.join( "|" ) )
end

# Handle a backslash inside a non-interpolating region: only \' and \\ are
# emitted as escapes, any other backslash is ordinary content.
def scan_literal_escape( inner_group )
  case peek(1)
  when "'"
    scan(/./)
    start_group :escape, "\\'"
  when "\\"
    scan(/./)
    start_group :escape, "\\\\"
  else
    start_group inner_group, "\\"
  end
end

# Handle a backslash inside an interpolating region: the escaped character
# and any hex (after "x") or octal digits are appended to the :escape group.
# Missing pieces (input ends, no digits follow) are skipped, not appended.
def scan_interpolating_escape
  start_group :escape, "\\"
  escaped = getch
  append escaped if escaped
  digits = case escaped
           when 'x' then scan( /[a-fA-F0-9]{1,2}/ )
           when /[0-7]/ then scan( /[0-7]{0,2}/ )
           end
  append digits if digits
end

# Handle an interpolation whose introducer ("#{", "#$", "#@" or "#@@") is the
# current match. With option(:expressions) == :highlight the braces' content
# is handed to subtokenize inside an :expr region; otherwise it is appended
# to the :expr group as plain text.
def scan_embedded_expression
  do_highlight = ( option(:expressions) == :highlight )
  start_region :expr if do_highlight
  start_group :expr, matched
  case matched[1, 1]
  when "{"
    content, closed = scan_balanced_braces
    # The closing brace is only emitted when it was really consumed, so the
    # output never contains a character that is absent from the input.
    if do_highlight
      subtokenize "ruby", content
      start_group :expr, "}" if closed
    else
      append( closed ? content + "}" : content )
    end
  when "$", "@"
    name = scan( /\w+/ )
    append name if name
  end
  end_region :expr if do_highlight
end

# Consume text up to the "}" that balances an already-consumed "{". Returns
# [content, closed]: the text before the balancing brace, and whether that
# brace was found (false when the input ended first).
def scan_balanced_braces
  depth = 1
  content = "".dup
  while depth > 0
    piece = scan_until( /[\{}]/ )
    if piece.nil?
      content << scan_until( /\Z/ )
      break
    end
    depth += ( matched == "{" ? 1 : -1 )
    content << piece[0, piece.length - matched.length]
    content << matched if depth > 0
  end
  [ content, depth.zero? ]
end

# Scan a heredoc beginning at the current position.
# # * +float+ indicates whether the delimiter may be floated to the right # * +type+ is +nil+, a single quote, or a double quote # * +delim+ is the delimiter to look for def scan_heredoc(float, type, delim) scan_delimited_region( :constant, :string, "", type != "'", delim, float ? :float : :flush ) end end SYNTAX["ruby"] = Ruby end