Bring up to current.

2007-01-22 08:36:51 -06:00 · 2007-01-22 08:36:51 -06:00 · b19e1e4f47
commit b19e1e4f47
parent 69b62b6f33
71 changed files with 8305 additions and 39 deletions
--- a/lib/syntax/lang/ruby.rb
+++ b/lib/syntax/lang/ruby.rb
@ -0,0 +1,317 @@
+require 'syntax'
+
+module Syntax
+
+  # A tokenizer for the Ruby language. It recognizes all common syntax
+  # (and some less common syntax) but because it is not a true lexer, it
+  # will make mistakes on some ambiguous cases.
+  class Ruby < Tokenizer
+
+    # The list of all identifiers recognized as keywords.
+    KEYWORDS =
+      %w{if then elsif else end begin do rescue ensure while for
+         class module def yield raise until unless and or not when
+         case super undef break next redo retry in return alias
+         defined?}
+
+    # Perform ruby-specific setup
+    def setup
+      @selector = false
+      @allow_operator = false
+      @heredocs = []
+    end
+
+    # Step through a single iteration of the tokenization process.
+    def step
+      case
+        when bol? && check( /=begin/ )
+          start_group( :comment, scan_until( /^=end#{EOL}/ ) )
+        when bol? && check( /__END__#{EOL}/ )
+          start_group( :comment, scan_until( /\Z/ ) )
+      else
+        case
+          when check( /def\s+/ )
+            start_group :keyword, scan( /def\s+/ )
+            start_group :method,  scan_until( /(?=[;(\s]|#{EOL})/ )
+          when check( /class\s+/ )
+            start_group :keyword, scan( /class\s+/ )
+            start_group :class,  scan_until( /(?=[;\s<]|#{EOL})/ )
+          when check( /module\s+/ )
+            start_group :keyword, scan( /module\s+/ )
+            start_group :module,  scan_until( /(?=[;\s]|#{EOL})/ )
+          when check( /::/ )
+            start_group :punct, scan(/::/)
+          when check( /:"/ )
+            start_group :symbol, scan(/:/)
+            scan_delimited_region :symbol, :symbol, "", true
+            @allow_operator = true
+          when check( /:'/ )
+            start_group :symbol, scan(/:/)
+            scan_delimited_region :symbol, :symbol, "", false
+            @allow_operator = true
+          when scan( /:[_a-zA-Z@$][$@\w]*[=!?]?/ )
+            start_group :symbol, matched
+            @allow_operator = true
+          when scan( /\?(\\[^\n\r]|[^\\\n\r\s])/ )
+            start_group :char, matched
+            @allow_operator = true
+          when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
+            if @selector || matched[-1] == ?? || matched[-1] == ?!
+              start_group :ident,
+                scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
+            else
+              start_group :constant,
+                scan(/(__FILE__|__LINE__|true|false|nil|self)/)
+            end
+            @selector = false
+            @allow_operator = true
+          when scan(/0([bB][01]+|[oO][0-7]+|[dD][0-9]+|[xX][0-9a-fA-F]+)/)
+            start_group :number, matched
+            @allow_operator = true
+          else
+            case peek(2)
+              when "%r"
+                scan_delimited_region :punct, :regex, scan( /../ ), true
+                @allow_operator = true
+              when "%w", "%q"
+                scan_delimited_region :punct, :string, scan( /../ ), false
+                @allow_operator = true
+              when "%s"
+                scan_delimited_region :punct, :symbol, scan( /../ ), false
+                @allow_operator = true
+              when "%W", "%Q", "%x"
+                scan_delimited_region :punct, :string, scan( /../ ), true
+                @allow_operator = true
+              when /%[^\sa-zA-Z0-9]/
+                scan_delimited_region :punct, :string, scan( /./ ), true
+                @allow_operator = true
+              when "<<"
+                saw_word = ( chunk[-1,1] =~ /[\w!?]/ )
+                start_group :punct, scan( /<</ )
+                if saw_word
+                  @allow_operator = false
+                  return
+                end
+
+                float_right = scan( /-/ )
+                append "-" if float_right
+                if ( type = scan( /['"]/ ) )
+                  append type
+                  delim = scan_until( /(?=#{type})/ )
+                  if delim.nil?
+                    append scan_until( /\Z/ )
+                    return
+                  end
+                else
+                  delim = scan( /\w+/ ) or return
+                end
+                start_group :constant, delim
+                start_group :punct, scan( /#{type}/ ) if type
+                @heredocs << [ float_right, type, delim ]
+                @allow_operator = true
+              else
+                case peek(1)
+                  when /[\n\r]/
+                    unless @heredocs.empty?
+                      scan_heredoc(*@heredocs.shift)
+                    else
+                      start_group :normal, scan( /\s+/ )
+                    end
+                    @allow_operator = false
+                  when /\s/
+                    start_group :normal, scan( /\s+/ )
+                  when "#"
+                    start_group :comment, scan( /#[^\n\r]*/ )
+                  when /[A-Z]/
+                    start_group @selector ? :ident : :constant, scan( /\w+/ )
+                    @allow_operator = true
+                  when /[a-z_]/
+                    word = scan( /\w+[?!]?/ )
+                    if !@selector && KEYWORDS.include?( word )
+                      start_group :keyword, word
+                      @allow_operator = false
+                    elsif
+                      start_group :ident, word
+                      @allow_operator = true
+                    end
+                    @selector = false
+                  when /\d/
+                    start_group :number,
+                      scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
+                    @allow_operator = true
+                  when '"'
+                    scan_delimited_region :punct, :string, "", true
+                    @allow_operator = true
+                  when '/'
+                    if @allow_operator
+                      start_group :punct, scan(%r{/})
+                      @allow_operator = false
+                    else
+                      scan_delimited_region :punct, :regex, "", true
+                      @allow_operator = true
+                    end
+                  when "'"
+                    scan_delimited_region :punct, :string, "", false
+                    @allow_operator = true
+                  when "."
+                    dots = scan( /\.{1,3}/ )
+                    start_group :punct, dots
+                    @selector = ( dots.length == 1 )
+                  when /[@]/
+                    start_group :attribute, scan( /@{1,2}\w*/ )
+                    @allow_operator = true
+                  when /[$]/
+                    start_group :global, scan(/\$/)
+                    start_group :global, scan( /\w+|./ ) if check(/./)
+                    @allow_operator = true
+                  when /[-!?*\/+=<>(\[\{}:;,&|%]/
+                    start_group :punct, scan(/./)
+                    @allow_operator = false
+                  when /[)\]]/
+                    start_group :punct, scan(/./)
+                    @allow_operator = true
+                  else
+                    # all else just falls through this, to prevent
+                    # infinite loops...
+                    append getch
+                end
+            end
+        end
+      end
+    end
+
+    private
+
+      # Scan a delimited region of text. This handles the simple cases (strings
+      # delimited with quotes) as well as the more complex cases of %-strings
+      # and here-documents.
+      #
+      # * +delim_group+ is the group to use to classify the delimiters of the
+      #   region
+      # * +inner_group+ is the group to use to classify the contents of the
+      #   region
+      # * +starter+ is the text to use as the starting delimiter
+      # * +exprs+ is a boolean flag indicating whether the region is an
+      #   interpolated string or not
+      # * +delim+ is the text to use as the delimiter of the region. If +nil+,
+      #   the next character will be treated as the delimiter.
+      # * +heredoc+ is either +false+, meaning the region is not a heredoc, or
+      #   <tt>:flush</tt> (meaning the delimiter must be flushed left), or
+      #   <tt>:float</tt> (meaning the delimiter doens't have to be flush left).
+      def scan_delimited_region( delim_group, inner_group, starter, exprs,
+        delim=nil, heredoc=false )
+      # begin
+        if !delim
+          start_group delim_group, starter
+          delim = scan( /./ )
+          append delim
+
+          delim = case delim
+            when '{' then '}'
+            when '(' then ')'
+            when '[' then ']'
+            when '<' then '>'
+            else delim
+          end
+        end
+
+        start_region inner_group
+
+        items = "\\\\|"
+        if heredoc
+          items << "(^"
+          items << '\s*' if heredoc == :float
+          items << "#{Regexp.escape(delim)}\s*?)#{EOL}"
+        else
+          items << "#{Regexp.escape(delim)}"
+        end
+        items << "|#(\\$|@@?|\\{)" if exprs
+        items = Regexp.new( items )
+
+        loop do
+          p = pos
+          match = scan_until( items )
+          if match.nil?
+            start_group inner_group, scan_until( /\Z/ )
+            break
+          else
+            text = pre_match[p..-1]
+            start_group inner_group, text if text.length > 0
+            case matched.strip
+              when "\\"
+                unless exprs
+                  case peek(1)
+                    when "'"
+                      scan(/./)
+                      start_group :escape, "\\'"
+                    when "\\"
+                      scan(/./)
+                      start_group :escape, "\\\\"
+                    else
+                      start_group inner_group, "\\"
+                  end
+                else
+                  start_group :escape, "\\"
+                  c = getch
+                  append c
+                  case c
+                    when 'x'
+                      append scan( /[a-fA-F0-9]{1,2}/ )
+                    when /[0-7]/
+                      append scan( /[0-7]{0,2}/ )
+                  end
+                end
+              when delim
+                end_region inner_group
+                start_group delim_group, matched
+                break
+              when /^#/
+                do_highlight = (option(:expressions) == :highlight)
+                start_region :expr if do_highlight
+                start_group :expr, matched
+                case matched[1]
+                  when ?{
+                    depth = 1
+                    content = ""
+                    while depth > 0
+                      p = pos
+                      c = scan_until( /[\{}]/ )
+                      if c.nil?
+                        content << scan_until( /\Z/ )
+                        break
+                      else
+                        depth += ( matched == "{" ? 1 : -1 )
+                        content << pre_match[p..-1]
+                        content << matched if depth > 0
+                      end
+                    end
+                    if do_highlight
+                      subtokenize "ruby", content
+                      start_group :expr, "}"
+                    else
+                      append content + "}"
+                    end
+                  when ?$, ?@
+                    append scan( /\w+/ )
+                end
+                end_region :expr if do_highlight
+              else raise "unexpected match on #{matched}"
+            end
+          end
+        end
+      end
+
+      # Scan a heredoc beginning at the current position.
+      #
+      # * +float+ indicates whether the delimiter may be floated to the right
+      # * +type+ is +nil+, a single quote, or a double quote
+      # * +delim+ is the delimiter to look for
+      def scan_heredoc(float, type, delim)
+        scan_delimited_region( :constant, :string, "", type != "'",
+          delim, float ? :float : :flush )
+      end
+  end
+
+  SYNTAX["ruby"] = Ruby
+
+end
--- a/lib/syntax/lang/xml.rb
+++ b/lib/syntax/lang/xml.rb
@ -0,0 +1,108 @@
+require 'syntax'
+
+module Syntax
+
+  # A simple implementation of an XML lexer. It handles most cases. It is
+  # not a validating lexer, meaning it will happily process invalid XML without
+  # complaining.
+  class XML < Tokenizer
+
+    # Initialize the lexer: scanning starts outside of any tag.
+    def setup
+      @in_tag = false
+    end
+
+    # Step through a single iteration of the tokenization process. This will
+    # yield (potentially) many tokens, and possibly zero tokens.
+    def step
+      start_group :normal, matched if scan( /\s+/ )
+      if @in_tag
+        case
+          when scan( /([-\w]+):([-\w]+)/ )
+            start_group :namespace, subgroup(1)
+            start_group :punct, ":"
+            start_group :attribute, subgroup(2)
+          when scan( /\d+/ )
+            start_group :number, matched
+          when scan( /[-\w]+/ )
+            start_group :attribute, matched
+          when scan( %r{[/?]?>} )
+            @in_tag = false
+            start_group :punct, matched
+          when scan( /=/ )
+            start_group :punct, matched
+          when scan( /["']/ )
+            scan_string matched
+          else
+            append getch
+        end
+      elsif ( text = scan_until( /(?=[<&])/ ) )
+        start_group :normal, text unless text.empty?
+        if scan(/<!--.*?(-->|\Z)/m)
+          start_group :comment, matched
+        else
+          case peek(1)
+            when "<"
+              start_group :punct, getch
+              case peek(1)
+                when "?"
+                  append getch
+                when "/"
+                  append getch
+                when "!"
+                  append getch
+              end
+              start_group :normal, matched if scan( /\s+/ )
+              if scan( /([-\w]+):([-\w]+)/ )
+                start_group :namespace, subgroup(1)
+                start_group :punct, ":"
+                start_group :tag, subgroup(2)
+              elsif scan( /[-\w]+/ )
+                start_group :tag, matched
+              end
+              @in_tag = true
+            when "&"
+              if scan( /&\S{1,10};/ )
+                start_group :entity, matched
+              else
+                start_group :normal, scan( /&/ )
+              end
+          end
+        end
+      else
+        append scan_until( /\Z/ )
+      end
+    end
+
+    private
+
+      # Scan the string starting at the current position, with the given
+      # delimiter character.
+      def scan_string( delim )
+        start_group :punct, delim
+        match = /(?=[&\\]|#{delim})/
+        loop do
+          break unless ( text = scan_until( match ) )
+          start_group :string, text unless text.empty?
+          case peek(1)
+            when "&"
+              if scan( /&\S{1,10};/ )
+                start_group :entity, matched
+              else
+                start_group :string, getch
+              end
+            when "\\"
+              start_group :string, getch
+              append getch || ""
+            when delim
+              start_group :punct, getch
+              break
+          end
+        end
+      end
+
+  end
+
+  SYNTAX["xml"] = XML
+
+end
--- a/lib/syntax/lang/yaml.rb
+++ b/lib/syntax/lang/yaml.rb
@ -0,0 +1,105 @@
+require 'syntax'
+
+module Syntax
+
+  # A simple implementation of a YAML lexer. It handles most cases. It is
+  # not a validating lexer.
+  class YAML < Tokenizer
+
+    # Step through a single iteration of the tokenization process. This will
+    # yield (potentially) many tokens, and possibly zero tokens.
+    def step
+      if bol?
+        case
+          when scan(/---(\s*.+)?$/)
+            start_group :document, matched
+          when scan(/(\s*)([a-zA-Z][-\w]*)(\s*):/)
+            start_group :normal, subgroup(1)
+            start_group :key, subgroup(2)
+            start_group :normal, subgroup(3)
+            start_group :punct, ":"
+          when scan(/(\s*)-/)
+            start_group :normal, subgroup(1)
+            start_group :punct, "-"
+          when scan(/\s*$/)
+            start_group :normal, matched
+          when scan(/#.*$/)
+            start_group :comment, matched
+          else
+            append getch
+        end
+      else
+        case
+          when scan(/[\n\r]+/)
+            start_group :normal, matched
+          when scan(/[ \t]+/)
+            start_group :normal, matched
+          when scan(/!+(.*?^)?\S+/)
+            start_group :type, matched
+          when scan(/&\S+/)
+            start_group :anchor, matched
+          when scan(/\*\S+/)
+            start_group :ref, matched
+          when scan(/\d\d:\d\d:\d\d/)
+            start_group :time, matched
+          when scan(/\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d(\.\d+)? [-+]\d\d:\d\d/)
+            start_group :date, matched
+          when scan(/['"]/)
+            start_group :punct, matched
+            scan_string matched
+          when scan(/:\w+/)
+            start_group :symbol, matched
+          when scan(/[:]/)
+            start_group :punct, matched
+          when scan(/#.*$/)
+            start_group :comment, matched
+          when scan(/>-?/)
+            start_group :punct, matched
+            start_group :normal, scan(/.*$/)
+            append getch until eos? || bol?
+            return if eos?
+            indent = check(/ */)
+            start_group :string
+            loop do
+              line = check_until(/[\n\r]|\Z/)
+              break if line.nil?
+              if line.chomp.length > 0
+                this_indent = line.chomp.match( /^\s*/ )[0]
+                break if this_indent.length < indent.length
+              end
+              append scan_until(/[\n\r]|\Z/)
+            end
+          else
+            start_group :normal, scan_until(/(?=$|#)/)
+        end
+      end
+    end
+
+    private
+
+      def scan_string( delim )
+        regex = /(?=[#{delim=="'" ? "" : "\\\\"}#{delim}])/
+        loop do
+          text = scan_until( regex )
+          if text.nil?
+            start_group :string, scan_until( /\Z/ )
+            break
+          else
+            start_group :string, text unless text.empty?
+          end
+
+          case peek(1)
+            when "\\"
+              start_group :expr, scan(/../)
+            else
+              start_group :punct, getch
+              break
+          end
+        end
+      end
+
+  end
+
+  SYNTAX["yaml"] = YAML
+
+end