require 'strscan'

module Syntax

  # A single token extracted by a tokenizer. It is simply the lexeme
  # itself, decorated with a 'group' attribute to identify the type of the
  # lexeme.
  class Token < String

    # the type of the lexeme that was extracted.
    attr_reader :group

    # the instruction associated with this token (:none, :region_open, or
    # :region_close)
    attr_reader :instruction

    # Create a new Token representing the given text, and belonging to the
    # given group.
    def initialize( text, group, instruction = :none )
      super text
      @group = group
      @instruction = instruction
    end

  end
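
  # A usage sketch: a Token behaves like the String it wraps, with the group
  # and instruction attributes riding along.
  #
  #   token = Syntax::Token.new( "def", :keyword )
  #   token              #=> "def"
  #   token.group        #=> :keyword
  #   token.instruction  #=> :none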

  # The base class of all tokenizers. It sets up the scanner and manages the
  # looping until all tokens have been extracted. It also provides convenience
  # methods to make sure adjacent tokens of identical groups are returned as
  # a single token.
  class Tokenizer

    # The current group being processed by the tokenizer
    attr_reader :group

    # The current chunk of text being accumulated
    attr_reader :chunk

    # Start tokenizing. This sets up the state in preparation for tokenization,
    # such as creating a new scanner for the text and saving the callback block.
    # The block will be invoked for each token extracted.
    def start( text, &block )
      @chunk = ""
      @group = :normal
      @callback = block
      @text = StringScanner.new( text )
      setup
    end

    # Subclasses may override this method to provide implementation-specific
    # setup logic.
    def setup
    end

    # Finish tokenizing. This flushes the buffer, yielding any remaining text
    # to the client.
    def finish
      start_group nil
      teardown
    end

    # Subclasses may override this method to provide implementation-specific
    # teardown logic.
    def teardown
    end

    # Subclasses must implement this method, which is called for each iteration
    # of the tokenization process. This method may extract multiple tokens.
    def step
      raise NotImplementedError, "subclasses must implement #step"
    end

    # Begins tokenizing the given text, calling #step until the text has been
    # exhausted.
    def tokenize( text, &block )
      start text, &block
      step until @text.eos?
      finish
    end
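
    # A minimal sketch of a concrete tokenizer (the WordTokenizer name is
    # made up for illustration): only #step must be implemented, using the
    # private helpers below to emit tokens.
    #
    #   class WordTokenizer < Syntax::Tokenizer
    #     def step
    #       if word = scan( /\w+/ )
    #         start_group :ident, word
    #       else
    #         start_group :normal, getch
    #       end
    #     end
    #   end
    #
    #   WordTokenizer.new.tokenize( "two words" ) do |token|
    #     puts "#{token.group}: #{token.inspect}"
    #   end
    #   # prints ident: "two", normal: " ", ident: "words"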

    # Specify a set of tokenizer-specific options. Each tokenizer may (or may
    # not) publish any options, but if a tokenizer does, those options may be
    # used to specify optional behavior.
    def set( opts={} )
      ( @options ||= Hash.new ).update opts
    end

    # Get the value of the specified option.
    def option(opt)
      @options ? @options[opt] : nil
    end
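
    # A usage sketch, reusing the hypothetical WordTokenizer above: options
    # are just a hash that a tokenizer implementation may consult via #option.
    # The :fold_case key is invented for illustration.
    #
    #   tokenizer = WordTokenizer.new
    #   tokenizer.set :fold_case => true
    #   tokenizer.option(:fold_case)  #=> true
    #   tokenizer.option(:other)      #=> nil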

    private

    EOL = /(?=\r\n?|\n|$)/

    # A convenience for delegating method calls to the scanner.
    def self.delegate( sym )
      define_method( sym ) { |*a| @text.__send__( sym, *a ) }
    end

    delegate :bol?
    delegate :eos?
    delegate :scan
    delegate :scan_until
    delegate :check
    delegate :check_until
    delegate :getch
    delegate :matched
    delegate :pre_match
    delegate :peek
    delegate :pos
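
    # In effect, each delegate call above defines a thin wrapper around the
    # scanner; `delegate :scan`, for instance, behaves roughly like:
    #
    #   def scan( *args )
    #     @text.scan( *args )
    #   end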

    # Access the n-th subgroup from the most recent match.
    def subgroup(n)
      @text[n]
    end
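
    # For example, after `scan( /(\w+)=(\w+)/ )` matches "a=1", subgroup(1)
    # returns "a" and subgroup(2) returns "1" (via StringScanner#[]).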

    # Append the given data to the currently active chunk.
    def append( data )
      @chunk << data
    end

    # Request that a new group be started. If the current group is the same
    # as the group being requested, a new group will not be created. If a new
    # group is created and the current chunk is not empty, the chunk's
    # contents will be yielded to the client as a token, and then cleared.
    #
    # After the new group is started, if +data+ is non-nil it will be appended
    # to the chunk.
    def start_group( gr, data=nil )
      flush_chunk if gr != @group
      @group = gr
      @chunk << data if data
    end
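
    # A sketch of the coalescing behavior: these calls, issued from #step,
    # would produce a single :ident token "foobar"; the " " is accumulated
    # under :normal and emitted on the next flush (or by #finish).
    #
    #   start_group :ident, "foo"
    #   start_group :ident, "bar"
    #   start_group :normal, " "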

    # Open a region belonging to the given group. The current chunk is
    # flushed, the group is switched, and a token carrying the :region_open
    # instruction is emitted.
    def start_region( gr, data=nil )
      flush_chunk
      @group = gr
      @callback.call( Token.new( data||"", @group, :region_open ) )
    end

    # Close a region belonging to the given group. The current chunk is
    # flushed, the group is switched, and a token carrying the :region_close
    # instruction is emitted.
    def end_region( gr, data=nil )
      flush_chunk
      @group = gr
      @callback.call( Token.new( data||"", @group, :region_close ) )
    end

    # Yield the accumulated chunk (if any) to the callback as a single token,
    # then reset the chunk to the empty string.
    def flush_chunk
      @callback.call( Token.new( @chunk, @group ) ) unless @chunk.empty?
      @chunk = ""
    end

    # Delegate tokenization of the given text to the tokenizer registered for
    # the named syntax, forwarding the current options and callback.
    def subtokenize( syntax, text )
      tokenizer = Syntax.load( syntax )
      tokenizer.set @options if @options
      flush_chunk
      tokenizer.tokenize( text, &@callback )
    end

  end

end