RegExpr/lib/regexpr.rb

459 lines
9.9 KiB
Ruby
Raw Normal View History

2010-03-18 13:43:32 +01:00
# A RegExpr is a Hash that maps expression names to expression source text.
# The parsing and conversion methods are added where the class is reopened
# further down in this file.
class RegExpr < Hash
end
# Base class of every node in a parsed expression tree.
#
# A segment wraps a single +value+ and knows how to render itself as regular
# expression source (+to_r+), report the capture names below it (+names+) and
# produce a simplified version of itself (+optimize+).
class RegExpr::Segment
  attr_accessor :value

  # Stores +val+ through the (possibly overridden) +value=+ writer.
  def initialize(val)
    self.value = val
  end

  # Regular expression source for this segment.
  def to_r
    value.to_s
  end

  # A plain segment is empty when it wraps nothing.
  def empty?
    value.nil?
  end

  # Capture names found in the wrapped value.
  def names
    @value.names.flatten.compact
  end

  # Replaces the wrapped value with its optimized form and returns self.
  def optimize
    self.value = self.class.optimize(value)
    self
  end

  # Optimizes an arbitrary segment +v+.
  #
  # Returns nil when the optimized segment is empty, and unwraps a hidden
  # RegExpr::Block that holds exactly one element.
  def self.optimize(v)
    v = v.optimize
    v = nil if v && v.empty?
    unwrap = v.instance_of?(RegExpr::Block) && v.hidden && v.size == 1
    unwrap ? v.value[0] : v
  end

  # Class macro: the receiver is a leaf and never contributes capture names.
  def self.deepest
    class_eval do
      def names
        []
      end
    end
  end

  # Class macro: the receiver takes no constructor argument, is never empty,
  # renders as an empty string and is already optimal.
  def self.novalue
    class_eval do
      def initialize
      end

      def empty?
        false
      end

      def to_r
        ''
      end

      def optimize
        self
      end
    end
  end

  # Class macro: optimizing an instance of the receiver returns it unchanged.
  def self.nooptimize
    class_eval do
      def optimize
        self
      end
    end
  end
end
# A sequence of segments, rendered as a group.
#
# The wrapped value is an Array of segments; RegExpr::Or elements separate
# alternatives. A hidden block renders as a non-capturing group "(?:...)",
# a visible one as a capturing group "(...)" whose capture is called +name+.
class RegExpr::Block < RegExpr::Segment
  attr_accessor :name, :hidden

  # Accepts either a list of segments or a single Array of segments.
  # New blocks are hidden until +hidden=+ says otherwise.
  def initialize *val
    val = val[0] if val.size == 1 && val[0].instance_of?(Array)
    super val
    @hidden = true
  end

  def hidden?() @hidden end

  # Non-destructive variant of +optimize!+ (works on a shallow copy).
  def optimize() self.dup.optimize! end

  def push(*v) @value.push(*v) end
  def pop() @value.pop end
  def empty?() @value.empty? end
  def size() @value.size end

  # Capture names in the order the capture groups are numbered by the regexp
  # engine: an opening parenthesis is counted when it is met, so this block's
  # own name has to come BEFORE the names of the groups nested inside it.
  def names
    own = self.hidden? ? [] : [name]
    (own + @value.collect { |v| v.names }).flatten.compact
  end

  # Simplifies the block in place and returns self:
  #
  # * every element is optimized, elements that optimize to nothing are dropped;
  # * alternatives consisting of one single character (class) are merged into
  #   one character class, which is appended as the last alternative;
  # * hidden sub-blocks are inlined where that does not change the meaning.
  def optimize!
    alternatives, chars = [[]], RegExpr::Chars.new('')
    @value.each do |v|
      v = self.class.optimize v
      if v.instance_of? RegExpr::Or
        alternatives.push []
      elsif !v.nil?
        # Segment.optimize returns nil for empty segments; keeping the nil
        # would blow up later in #to_r.
        alternatives[-1].push v
      end
    end

    # Pull single-character alternatives out of the list and collect them.
    alternatives.delete_if do |alt|
      single = alt.size == 1 &&
        (alt[0].instance_of?(RegExpr::Chars) || alt[0].instance_of?(RegExpr::Char))
      chars += alt[0] if single
      single
    end
    chars = chars.optimize

    values = []
    alternatives.each do |alt|
      values.push RegExpr::Or.new
      if alt.size == 1 && alt[0].instance_of?(RegExpr::Block) && alt[0].hidden
        # The alternative IS the hidden group: its content (including any
        # nested alternatives) can simply join this block's alternatives.
        values += alt[0].value
      else
        values += alt.collect do |w|
          if w.instance_of?(RegExpr::Block) && w.hidden
            # A hidden group inside a longer sequence may only be inlined when
            # it has no alternatives of its own; otherwise "x(?:a|b)y" would
            # turn into "xa|by".
            contains_or = w.value.any? { |i| i.instance_of?(RegExpr::Or) }
            contains_or ? w : w.value
          else
            w
          end
        end.flatten
      end
    end
    values.push RegExpr::Or.new, chars if chars.size > 0
    values.shift # drop the separator pushed in front of the first alternative
    @value = values
    self
  end

  # Regular expression source of the group.
  def to_r()
    (@hidden ? '(?:%s)' : '(%s)') % @value.collect { |i| i.to_r }.join('')
  end
end
# Negation of a segment.
#
# NOTE(review): +novalue+ installs an argument-less constructor, so @value is
# only set when a caller assigns it through +value=+ - confirm intended usage.
class RegExpr::Not < RegExpr::Segment
  deepest
  novalue

  # A character class is negated in place and rendered via its +to_s+;
  # anything else is wrapped in a negative look-ahead.
  def to_r
    return '(?!%s)' % @value unless @value.instance_of?(RegExpr::Chars)

    @value.not!
    @value.to_s
  end
end
# A range of non-negative integers (written "v1..v2" in an expression),
# rendered as alternatives of digit patterns that match exactly the decimal
# numbers inside the range.
class RegExpr::Range < RegExpr::Segment
  novalue
  attr_accessor :v1, :v2

  # The bounds may be given in either order.
  def initialize(v1, v2) @v1, @v2 = v1, v2 end

  def names() [] end
  def optimize() self. value. optimize end
  def to_r() self. optimize. to_r end

  # Builds the RegExpr::Block describing the range.
  #
  # The range is cut at "round" numbers so that every piece covers numbers of
  # equal length whose digits vary independently, e.g. 5..123 becomes
  # 5-9 | 10-99 | 100-119 | 120-123. Each piece is then emitted digit by digit.
  #
  # algo stolen from thomas leitner
  def value
    a, b = @v1 < @v2 ? [@v1, @v2] : [@v2, @v1]
    arr = [a]
    af = a == 0 ? 1.0 : a.to_f
    bf = b == 0 ? 1.0 : b.to_f
    1.upto(b.to_s.length - 1) do |i|
      pot = 10**i
      num = (af / pot).ceil * pot # next higher number with i zeros
      # Compare against the sorted upper bound, not @v2: with reversed bounds
      # @v2 is the lower one and no cut point would ever be inserted.
      arr.insert i, num if num < b
      num = (bf / pot).floor * pot # next lower number with i zeros
      # A cut point at or below the start would produce a piece running
      # backwards (e.g. 12..15 would be cut at 10).
      arr.insert(-i, num) if num > a
    end
    arr.uniq!
    arr.push b + 1 # +1 -> to handle it in the same way as the other elements

    result = RegExpr::Block.new
    0.upto(arr.length - 2) do |i|
      first = arr[i].to_s
      second = (arr[i + 1] - 1).to_s
      result.push RegExpr::Or.new
      0.upto(first.length - 1) do |j|
        result.push(if first[j] == second[j]
          RegExpr::Char.new first[j].chr
        else
          RegExpr::Chars.new '%c-%c' % [first[j], second[j]]
        end)
      end
    end
    result.value.shift # drop the separator in front of the first piece
    result
  end
end
# A bracket expression ("[...]").
#
# The content is kept as regexp source in @chars (without the brackets and
# without a leading "^"); @not tells whether the class is negated.
#
# NOTE(review): inside the brackets only single characters, "a-z" ranges,
# backslash-escaped literals and the control escapes listed in ESCAPES are
# understood. Shorthands such as \d or \w are treated as the literal letter.
class RegExpr::Chars < RegExpr::Segment
  deepest
  attr_reader :chars, :not

  # Control characters that are written as a backslash escape.
  ESCAPES = { 'a' => 7, 't' => 9, 'n' => 10, 'v' => 11, 'f' => 12, 'r' => 13, 'e' => 27 }.freeze
  # Characters that need a backslash to be taken literally inside brackets.
  SPECIAL = '-[]\\^'.freeze

  def to_r() '[%s]' % self.value end
  def not?() @not end
  def empty?() @chars.empty? end

  # Length of the source text between the brackets (not the number of
  # characters the class matches).
  def size() @chars.size end

  # Accepts bracket content; a leading "^" marks the class as negated.
  def value=(val)
    @not = val[0] == '^'
    @chars = @not ? val[1..-1] : val
  end

  def value() (self.not? ? '^' : '') + @chars end

  # Toggles the negation in place and returns the new state.
  def not!() @not = !@not end
  alias -@ not!

  # Code points of all characters listed in the class (negation is ignored).
  # Ranges are expanded and escapes are resolved, so the result is always an
  # Array of Integers.
  def split
    tokens = @chars.scan(/\\.|./m).collect do |t|
      if t.size == 2 # backslash escape
        [ESCAPES[t[1]] || t[1].ord, true]
      else
        [t.ord, false]
      end
    end
    codes = []
    i = 0
    while i < tokens.size
      low, dash, high = tokens[i], tokens[i + 1], tokens[i + 2]
      # Only an UNESCAPED "-" with a character on both sides forms a range.
      if high && dash == [45, false]
        codes.concat((low[0]..high[0]).to_a)
        i += 3
      else
        codes.push low[0]
        i += 1
      end
    end
    codes
  end

  # Rewrites the content in place as a sorted, duplicate-free list in which
  # runs of consecutive characters are written as ranges. Returns self.
  def optimize!
    @chars = encode(self.split)
    self
  end

  # Optimized copy. A non-negated class that matches exactly one character is
  # returned as a RegExpr::Char.
  def optimize
    n = self.dup.optimize!
    codes = n.split
    if codes.size == 1 && !n.not?
      RegExpr::Char.new [codes[0]].pack('U')
    else
      n
    end
  end

  # Union of this class with +b+ (a RegExpr::Chars or RegExpr::Char), i.e. a
  # class matching whatever "self | b" matches. Returns a new RegExpr::Chars.
  def +(b)
    mine = self.split
    if b.instance_of? RegExpr::Char
      theirs, theirs_negated = [b.value[0].ord], false
    else
      theirs, theirs_negated = b.split, b.not?
    end

    # With A, B the listed characters:
    #   A | B    -> [AB]        ^A | ^B -> [^(A and B)]
    #   ^A | B   -> [^(A - B)]  A | ^B  -> [^(B - A)]
    negated = self.not? || theirs_negated
    codes = if !negated then mine | theirs
            elsif self.not? && theirs_negated then mine & theirs
            elsif self.not? then mine - theirs
            else theirs - mine
            end
    if negated && codes.empty?
      # "[^]" cannot be written; fall back to an explicit list of the code
      # points 0..255.
      negated, codes = false, (0..255).to_a
    end
    self.class.new((negated ? '^' : '') + encode(codes))
  end

  private

  # Bracket-expression source for the given code points.
  def encode(codes)
    runs = []
    codes.sort.uniq.each do |code|
      if runs.last && runs.last[1] + 1 == code
        runs.last[1] = code
      else
        runs.push [code, code]
      end
    end
    runs.collect do |low, high|
      if low == high
        escape_code(low)
      else
        # Two neighbours are simply listed, longer runs become a range.
        escape_code(low) + (high - low > 1 ? '-' : '') + escape_code(high)
      end
    end.join('')
  end

  # Source text for one code point, escaped where necessary.
  def escape_code(code)
    name = ESCAPES.key(code)
    return '\\' + name if name
    char = [code].pack('U')
    SPECIAL.include?(char) ? '\\' + char : char
  end
end
# A quantifier applied to the wrapped segment.
#
# +min+ and +max+ are Integers, or nil when the respective bound is open.
class RegExpr::Repeat < RegExpr::Segment
  attr_reader :min, :max

  # +min+ and +max+ may be Integers, numeric Strings, nil or '' (nil and ''
  # both mean "no bound"). +max+ defaults to +min+.
  def initialize value, min= 1, max= min
    super value
    @min = self.minandmax(min)
    @max = self.minandmax(max)
  end

  # Normalizes a bound: nil and '' become nil, everything else an Integer.
  def minandmax x
    case x
    when nil, ''
      nil
    else
      x.to_i
    end
  end

  # Optimizes the wrapped segment; a repetition of exactly one is replaced
  # by the wrapped segment itself.
  def optimize
    super
    return @value if min == 1 && max == 1

    self
  end

  # Source of the wrapped segment followed by the shortest spelling of the
  # quantifier.
  def to_r
    quantifier = '{%s,%s}' % [@min || '', @max || '']
    shorthand = {
      '{,1}' => '?', '{0,1}' => '?',
      '{0,}' => '*', '{,}' => '*',
      '{1,}' => '+',
      '{1,1}' => ''
    }
    @value.to_r + (shorthand[quantifier] || quantifier)
  end
end
# A single literal character.
class RegExpr::Char < RegExpr::Segment
  deepest
  nooptimize

  # Builds one Char per character of +x+. A one-character string yields the
  # Char itself, anything else a RegExpr::Block holding the Chars in order.
  def self.new x
    pieces = x.split('').collect { |ch| super ch }
    return pieces[0] if pieces.size == 1

    RegExpr::Block.new(pieces)
  end

  # The character, quoted so that it is matched literally.
  def to_r
    ::Regexp.quote @value
  end

  def size
    1
  end
end
# A segment whose wrapped value is rendered through its own +to_s+ and is
# never optimized.
class RegExpr::Regexp < RegExpr::Segment
  deepest
  nooptimize

  def to_r
    @value.to_s
  end
end
# Separator between two alternatives ("|").
class RegExpr::Or < RegExpr::Segment
  deepest
  novalue

  def to_r
    '|'
  end

  def to_s
    '|'
  end
end
# The "$" anchor.
class RegExpr::End < RegExpr::Segment
  deepest
  novalue

  def to_r
    '$'
  end

  def to_s
    '$'
  end
end
# The "^" anchor.
class RegExpr::Begin < RegExpr::Segment
  deepest
  novalue

  def to_r
    '^'
  end

  def to_s
    '^'
  end
end
# A segment whose wrapped value is emitted verbatim as regexp source
# and is never optimized.
class RegExpr::WildCard < RegExpr::Segment
  deepest
  nooptimize

  def to_r
    @value
  end

  def to_s
    @value
  end
end
# Parser and converter of the expression language.
#
# A RegExpr maps names to expression source. Symbol keys become capturing
# groups when they are referenced, String keys become non-capturing groups.
# The predefined expressions in STDEXP are added to every new instance unless
# the caller supplies an expression of the same name.
class RegExpr
  class <<self
    STDEXP= Hash[
      'loalpha' => '[a-z]',
      'hialpha' => '[A-Z]',
      'alpha' => 'loalpha | hialpha',
      'digit' => '[0-9]',
      'alphadigit' => 'alpha | digit',
      'hexdigit' => 'digit | [a-fA-F]',
      'octdigit' => '[0-7]',
      'bindigit' => '[01]',
      'space' => '[ \t\n\r\v]'
    ]

    # Like Hash.[], plus the predefined expressions.
    def [] *vals
      ret = super(*vals)
      STDEXP.each { |k, v| ret[k] ||= v }
      ret
    end

    # Like Hash.new, plus the predefined expressions.
    def new *vals
      ret = super(*vals)
      STDEXP.each { |k, v| ret[k] ||= v }
      ret
    end
  end

  # Number literals in an expression: prefix letter => [base, pattern].
  NUMBER_PREFIXES = {
    'x' => [16, %r<^0x([0-9a-f]+)>i],
    'o' => [8, %r<^0o([0-7]+)>],
    'b' => [2, %r<^0b([01]+)>]
  }
  # Digits accepted after a "\x", "\o", "\b" or "\<digit>" escape, by base.
  ESCAPE_DIGITS = {
    16 => /\A[0-9a-f]+/i,
    10 => /\A[0-9]+/,
    8 => /\A[0-7]+/,
    2 => /\A[01]+/
  }

  # Compiles expression +exp+ (a name or expression source) to a ::Regexp.
  # The capturing parentheses of a named top-level expression are removed.
  def to_r exp= :main
    block = self.to_re(exp).optimize
    source = block.to_r
    source = source[1...-1] unless block.hidden?
    ::Regexp.new source
  end

  # Parses +exp+ into a RegExpr::Block.
  #
  # A Symbol is looked up in self: a Symbol key yields a visible (capturing)
  # block, the String key of the same name a hidden one. Anything else is
  # taken as expression source.
  #
  # When the source contains a closing parenthesis, parsing stops there and
  # [block, remaining source] is returned instead of the block alone; this is
  # how #to_r_next parses parenthesized groups.
  def to_re exp= :main
    u = RegExpr::Block.new
    if exp.instance_of? Symbol
      u.name = exp.to_sym
      t, u.hidden = self[exp] ? [self[exp], false] : [self[exp.to_s], true]
    else
      t, u.hidden = exp.to_s, true
    end
    # Stripping here (not only in #to_r_next) keeps trailing white space from
    # being parsed as one more, empty, token.
    until !t || (t = t.strip).empty?
      v, t = self.to_r_next t
      case v
      when ')' then return u, t
      when RegExpr::Repeat then v.value = u.pop # quantifiers bind to the previous segment
      end
      u.push v
    end
    u
  end

  # Reads one token from the beginning of +exp+.
  #
  # Returns [segment, remaining source]; the segment is the String ')' for a
  # closing parenthesis. Raises ArgumentError for source it cannot parse.
  # +exp+ itself is not modified.
  def to_r_next exp
    exp = exp.strip
    rest = exp[1..-1]
    case exp[0]
    when '^' then [RegExpr::Begin.new, rest]
    when '$' then [RegExpr::End.new, rest]
    when '\\' then escape_form exp
    when '.' then [RegExpr::WildCard.new('.'), rest]
    when '0'
      base, pattern = NUMBER_PREFIXES[exp[1]]
      return numeric_form(exp) unless base
      m = delimited exp, pattern
      # Rewrite the literal as a decimal number and parse that instead.
      to_r_next m[1].to_i(base).to_s + m.post_match
    when '(' then self.to_re rest
    when ')' then [')', rest]
    when '|' then [RegExpr::Or.new, rest]
    when '+' then [RegExpr::Repeat.new('', 1, nil), rest]
    when '*' then [RegExpr::Repeat.new('', nil), rest]
    when '?' then [RegExpr::Repeat.new('', 0, 1), rest]
    when '"'
      m = delimited exp, %r<^"((?:[^"]|\\")*)">
      [RegExpr::Char.new(m[1]), m.post_match]
    when '['
      m = delimited exp, %r<^\[((?:[^\]]|\\\])*[^\\]|)\]>
      [RegExpr::Chars.new(m[1]), m.post_match]
    when '/'
      m = delimited exp, %r<^/((?:[^/]|\\/)*)/(im?|mi)?>
      # Keep the match in a local: testing the flags with =~ would replace $~
      # and with it the remaining source.
      flags = m[2].to_s
      options = (flags.include?('i') ? ::Regexp::IGNORECASE : 0) +
                (flags.include?('m') ? ::Regexp::MULTILINE : 0)
      [RegExpr::Regexp.new(::Regexp.new(m[1], options)), m.post_match]
    else
      m = %r<^([a-z_][a-z_0-9]*\b)>i.match exp
      m ? [self.to_re(m[1].to_sym), m.post_match] : numeric_form(exp)
    end
  end

  # Defines one writer per expression in +exp+ (default: :main) on class +cl+
  # (default: a new anonymous class) and returns the class.
  #
  # The writer "<name>=" matches its argument against the whole expression,
  # raises ArgumentError when it does not match and otherwise stores every
  # capture in the instance variable named after it.
  #
  # NOTE(review): the generated pattern is anchored with ^ and $, which match
  # at line boundaries; multi-line input is therefore only checked per line.
  def def cl= Class.new, *exp
    exp = [:main] if exp.empty?
    exp.each do |e|
      re = self.to_re e
      names = re.names.collect { |n| '@%s' % n }.join ', '
      re = ::Regexp.new '^%s$' % re.optimize.to_r
      ev = <<-EOF
def #{e}= val
m= #{re. inspect}. match val
raise ArgumentError, 'Unallowed Chars! (%s =~ #{re. inspect})'% val. inspect unless m
#{names}= *m[ 1.. -1]
end
EOF
      cl.class_eval ev
    end
    cl
  end

  # Matches +m+ against expression +exp+; returns the MatchData or nil.
  def match( m, exp= :main) self.to_r(exp).match m end

  private

  # Parses a backslash escape at the beginning of +exp+: either a shorthand
  # that is passed through to the regexp, or a character code in hexadecimal
  # (\x41), octal (\o101), binary (\b1000001) or decimal (\65) notation.
  def escape_form exp
    base = case exp[1]
      when 'D', 'S', 'W', 'a', 'd'..'f', 'n', 'r'..'t', 'v', 'w'
        return [RegExpr::WildCard.new('\%c' % exp[1]), exp[2..-1]]
      when 'x' then 16
      when 'o' then 8
      when 'b' then 2
      when '0'..'9' then 10
      else Kernel.raise ArgumentError, 'Unknown form "%s"' % exp
      end
    start = base == 10 ? 1 : 2 # decimal codes have no prefix letter
    # Measure the digits actually present instead of re-formatting the parsed
    # number: "0A".to_i(16).to_s(16) is "a", one character short.
    digits = exp[start..-1][ESCAPE_DIGITS[base]]
    Kernel.raise ArgumentError, 'Unknown form "%s"' % exp unless digits
    [RegExpr::Char.new(digits.to_i(base).chr), exp[(start + digits.size)..-1]]
  end

  # Parses a number range ("1..31") or a repetition count ("3", "3,", ",5",
  # "3,5") at the beginning of +exp+.
  def numeric_form exp
    if (m = %r<^(\d+)\.\.(\d+)>.match exp)
      [RegExpr::Range.new(m[1].to_i, m[2].to_i), m.post_match]
    elsif (m = %r<^(\d+,\d+|,\d+|\d+,?)>.match exp)
      # limit -1 keeps the empty field of "3," so that it means "3 or more".
      [RegExpr::Repeat.new('', *m[1].split(',', -1)), m.post_match]
    else
      Kernel.raise ArgumentError, 'Unknown form "%s"' % exp
    end
  end

  # Matches +pattern+ against +exp+ and returns the MatchData; raises
  # ArgumentError when the token is malformed (e.g. not terminated).
  def delimited exp, pattern
    m = pattern.match exp
    Kernel.raise ArgumentError, 'Unknown form "%s"' % exp unless m
    m
  end
end