# RegExpr/lib/regexpr.rb
# A grammar of named sub-expressions, stored as a Hash of name => definition.
# Declared up front so the RegExpr::* node classes can be nested under it.
class RegExpr < Hash; end
# Base class for the nodes of a parsed RegExpr expression.
#
# A segment wraps a single +value+ and renders it as regular-expression
# source through #to_r. The class-level macros at the bottom (+deepest+,
# +novalue+, +nooptimize+) let subclasses swap in canned implementations.
class RegExpr::Segment
  attr_accessor :value

  # Goes through the +value=+ writer so subclasses that override the writer
  # also see the initial value.
  def initialize(val) self.value = val end

  # Regexp source for this node: the string form of the wrapped value.
  def to_r() value.to_s end

  # True when no value is wrapped.
  def empty?() value.nil? end

  # Names reported by the wrapped value, flattened, without nils.
  def names() @value.names.flatten.compact end

  # Optimises the wrapped value in place and returns self.
  def optimize
    self.value = self.class.optimize(value)
    self
  end

  class << self
    # Optimises the node +v+ and returns the replacement node.
    #
    # Returns nil when +v+ is nil or optimises to something empty; a hidden
    # block holding exactly one element is replaced by that element.
    def optimize(v)
      # The original called v.optimize before its own nil guard and crashed
      # with NoMethodError on a nil value.
      return nil if v.nil?
      v = v.optimize
      v = nil if v && v.empty?
      v = v.value[0] if RegExpr::Block === v && v.hidden && v.size == 1
      v
    end

    # Macro: the class is a leaf and never contributes names.
    def deepest
      class_eval do
        def names() [] end
      end
    end

    # Macro: the class carries no value; it takes no constructor arguments,
    # is never empty, renders as '' and optimises to itself.
    def novalue
      class_eval do
        def initialize() end
        def empty?() false end
        def to_r() '' end
        def optimize() self end
      end
    end

    # Macro: the class is left untouched by optimisation.
    def nooptimize
      class_eval do
        def optimize() self end
      end
    end
  end
end
# A sequence of segments rendered as one group.
#
# A hidden block renders as a non-capturing group <tt>(?:...)</tt>; a
# non-hidden block renders as a capturing group <tt>(...)</tt> and reports
# its +name+ through #names. RegExpr::Or elements inside the value list
# separate alternatives.
class RegExpr::Block < RegExpr::Segment
  attr_accessor :name, :hidden

  # Accepts either a list of segments or a single Array of segments.
  # New blocks start out hidden.
  def initialize(*val)
    val = val[0] if ::Array === val[0] && val.size == 1
    super val
    @hidden = true
  end

  def hidden?() @hidden end

  # Non-destructive optimisation: works on a shallow copy (#optimize!
  # replaces the copy's value list rather than mutating the shared one).
  def optimize() dup.optimize! end

  def push(*v) @value.push(*v) end
  def pop() @value.pop end
  def empty?() @value.empty? end
  def size() @value.size end

  # Names of all capturing groups inside this block, in capture order.
  #
  # The block's own group opens before any of its children, and Regexp
  # numbers groups by opening parenthesis, so the own name comes first.
  # (The original appended it last, which shifted every name by one group.)
  def names
    collected = @value.collect(&:names)
    collected.unshift(name) unless hidden?
    collected.flatten.compact
  end

  # Optimises the block in place and returns self.
  #
  # Children are optimised, alternatives that consist of a single character
  # or character class are merged into one character class, and hidden
  # sub-blocks are inlined when that cannot change the meaning.
  def optimize!
    alternatives, chars = [[]], RegExpr::Chars.new('')
    @value.each do |child|
      child = self.class.optimize child
      if RegExpr::Or === child
        alternatives.push []
      elsif child
        # Children that optimised away to nil are dropped instead of being
        # kept as nil entries in the value list.
        alternatives[-1].push child
      end
    end

    # Only an alternative that is exactly one Char/Chars can be folded into
    # the shared character class. The original condition also matched any
    # alternative whose first element was a Char and threw the rest away.
    alternatives.delete_if do |alt|
      single = alt.size == 1 ? alt[0] : nil
      if RegExpr::Chars === single || RegExpr::Char === single
        chars += single
      else
        false
      end
    end
    chars = chars.optimize

    values = []
    alternatives.each do |alt|
      values.push RegExpr::Or.new
      values +=
        if alt.size == 1 && RegExpr::Block === alt[0] && alt[0].hidden
          # A lone hidden block is a whole alternative: inline its content.
          alt[0].value
        else
          alt.collect do |seg|
            if RegExpr::Block === seg && seg.hidden
              # Keep the group when it contains an alternation, otherwise
              # inlining would change what the '|' applies to. The original
              # only looked at the first element of the sub-block.
              seg.value.any? { |inner| RegExpr::Or === inner } ? seg : seg.value
            else
              seg
            end
          end.flatten
        end
    end
    values.push RegExpr::Or.new, chars if chars.size > 0
    values.shift # drop the leading separator
    @value = values
    self
  end

  # Regexp source: all children concatenated inside a (non-)capturing group.
  def to_r
    (@hidden ? '(?:%s)' : '(%s)') % @value.collect(&:to_r).join('')
  end
end
# Negation of the wrapped value.
#
# The +novalue+ macro gives this class a no-argument constructor, so the
# value has to be assigned through +value=+ before rendering.
class RegExpr::Not < RegExpr::Segment
  deepest
  novalue

  # Regexp source: a negated character class when the value is a
  # RegExpr::Chars, a negative look-ahead <tt>(?!...)</tt> otherwise.
  def to_r
    if @value.instance_of? RegExpr::Chars
      # Negate a copy: the original flipped the wrapped Chars in place, so
      # every second call rendered the un-negated class. It also rendered
      # through #to_s, which Chars does not define.
      negated = @value.dup
      negated.not!
      negated.to_r
    else
      # Segments render through #to_r; anything else keeps its string form.
      '(?!%s)' % (RegExpr::Segment === @value ? @value.to_r : @value)
    end
  end
end
# A range of non-negative integers (v1..v2, in either order), rendered as an
# alternation of digit patterns.
class RegExpr::Range < RegExpr::Segment
  novalue
  attr_accessor :v1, :v2

  def initialize(v1, v2) @v1, @v2 = v1, v2 end
  def names() [] end

  # The optimised form of the generated block.
  def optimize() value.optimize end
  def to_r() optimize.to_r end

  # Builds the RegExpr::Block matching every integer between the bounds.
  #
  # The range is cut at powers of ten into sub-ranges whose first and last
  # number have the same number of digits; each sub-range becomes one
  # alternative with one character (class) per digit position.
  # algo stolen from thomas leitner
  def value
    low, high = @v1 < @v2 ? [@v1, @v2] : [@v2, @v1]
    cuts = [low]
    low_f = low == 0 ? 1.0 : low.to_f
    high_f = high == 0 ? 1.0 : high.to_f
    1.upto(high.to_s.length - 1) do |i|
      pot = 10**i
      num = (low_f / pot).ceil * pot # next higher number with i zeros
      # Compare against the upper bound; the original used @v2, which is the
      # lower bound when the range was given in reverse order.
      cuts.insert i, num if num < high
      num = (high_f / pot).floor * pot # next lower number with i zeros
      cuts.insert(-i, num)
    end
    cuts.uniq!
    cuts.push high + 1 # +1 -> to handle it in the same way as the other elements

    result = RegExpr::Block.new
    0.upto(cuts.length - 2) do |i|
      first = cuts[i].to_s
      second = (cuts[i + 1] - 1).to_s
      result.push RegExpr::Or.new
      0.upto(first.length - 1) do |j|
        digit =
          if first[j] == second[j]
            RegExpr::Char.new first[j].chr
          else
            RegExpr::Chars.new '%c-%c' % [first[j], second[j]]
          end
        result.push digit
      end
    end
    result.value.shift # drop the leading separator
    result
  end
end
# A character class such as <tt>[a-z]</tt> or <tt>[^0-9]</tt>.
#
# The class is kept as its source text (+chars+) plus a negation flag; a
# leading '^' in an assigned value sets the flag.
class RegExpr::Chars < RegExpr::Segment
  deepest
  attr_reader :chars, :not

  def to_r() '[%s]' % value end
  def not?() @not end
  def empty?() @chars.empty? end
  def size() @chars.size end

  # Splits +val+ into negation flag and class body; returns +val+.
  def value=(val)
    @not = val[0] == ?^
    @chars = @not ? val[1..-1] : val
    val
  end

  # Class body, prefixed with '^' when negated.
  def value() (not? ? '^' : '') + @chars end

  # Toggles the negation flag.
  def not!() @not = !@not end
  alias -@ not!

  # Expands the class body into the list of character codes it names.
  # Escaped dashes (<tt>\-</tt>) count as '-', <tt>a-z</tt> style ranges are
  # expanded, every remaining byte counts as itself. The negation flag is
  # not taken into account.
  def split
    codes = []
    no_dashes = @chars.gsub(/\\-/) do |_|
      codes.push ?-.ord
      nil
    end
    no_ranges = no_dashes.gsub(/.-./) do |r|
      codes += (r[0].ord..r[2].ord).to_a
      nil
    end
    no_ranges.bytes.each { |c| codes.push c }
    codes
  end

  # Rewrites the class body in place as a sorted, duplicate-free list in
  # which runs of consecutive codes are collapsed into ranges. Returns self.
  def optimize!
    escape = lambda do |code|
      c = code.chr
      "-[]".include?(c) ? '\%c' % c : c
    end
    codes = split.sort.uniq
    @chars = ''
    return self if codes.empty?
    prev = codes.shift
    codes.each do |code|
      if prev + 1 == code
        # Open a range unless one is already open (a trailing, unescaped '-').
        unless @chars[-1] == ?- and @chars[-2] != ?\\
          @chars += escape.call(prev) + '-'
        end
      else
        @chars += escape.call(prev)
      end
      prev = code
    end
    @chars += escape.call(prev)
    self
  end

  # Non-destructive optimisation. A non-negated class that is left with a
  # single (possibly escaped) character is returned as a RegExpr::Char.
  def optimize
    n = dup.optimize!
    if (n.size == 1 or (n.size == 2 and n.value[0] == ?\\)) and not n.not?
      RegExpr::Char.new n.chars[-1]
    else
      n
    end
  end

  # Union of this class with +b+ (a RegExpr::Chars or RegExpr::Char),
  # returned as a new instance.
  #
  # Fixes over the original:
  # * a negated receiver produced the complement of the intended set
  #   (the '^' prefix was applied on top of an already complemented list,
  #   and two negated classes were merged instead of intersected);
  # * a Char was pushed as a String next to Integer codes;
  # * '-' was joined unescaped, so e.g. "z", "-", "a" was re-read as the
  #   range z-a.
  # Complements are taken within the byte range 0..255, as before.
  def +(b)
    universe = (0..255).to_a
    mine = split
    theirs, their_not =
      if b.instance_of? RegExpr::Char
        [[b.value[0].ord], false]
      else
        [b.split, b.not?]
      end

    negated = (not? && their_not) ? true : false
    codes =
      if negated then mine & theirs             # ^A | ^B == ^(A & B)
      elsif not? then universe - mine + theirs
      elsif their_not then universe - theirs + mine
      else mine + theirs
      end
    codes = codes.compact.uniq
    # ^A | ^B with disjoint A and B matches everything; an empty negated
    # class cannot be rendered, so spell the full set out instead.
    negated, codes = false, universe if negated && codes.empty?
    # A leading '^' would be read back as negation by value=; move it away
    # from the front when possible.
    # NOTE(review): a class consisting of '^' alone is still read back as an
    # empty negated class - value= has no escape for it.
    codes.push(codes.shift) if !negated && codes.size > 1 && codes[0] == ?^.ord

    body = codes.collect { |code| code == ?-.ord ? '\-' : code.chr }.join('')
    self.class.new((negated ? '^' : '') + body)
  end
end
# A quantifier applied to the wrapped segment: between +min+ and +max+
# repetitions, where nil means "no bound given".
class RegExpr::Repeat < RegExpr::Segment
  attr_reader :min, :max

  # Quantifiers that have a shorter spelling. Hoisted out of #to_r, which
  # used to rebuild this table on every call.
  SHORTHAND = {
    '{,1}' => '?', '{0,1}' => '?',
    '{0,}' => '*', '{,}' => '*',
    '{1,}' => '+',
    '{1,1}' => ''
  }.freeze

  # +min+ and +max+ may be integers, numeric strings, or nil / '' for
  # "unbounded". +max+ defaults to +min+ (an exact count).
  def initialize(value, min = 1, max = min)
    super value
    @min, @max = minandmax(min), minandmax(max)
  end

  # Normalises one bound: nil and '' become nil, anything else goes
  # through #to_i.
  def minandmax(x)
    case x
    when nil, '' then nil
    else x.to_i
    end
  end

  # Optimises the wrapped segment in place; an exactly-once repetition is
  # replaced by the wrapped segment itself.
  def optimize
    super
    (min == 1 and max == 1) ? @value : self
  end

  # Regexp source: the wrapped segment followed by its quantifier.
  # A {0,0} repetition renders as nothing at all.
  def to_r
    quantifier = '{%s,%s}' % [@min || '', @max || '']
    return '' if '{0,0}' == quantifier
    @value.to_r + (SHORTHAND[quantifier] || quantifier)
  end
end
# A single literal character.
class RegExpr::Char < RegExpr::Segment
  deepest
  nooptimize

  # Regexp source: the character, quoted so it is matched literally.
  def to_r() ::Regexp.quote @value end
  def size() 1 end

  # Builds one Char per character of +x+. A one-character string yields the
  # Char itself; any other length yields a RegExpr::Block holding the Chars
  # in order (an empty block for an empty string).
  def self.new(x)
    chars = x.split('').collect { |c| super c }
    chars.size == 1 ? chars[0] : RegExpr::Block.new(chars)
  end
end
# A ready-made value embedded verbatim into the expression; it renders as
# the string form of whatever it wraps.
class RegExpr::Regexp < RegExpr::Segment
  deepest
  nooptimize

  def to_r
    @value.to_s
  end
end
# Separator between alternatives; renders as '|'.
class RegExpr::Or < RegExpr::Segment
  deepest
  novalue

  def to_r
    '|'
  end

  def to_s
    '|'
  end
end
# End anchor; renders as '$'.
class RegExpr::End < RegExpr::Segment
  deepest
  novalue

  def to_r
    '$'
  end

  def to_s
    '$'
  end
end
# Begin anchor; renders as '^'.
class RegExpr::Begin < RegExpr::Segment
  deepest
  novalue

  def to_r
    '^'
  end

  def to_s
    '^'
  end
end
# A piece of regexp source that is passed through unchanged: both #to_r and
# #to_s return the wrapped value as is.
class RegExpr::WildCard < RegExpr::Segment
  deepest
  nooptimize

  def to_r
    @value
  end

  def to_s
    @value
  end
end
class RegExpr
class <<self
STDEXP= Hash[
'loalpha' => '[a-z]',
'hialpha' => '[A-Z]',
'alpha' => 'loalpha | hialpha',
'digit' => '[0-9]',
'alphadigit' => 'alpha | digit',
'hexdigit' => 'digit | [a-fA-F]',
'octdigit' => '[0-7]',
'bindigit' => '[01]',
'space' => '[ \t\n\r\v]'
]
def [] *vals
ret= super *vals
STDEXP.each {|k, v| ret[ k]||= v }
2010-03-18 13:43:32 +01:00
ret
end
def new *vals
ret= super *vals
STDEXP.each {|k, v| ret[ k]||= v }
2010-03-18 13:43:32 +01:00
ret
end
end
def to_r exp= :main
r = self.to_re( exp)
#r.optimize!
h, r = r.hidden?, r.to_r
2010-03-18 13:43:32 +01:00
r = r[ 1...-1] unless h
::Regexp.new r
2010-03-18 13:43:32 +01:00
end
def to_re exp= :main
u= RegExpr::Block.new
t, u.hidden= if Symbol === exp
u.name= exp.to_sym
2010-03-18 13:43:32 +01:00
if self[ exp]
[ self[ exp], false]
else [ self[ exp.to_s], true]
2010-03-18 13:43:32 +01:00
end
else [ exp.to_s, true]
2010-03-18 13:43:32 +01:00
end
until !t or t.empty?
v, t= self.to_r_next t
2010-03-18 13:43:32 +01:00
case v
when ')' then return u, t
when RegExpr::Repeat then v.value= u.pop
2010-03-18 13:43:32 +01:00
end
u.push v
2010-03-18 13:43:32 +01:00
end
u
end
def to_r_next exp
exp.strip!
/^/ =~ exp[ 1.. -1]
2010-03-18 13:43:32 +01:00
t= case exp[ 0]
when ?^ then return RegExpr::Begin.new, exp[ 1.. -1]
when ?$ then return RegExpr::End.new, exp[ 1.. -1]
2010-03-18 13:43:32 +01:00
when ?\\
h= case exp[ 1]
when ?D, ?S, ?W, ?a, ?d.. ?f, ?n, ?r.. ?t, ?v, ?w
return RegExpr::WildCard.new( '\%c'% exp[ 1]), exp[ 2.. -1]
2010-03-18 13:43:32 +01:00
when ?x then 16
when ?o then 8
when ?b then 2
when ?0.. ?9
exp= 'XX'+ exp[ 1.. -1]
10
else raise ArgumentError, 'Unknown form "%s"'% exp
2010-03-18 13:43:32 +01:00
end
i= exp[ 2.. -1].to_i h
return RegExpr::Char.new( i.chr), exp[ (i.to_s( h). size+ 2).. -1]
2010-03-18 13:43:32 +01:00
when ?. then return RegExpr::WildCard.new( '.'), exp[ 1.. -1]
2010-03-18 13:43:32 +01:00
when ?0
case exp[ 1]
when ?x then %r<^0x([0-9a-f]+)>i.match exp
return '', $1.to_i( 16).to_s+ $'
when ?o then %r<^0o([0-8]+)>.match exp
return '', $1.to_i( 8).to_s+ $'
when ?b then %r<^0b([01]+)>.match exp
return '', $1.to_i( 2).to_s+ $'
2010-03-18 13:43:32 +01:00
else
case exp
when %r<(\d+)..(\d+)> then RegExpr::Range.new $1.to_i, $2.to_i
when %r<^(\d+,\d+|,\d+|\d+,?)> then RegExpr::Repeat.new '', *$1.split( ',')
else raise ArgumentError, 'Unknown form "%s"'% exp
2010-03-18 13:43:32 +01:00
end
end
when ?( then return self.to_re( exp[ 1.. -1])
2010-03-18 13:43:32 +01:00
when ?) then ')'
when ?| then RegExpr::Or.new
2010-03-18 13:43:32 +01:00
when ?+ then RegExpr::Repeat.new '', 1, nil
when ?* then RegExpr::Repeat.new '', nil
when ?? then RegExpr::Repeat.new '', 0, 1
2010-03-18 13:43:32 +01:00
when ?" then RegExpr::Char.new %r<^"((?:[^"]|\\")*)">.match( exp)[ 1]
when ?[ then RegExpr::Chars.new %r<^\[((?:[^\]]|\\\])*[^\\]|)\]>.match( exp)[ 1]
2010-03-18 13:43:32 +01:00
when ?/ then exp =~ %r<^/((?:[^/]|\\/)*)/(im?|mi)?>
RegExpr::Regexp.new ::Regexp.new( $1,
2010-03-18 13:43:32 +01:00
($2 =~ /i/ ? ::Regexp::IGNORECASE : 0)+
($2 =~ /m/ ? ::Regexp::MULTILINE : 0))
else
case exp
when %r<^([a-z_][a-z_0-9]*\b)>i then self.to_re $1.to_sym
when %r<(\d+)..(\d+)> then RegExpr::Range.new $1.to_i, $2.to_i
when %r<^(\d+,\d+|,\d+|\d+,?)> then RegExpr::Repeat.new '', *$1.split( ',')
else raise ArgumentError, 'Unknown form "%s"'% exp
2010-03-18 13:43:32 +01:00
end
end
[ t, $' ]
end
def def cl= Class.new, *exp
exp= [ :main ] if exp.empty?
exp.each do |e|
re= self.to_re e
names= re.names.collect('@%s'.method(:%)).join ', '
re= ::Regexp.new '^%s$'% re.to_r
2010-03-18 13:43:32 +01:00
ev= <<-EOF
def #{e}= val
m= #{re.inspect}. match val
raise ArgumentError, 'Unallowed Chars! (%s =~ #{re.inspect})'% val. inspect unless m
2010-03-18 13:43:32 +01:00
#{names}= *m[ 1.. -1]
end
EOF
cl.class_eval ev
2010-03-18 13:43:32 +01:00
end
cl
end
def match( m, exp= :main) to_r( exp).match m end
def =~( x) to_r =~ x end
2010-03-18 13:43:32 +01:00
end