Streamlined URI parsing [dm1]

2005-01-18 20:31:42 +00:00 · 2005-01-18 20:31:42 +00:00 · 07e43d2dae
commit 07e43d2dae
parent 60c16e0be7
1 changed files with 29 additions and 33 deletions
--- a/app/models/chunks/uri.rb
+++ b/app/models/chunks/uri.rb
@ -26,53 +26,49 @@ class URIChunk < Chunk::Abstract
    COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)'
    # These are needed otherwise HOST will match almost anything
-    TLDS = "\\.(?:#{GENERIC}|#{COUNTRY})" 
+    TLDS = "(?:#{GENERIC}|#{COUNTRY})"
    # Redefine USERINFO so that it must have non-zero length
    USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})+"
-    # Pattern of legal URI endings to stop interference with some Textile
+    # unreserved_no_ending = alphanum | mark, but URI_ENDING [)!] excluded
-    # markup. (Images: !URI!) and other punctuation eg, (http://wiki.com/)
+    UNRESERVED_NO_ENDING = "-_.~*'(#{ALNUM}"  
-    URI_ENDING = '[)!]'
+
    # this ensures that query or fragment do not end with URI_ENDING
    # and enable us to use a much simpler self.pattern Regexp
    # uric_no_ending = reserved | unreserved_no_ending | escaped
    URIC_NO_ENDING = "(?:[#{UNRESERVED_NO_ENDING}#{RESERVED}]|#{ESCAPED})"
    # query = *uric
    QUERY = "#{URIC_NO_ENDING}*"
    # fragment = *uric
    FRAGMENT = "#{URIC_NO_ENDING}*"
    # DOMLABEL is defined in the ruby uri library, TLDS is defined above
    FULL_HOSTNAME = "(?:#{DOMLABEL}\\.)+#{TLDS}" 
    # Correct a typo bug in ruby 1.8.x lib/uri/common.rb 
    PORT = '\\d*'
    # The basic URI expression as a string
    URI_PATTERN =
-  	"(?:(#{SCHEME})://)?" +    # Optional scheme://              (\1|\8)
+        "(?:(#{SCHEME}):/{0,2})?" + # Optional scheme:        (\1)
-  	"(?:(#{USERINFO})@)?" +    # Optional userinfo@              (\2|\9)
+        "(?:(#{USERINFO})@)?" +     # Optional userinfo@      (\2)
-  	"(#{HOSTNAME}#{TLDS})" +   # Mandatory host eg, HOST.com.au  (\3|\10)
+        "(#{FULL_HOSTNAME})" +      # Mandatory hostname      (\3)
-  	"(?::(#{PORT}))?" +        # Optional :port                  (\4|\11)
+        "(?::(#{PORT}))?" +         # Optional :port          (\4)
-  	"(#{ABS_PATH})?" +         # Optional absolute path          (\5|\12)
+        "(#{ABS_PATH})?"  +         # Optional absolute path  (\5)
-  	"(?:\\?(#{QUERY}))?" +     # Optional ?query                 (\6|\13)
+        "(?:\\?(#{QUERY}))?" +      # Optional ?query         (\6)
-  	"(?:\\#(#{FRAGMENT}))?"    # Optional #fragment              (\7|\14)
+        "(?:\\#(#{FRAGMENT}))?"     # Optional #fragment      (\7)
  end
  def self.pattern()
-    # This pattern first tries to match the URI_PATTERN that ends with 
+    Regexp.new(URI_PATTERN, Regexp::EXTENDED, 'N')
    # punctuation that is a valid URI character (eg, ')', '!'). If
    # such a match occurs, there should be no backtracking (hence the ?> ). 
    # If the string cannot match a URI ending with URI_ENDING, then a second
    # attempt is tried.
    Regexp.new("(?>#{URI_PATTERN}(?=#{URI_ENDING}))|#{URI_PATTERN}", Regexp::EXTENDED, 'N')
  end
  attr_reader :uri, :scheme, :user, :host, :port, :path, :query, :fragment, :link_text
  def initialize(match_data)
    super(match_data)
-    # Since the URI_PATTERN is tried twice, there are two sets of
+    @scheme, @user, @host, @port, @path, @query, @fragment = match_data[1..-1]
    # groups, one from \1 to \7 and the second from \8 to \14.
    # The fields are set by which ever group matches.
    @scheme   	= match_data[1] || match_data[8]
    @user     	= match_data[2] || match_data[9]
    @host     	= match_data[3] || match_data[10]
    @port		= match_data[4] || match_data[11]
    @path		= match_data[5] || match_data[12]
    @query		= match_data[6] || match_data[13]
    @fragment	= match_data[7] || match_data[14]
    # If there is no scheme, add an appropriate one, otherwise
    # set the URI to the matched text.