From b888799798086e1d598819d0736f9ff7f5a67a19 Mon Sep 17 00:00:00 2001 From: Alexey Verkhovsky Date: Sat, 5 Feb 2005 16:46:26 +0000 Subject: [PATCH] Better URL parsing (URIChunk knows more country codes, and is smarter about avoiding messing with Textile markup) --- app/models/chunks/uri.rb | 23 ++++++++++++++++----- test/unit/uri_test.rb | 43 +++++++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/app/models/chunks/uri.rb b/app/models/chunks/uri.rb index 8dd95078..6b4894f5 100644 --- a/app/models/chunks/uri.rb +++ b/app/models/chunks/uri.rb @@ -15,15 +15,26 @@ require 'chunks/chunk' # I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes. # The generic names are from www.bnoack.com/data/countrycode2.html) # [iso3166]: http://geotags.com/iso3166/ + class URIChunk < Chunk::Abstract include URI::REGEXP::PATTERN # this condition is to get rid of pesky warnings in tests unless defined? URIChunk::INTERNET_URI_REGEXP - GENERIC = '(?:aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org)' - COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)' - + GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org' + + COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' + + 'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' + + 'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' + + 'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' + + 'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' + + 'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' + + 'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' + + 'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' + + 'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' + + 'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' + + 'ws|ye|yt|yu|za|zm|zr|zw' # These are needed otherwise HOST will match almost anything TLDS = "(?:#{GENERIC}|#{COUNTRY})" @@ -56,9 +67,11 @@ class URIChunk < Chunk::Abstract "(?::(#{PORT}))?" + # Optional :port (\4) "(#{ABS_PATH})?" + # Optional absolute path (\5) "(?:\\?(#{QUERY}))?" + # Optional ?query (\6) - "(?:\\#(#{FRAGMENT}))?" # Optional #fragment (\7) + "(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7) + '(?=\.?(?:\s|\)|\z))' # ends only with + # optional dot + space or ")" or end of string - TEXTILE_SYNTAX_PREFIX = '(!)?' + TEXTILE_SYNTAX_PREFIX = '(!|\"\:)?' # ! or ": INTERNET_URI_REGEXP = Regexp.new(TEXTILE_SYNTAX_PREFIX + INTERNET_URI, Regexp::EXTENDED, 'N') diff --git a/test/unit/uri_test.rb b/test/unit/uri_test.rb index b1df875f..a5ad75ad 100755 --- a/test/unit/uri_test.rb +++ b/test/unit/uri_test.rb @@ -97,6 +97,15 @@ class URITest < Test::Unit::TestCase :scheme =>'http', :host =>'www.example.com.tw', :port => '80', :path => '/~jdoe123/Help%20Me%20', :query => 'arg=val&arg2=val2', :link_text => 'http://www.example.com.tw:80/~jdoe123/Help%20Me%20?arg=val&arg2=val2') + + # from 0.9 bug reports + match(URIChunk, 'http://www2.pos.to/~tosh/ruby/rdtool/en/doc/rd-draft.html', + :scheme =>'http', :host => 'www2.pos.to', + :path => '/~tosh/ruby/rdtool/en/doc/rd-draft.html') + + match(URIChunk, 'http://support.microsoft.com/default.aspx?scid=kb;en-us;234562', + :scheme =>'http', :host => 'support.microsoft.com', :path => '/default.aspx', + :query => 'scid=kb;en-us;234562') end def test_email_uri @@ -110,10 +119,32 @@ class URITest < Test::Unit::TestCase match(URIChunk, 'Not an email: @example.com', :user => nil, :uri => 'http://example.com') end + def test_textile_image + assert_conversion_does_not_apply(URIChunk, + 'This !http://hobix.com/sample.jpg! is a Textile image link.') + end + + def test_textile_link + assert_conversion_does_not_apply(URIChunk, + 'This "hobix (hobix)":http://hobix.com/sample.jpg is a Textile link.') + # just to be sure ... + match(URIChunk, 'This http://hobix.com/sample.jpg should match', + :link_text => 'http://hobix.com/sample.jpg') + end + def test_non_uri + # "so" is a valid country code; "libproxy.so" is a valid url + match(URIChunk, 'libproxy.so', :link_text => 'libproxy.so') + assert_conversion_does_not_apply URIChunk, 'httpd.conf' - assert_conversion_does_not_apply URIChunk, 'libproxy.so' - assert_conversion_does_not_apply URIChunk, 'ld.so.conf' + assert_conversion_does_not_apply URIChunk, 'ld.so.conf' + assert_conversion_does_not_apply URIChunk, 'index.jpeg' + assert_conversion_does_not_apply URIChunk, 'index.jpg' + assert_conversion_does_not_apply URIChunk, 'file.txt' + assert_conversion_does_not_apply URIChunk, 'file.doc' + assert_conversion_does_not_apply URIChunk, 'file.pdf' + assert_conversion_does_not_apply URIChunk, 'file.png' + assert_conversion_does_not_apply URIChunk, 'file.ps' end def test_uri_in_text @@ -123,8 +154,10 @@ class URITest < Test::Unit::TestCase 'Email david@loudthinking.com', :scheme =>'mailto', :user =>'david', :host =>'loudthinking.com') # check that trailing punctuation is not included in the hostname - match(URIChunk, '"link":http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com') - end + match(URIChunk, 'Hey dude, http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com') + # this is a textile link, no match please. + assert_conversion_does_not_apply(URIChunk, '"link":http://fake.link.com.') + end def test_uri_in_parentheses match(URIChunk, 'URI (http://brackets.com.de) in brackets', :host => 'brackets.com.de') @@ -172,7 +205,7 @@ class URITest < Test::Unit::TestCase def assert_conversion_does_not_apply(chunk_type, str) processed_str = str.dup - URIChunk.apply_to(processed_str) + chunk_type.apply_to(processed_str) assert_equal(str, processed_str) end