Better URL parsing (URIChunk knows more country codes and is smarter about leaving Textile markup untouched)

This commit is contained in:
Alexey Verkhovsky 2005-02-05 16:46:26 +00:00
parent 21f7693c06
commit b888799798
2 changed files with 56 additions and 10 deletions

View file

@ -15,15 +15,26 @@ require 'chunks/chunk'
# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
# The generic names are from www.bnoack.com/data/countrycode2.html
# [iso3166]: http://geotags.com/iso3166/
class URIChunk < Chunk::Abstract
include URI::REGEXP::PATTERN
# this condition is to get rid of pesky warnings in tests
unless defined? URIChunk::INTERNET_URI_REGEXP
GENERIC = '(?:aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org)'
COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)'
GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'
COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' +
'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' +
'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' +
'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' +
'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' +
'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' +
'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' +
'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' +
'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' +
'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' +
'ws|ye|yt|yu|za|zm|zr|zw'
# These are needed otherwise HOST will match almost anything
TLDS = "(?:#{GENERIC}|#{COUNTRY})"
@ -56,9 +67,11 @@ class URIChunk < Chunk::Abstract
"(?::(#{PORT}))?" + # Optional :port (\4)
"(#{ABS_PATH})?" + # Optional absolute path (\5)
"(?:\\?(#{QUERY}))?" + # Optional ?query (\6)
"(?:\\#(#{FRAGMENT}))?" # Optional #fragment (\7)
"(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7)
'(?=\.?(?:\s|\)|\z))' # ends only with
# optional dot + space or ")" or end of string
TEXTILE_SYNTAX_PREFIX = '(!)?'
TEXTILE_SYNTAX_PREFIX = '(!|\"\:)?' # ! or ":
INTERNET_URI_REGEXP = Regexp.new(TEXTILE_SYNTAX_PREFIX + INTERNET_URI, Regexp::EXTENDED, 'N')

View file

@ -97,6 +97,15 @@ class URITest < Test::Unit::TestCase
:scheme =>'http', :host =>'www.example.com.tw', :port => '80',
:path => '/~jdoe123/Help%20Me%20', :query => 'arg=val&arg2=val2',
:link_text => 'http://www.example.com.tw:80/~jdoe123/Help%20Me%20?arg=val&arg2=val2')
# from 0.9 bug reports
match(URIChunk, 'http://www2.pos.to/~tosh/ruby/rdtool/en/doc/rd-draft.html',
:scheme =>'http', :host => 'www2.pos.to',
:path => '/~tosh/ruby/rdtool/en/doc/rd-draft.html')
match(URIChunk, 'http://support.microsoft.com/default.aspx?scid=kb;en-us;234562',
:scheme =>'http', :host => 'support.microsoft.com', :path => '/default.aspx',
:query => 'scid=kb;en-us;234562')
end
def test_email_uri
@ -110,10 +119,32 @@ class URITest < Test::Unit::TestCase
match(URIChunk, 'Not an email: @example.com', :user => nil, :uri => 'http://example.com')
end
def test_textile_image
  # A URL wrapped in exclamation marks is Textile image markup, so URIChunk
  # must leave the text exactly as it is.
  textile_image = 'This !http://hobix.com/sample.jpg! is a Textile image link.'
  assert_conversion_does_not_apply(URIChunk, textile_image)
end
def test_textile_link
  # A quoted label followed by ":" and a URL is Textile link markup, so
  # URIChunk must leave the text exactly as it is.
  textile_link = 'This "hobix (hobix)":http://hobix.com/sample.jpg is a Textile link.'
  assert_conversion_does_not_apply(URIChunk, textile_link)

  # Sanity check: the same URL without the Textile prefix is still recognised.
  plain_text = 'This http://hobix.com/sample.jpg should match'
  match(URIChunk, plain_text, :link_text => 'http://hobix.com/sample.jpg')
end
def test_non_uri
  # "so" is a valid country code; "libproxy.so" is a valid url
  match(URIChunk, 'libproxy.so', :link_text => 'libproxy.so')

  # File-like names whose final suffix is not one of the listed TLDs must be
  # left alone. 'ld.so.conf' is the interesting case: it contains the valid
  # "so" suffix, but only in the middle of the name.
  assert_conversion_does_not_apply URIChunk, 'httpd.conf'
  assert_conversion_does_not_apply URIChunk, 'ld.so.conf'
  assert_conversion_does_not_apply URIChunk, 'index.jpeg'
  assert_conversion_does_not_apply URIChunk, 'index.jpg'
  assert_conversion_does_not_apply URIChunk, 'file.txt'
  assert_conversion_does_not_apply URIChunk, 'file.doc'
  assert_conversion_does_not_apply URIChunk, 'file.pdf'
  assert_conversion_does_not_apply URIChunk, 'file.png'
  assert_conversion_does_not_apply URIChunk, 'file.ps'
end
def test_uri_in_text
@ -123,8 +154,10 @@ class URITest < Test::Unit::TestCase
'Email david@loudthinking.com',
:scheme =>'mailto', :user =>'david', :host =>'loudthinking.com')
# check that trailing punctuation is not included in the hostname
match(URIChunk, '"link":http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com')
end
match(URIChunk, 'Hey dude, http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com')
# this is a textile link, no match please.
assert_conversion_does_not_apply(URIChunk, '"link":http://fake.link.com.')
end
def test_uri_in_parentheses
match(URIChunk, 'URI (http://brackets.com.de) in brackets', :host => 'brackets.com.de')
@ -172,7 +205,7 @@ class URITest < Test::Unit::TestCase
# Asserts that applying +chunk_type+ to +str+ leaves the text unchanged.
#
# chunk_type - object responding to +apply_to(string)+, which is expected to
#              modify the given string in place when the chunk matches.
# str        - the input text; it is never modified, the conversion runs on a copy.
#
# Fails (through assert_equal) when the processed copy differs from +str+.
def assert_conversion_does_not_apply(chunk_type, str)
  # Work on a copy so the caller's string stays available for comparison.
  processed_str = str.dup
  # Apply only the chunk type under test, and only once.
  chunk_type.apply_to(processed_str)
  assert_equal(str, processed_str)
end