Better URL parsing (URIChunk knows more country codes and is smarter about leaving Textile markup alone)
This commit is contained in:
parent
21f7693c06
commit
b888799798
2 changed files with 56 additions and 10 deletions
|
@ -15,15 +15,26 @@ require 'chunks/chunk'
|
|||
# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
|
||||
# The generic names are from www.bnoack.com/data/countrycode2.html
|
||||
# [iso3166]: http://geotags.com/iso3166/
|
||||
|
||||
class URIChunk < Chunk::Abstract
|
||||
include URI::REGEXP::PATTERN
|
||||
|
||||
# this condition is to get rid of pesky warnings in tests
|
||||
unless defined? URIChunk::INTERNET_URI_REGEXP
|
||||
|
||||
GENERIC = '(?:aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org)'
|
||||
COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)'
|
||||
|
||||
GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'
|
||||
|
||||
COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' +
|
||||
'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' +
|
||||
'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' +
|
||||
'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' +
|
||||
'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' +
|
||||
'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' +
|
||||
'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' +
|
||||
'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' +
|
||||
'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' +
|
||||
'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' +
|
||||
'ws|ye|yt|yu|za|zm|zr|zw'
|
||||
# These are needed otherwise HOST will match almost anything
|
||||
TLDS = "(?:#{GENERIC}|#{COUNTRY})"
|
||||
|
||||
|
@ -56,9 +67,11 @@ class URIChunk < Chunk::Abstract
|
|||
"(?::(#{PORT}))?" + # Optional :port (\4)
|
||||
"(#{ABS_PATH})?" + # Optional absolute path (\5)
|
||||
"(?:\\?(#{QUERY}))?" + # Optional ?query (\6)
|
||||
"(?:\\#(#{FRAGMENT}))?" # Optional #fragment (\7)
|
||||
"(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7)
|
||||
'(?=\.?(?:\s|\)|\z))' # ends only with
|
||||
# optional dot + space or ")" or end of string
|
||||
|
||||
TEXTILE_SYNTAX_PREFIX = '(!)?'
|
||||
TEXTILE_SYNTAX_PREFIX = '(!|\"\:)?' # ! or ":
|
||||
|
||||
INTERNET_URI_REGEXP = Regexp.new(TEXTILE_SYNTAX_PREFIX + INTERNET_URI, Regexp::EXTENDED, 'N')
|
||||
|
||||
|
|
|
@ -97,6 +97,15 @@ class URITest < Test::Unit::TestCase
|
|||
:scheme =>'http', :host =>'www.example.com.tw', :port => '80',
|
||||
:path => '/~jdoe123/Help%20Me%20', :query => 'arg=val&arg2=val2',
|
||||
:link_text => 'http://www.example.com.tw:80/~jdoe123/Help%20Me%20?arg=val&arg2=val2')
|
||||
|
||||
# from 0.9 bug reports
|
||||
match(URIChunk, 'http://www2.pos.to/~tosh/ruby/rdtool/en/doc/rd-draft.html',
|
||||
:scheme =>'http', :host => 'www2.pos.to',
|
||||
:path => '/~tosh/ruby/rdtool/en/doc/rd-draft.html')
|
||||
|
||||
match(URIChunk, 'http://support.microsoft.com/default.aspx?scid=kb;en-us;234562',
|
||||
:scheme =>'http', :host => 'support.microsoft.com', :path => '/default.aspx',
|
||||
:query => 'scid=kb;en-us;234562')
|
||||
end
|
||||
|
||||
def test_email_uri
|
||||
|
@ -110,10 +119,32 @@ class URITest < Test::Unit::TestCase
|
|||
match(URIChunk, 'Not an email: @example.com', :user => nil, :uri => 'http://example.com')
|
||||
end
|
||||
|
||||
def test_textile_image
|
||||
assert_conversion_does_not_apply(URIChunk,
|
||||
'This !http://hobix.com/sample.jpg! is a Textile image link.')
|
||||
end
|
||||
|
||||
def test_textile_link
|
||||
assert_conversion_does_not_apply(URIChunk,
|
||||
'This "hobix (hobix)":http://hobix.com/sample.jpg is a Textile link.')
|
||||
# just to be sure ...
|
||||
match(URIChunk, 'This http://hobix.com/sample.jpg should match',
|
||||
:link_text => 'http://hobix.com/sample.jpg')
|
||||
end
|
||||
|
||||
def test_non_uri
|
||||
# "so" is a valid country code; "libproxy.so" is a valid url
|
||||
match(URIChunk, 'libproxy.so', :link_text => 'libproxy.so')
|
||||
|
||||
assert_conversion_does_not_apply URIChunk, 'httpd.conf'
|
||||
assert_conversion_does_not_apply URIChunk, 'libproxy.so'
|
||||
assert_conversion_does_not_apply URIChunk, 'ld.so.conf'
|
||||
assert_conversion_does_not_apply URIChunk, 'ld.so.conf'
|
||||
assert_conversion_does_not_apply URIChunk, 'index.jpeg'
|
||||
assert_conversion_does_not_apply URIChunk, 'index.jpg'
|
||||
assert_conversion_does_not_apply URIChunk, 'file.txt'
|
||||
assert_conversion_does_not_apply URIChunk, 'file.doc'
|
||||
assert_conversion_does_not_apply URIChunk, 'file.pdf'
|
||||
assert_conversion_does_not_apply URIChunk, 'file.png'
|
||||
assert_conversion_does_not_apply URIChunk, 'file.ps'
|
||||
end
|
||||
|
||||
def test_uri_in_text
|
||||
|
@ -123,8 +154,10 @@ class URITest < Test::Unit::TestCase
|
|||
'Email david@loudthinking.com',
|
||||
:scheme =>'mailto', :user =>'david', :host =>'loudthinking.com')
|
||||
# check that trailing punctuation is not included in the hostname
|
||||
match(URIChunk, '"link":http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com')
|
||||
end
|
||||
match(URIChunk, 'Hey dude, http://fake.link.com.', :scheme => 'http', :host => 'fake.link.com')
|
||||
# this is a textile link, no match please.
|
||||
assert_conversion_does_not_apply(URIChunk, '"link":http://fake.link.com.')
|
||||
end
|
||||
|
||||
def test_uri_in_parentheses
|
||||
match(URIChunk, 'URI (http://brackets.com.de) in brackets', :host => 'brackets.com.de')
|
||||
|
@ -172,7 +205,7 @@ class URITest < Test::Unit::TestCase
|
|||
|
||||
def assert_conversion_does_not_apply(chunk_type, str)
|
||||
processed_str = str.dup
|
||||
URIChunk.apply_to(processed_str)
|
||||
chunk_type.apply_to(processed_str)
|
||||
assert_equal(str, processed_str)
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in a new issue