Better URL parsing (URIChunk knows more country codes, and is smarter about avoiding messing with Textile markup)
This commit is contained in:
parent
21f7693c06
commit
b888799798
2 changed files with 56 additions and 10 deletions
|
@ -15,15 +15,26 @@ require 'chunks/chunk'
|
|||
# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
|
||||
# The generic names are from www.bnoack.com/data/countrycode2.html)
|
||||
# [iso3166]: http://geotags.com/iso3166/
|
||||
|
||||
class URIChunk < Chunk::Abstract
|
||||
include URI::REGEXP::PATTERN
|
||||
|
||||
# this condition is to get rid of pesky warnings in tests
|
||||
unless defined? URIChunk::INTERNET_URI_REGEXP
|
||||
|
||||
GENERIC = '(?:aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org)'
|
||||
COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)'
|
||||
|
||||
GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'
|
||||
|
||||
COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' +
|
||||
'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' +
|
||||
'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' +
|
||||
'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' +
|
||||
'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' +
|
||||
'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' +
|
||||
'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' +
|
||||
'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' +
|
||||
'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' +
|
||||
'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' +
|
||||
'ws|ye|yt|yu|za|zm|zr|zw'
|
||||
# These are needed otherwise HOST will match almost anything
|
||||
TLDS = "(?:#{GENERIC}|#{COUNTRY})"
|
||||
|
||||
|
@ -56,9 +67,11 @@ class URIChunk < Chunk::Abstract
|
|||
"(?::(#{PORT}))?" + # Optional :port (\4)
|
||||
"(#{ABS_PATH})?" + # Optional absolute path (\5)
|
||||
"(?:\\?(#{QUERY}))?" + # Optional ?query (\6)
|
||||
"(?:\\#(#{FRAGMENT}))?" # Optional #fragment (\7)
|
||||
"(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7)
|
||||
'(?=\.?(?:\s|\)|\z))' # ends only with
|
||||
# optional dot + space or ")" or end of string
|
||||
|
||||
TEXTILE_SYNTAX_PREFIX = '(!)?'
|
||||
TEXTILE_SYNTAX_PREFIX = '(!|\"\:)?' # ! or ":
|
||||
|
||||
INTERNET_URI_REGEXP = Regexp.new(TEXTILE_SYNTAX_PREFIX + INTERNET_URI, Regexp::EXTENDED, 'N')
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue