Efficiency
Under Ruby 1.9, this version of String#purify is 12% faster than the previous one.
This commit is contained in:
parent
f7044ecbb4
commit
171c12d2c1
|
@ -6,20 +6,30 @@ class String
|
||||||
#
|
#
|
||||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT".
|
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT".
|
||||||
#--
|
#--
|
||||||
|
if "".respond_to?(:force_encoding)
|
||||||
|
def as_bytes
|
||||||
|
force_encoding("ASCII-8BIT")
|
||||||
|
end
|
||||||
|
else
|
||||||
def as_bytes
|
def as_bytes
|
||||||
force_encoding("ASCII-8BIT") if self.respond_to?(:force_encoding)
|
|
||||||
self
|
self
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
#++
|
#++
|
||||||
# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
|
# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
|
||||||
#
|
#
|
||||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8".
|
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8".
|
||||||
#--
|
#--
|
||||||
|
if "".respond_to?(:force_encoding)
|
||||||
|
def as_utf8
|
||||||
|
force_encoding("UTF-8")
|
||||||
|
end
|
||||||
|
else
|
||||||
def as_utf8
|
def as_utf8
|
||||||
force_encoding("UTF-8") if self.respond_to?(:force_encoding)
|
|
||||||
self
|
self
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
#++
|
#++
|
||||||
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
|
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
|
||||||
|
@ -29,14 +39,17 @@ class String
|
||||||
#
|
#
|
||||||
# returns a valid utf-8 string, purged of any subsequences of illegal bytes.
|
# returns a valid utf-8 string, purged of any subsequences of illegal bytes.
|
||||||
#--
|
#--
|
||||||
def purify
|
if "".respond_to?(:force_encoding)
|
||||||
text = self.dup.check_ncrs.as_utf8
|
def purify
|
||||||
if text.respond_to?(:force_encoding)
|
text = check_ncrs.as_utf8
|
||||||
text.chars.collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8
|
text.chars.collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8
|
||||||
else
|
end
|
||||||
text.split(//u).grep(UTF8_REGEX).join
|
else
|
||||||
end
|
def purify
|
||||||
end
|
text = check_ncrs
|
||||||
|
text.split(//u).grep(UTF8_REGEX).join
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def check_ncrs
|
def check_ncrs
|
||||||
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
|
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
|
||||||
|
|
Loading…
Reference in a new issue