Rollback
The "optimization" of using arrays (split/join) instead of regexps (gsub) to implement to_utf8 and is_utf8? (and their brethren) is actually no faster. Go back to the logically clearer gsub-based implementation.
This commit is contained in:
parent
a37b06b801
commit
f8e74e53bd
|
@ -131,11 +131,16 @@ class String
|
||||||
#--
|
#--
|
||||||
def is_utf8?
|
def is_utf8?
|
||||||
#expand NCRs to utf-8
|
#expand NCRs to utf-8
|
||||||
pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
|
text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') }
|
||||||
pieces = pieces.join.split(/&#(\d+);/)
|
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
|
# You might think this is faster, but it isn't
|
||||||
text = pieces.join
|
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
||||||
|
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
|
||||||
|
#pieces = pieces.join.split(/&#(\d+);/)
|
||||||
|
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
|
||||||
|
#text = pieces.join
|
||||||
|
|
||||||
#ensure the resulting string of bytes is valid utf-8
|
#ensure the resulting string of bytes is valid utf-8
|
||||||
text =~ /\A(
|
text =~ /\A(
|
||||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||||
|
@ -2283,9 +2288,7 @@ class String
|
||||||
# string.to_ncr -> string
|
# string.to_ncr -> string
|
||||||
#
|
#
|
||||||
def to_ncr
|
def to_ncr
|
||||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
|
||||||
pieces.join
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities in string to Numeric Character References
|
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||||
|
@ -2296,9 +2299,7 @@ class String
|
||||||
# Substitution is done in-place.
|
# Substitution is done in-place.
|
||||||
#
|
#
|
||||||
def to_ncr!
|
def to_ncr!
|
||||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
|
||||||
self.replace pieces.join
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities in string to UTF-8
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
|
@ -2306,12 +2307,17 @@ class String
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
# string.to_utf8 -> string
|
# string.to_utf8 -> string
|
||||||
#
|
#
|
||||||
|
#--
|
||||||
def to_utf8
|
def to_utf8
|
||||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
|
||||||
pieces.join
|
# You might think this is faster, but it isn't
|
||||||
|
# pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||||
|
# 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
||||||
|
# pieces.join
|
||||||
end
|
end
|
||||||
|
|
||||||
|
#++
|
||||||
# Converts XHTML+MathML named entities in string to UTF-8
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
#
|
#
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
|
@ -2320,9 +2326,7 @@ class String
|
||||||
# Substitution is done in-place.
|
# Substitution is done in-place.
|
||||||
#
|
#
|
||||||
def to_utf8!
|
def to_utf8!
|
||||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
|
||||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
|
||||||
self.replace pieces.join
|
|
||||||
end
|
end
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
Loading…
Reference in a new issue