The "optimization" of using arrays instead of regexps to
implement to_utf8 and is_utf8? (and their brethren) is
actually no faster. Go back to the logically clearer implementation.
This commit is contained in:
Jacques Distler 2008-05-18 13:22:38 -05:00
parent a37b06b801
commit f8e74e53bd

View file

@ -131,11 +131,16 @@ class String
#-- #--
def is_utf8? def is_utf8?
#expand NCRs to utf-8 #expand NCRs to utf-8
pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') }
pieces = pieces.join.split(/&#(\d+);/)
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} # You might think this is faster, but it isn't
text = pieces.join #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
#pieces = pieces.join.split(/&#(\d+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
#text = pieces.join
#ensure the resulting string of bytes is valid utf-8 #ensure the resulting string of bytes is valid utf-8
text =~ /\A( text =~ /\A(
[\x09\x0A\x0D\x20-\x7E] # ASCII [\x09\x0A\x0D\x20-\x7E] # ASCII
@ -2283,9 +2288,7 @@ class String
# string.to_ncr -> string # string.to_ncr -> string
# #
def to_ncr def to_ncr
pieces = self.split(/&([a-zA-Z0-9]+);/) self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
pieces.join
end end
# Converts XHTML+MathML named entities in string to Numeric Character References # Converts XHTML+MathML named entities in string to Numeric Character References
@ -2296,9 +2299,7 @@ class String
# Substitution is done in-place. # Substitution is done in-place.
# #
def to_ncr! def to_ncr!
pieces = self.split(/&([a-zA-Z0-9]+);/) self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
self.replace pieces.join
end end
# Converts XHTML+MathML named entities in string to UTF-8 # Converts XHTML+MathML named entities in string to UTF-8
@ -2306,12 +2307,17 @@ class String
# :call-seq: # :call-seq:
# string.to_utf8 -> string # string.to_utf8 -> string
# #
#--
def to_utf8 def to_utf8
pieces = self.split(/&([a-zA-Z0-9]+);/) self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
pieces.join # You might think this is faster, but it isn't
# pieces = self.split(/&([a-zA-Z0-9]+);/)
# 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
# pieces.join
end end
#++
# Converts XHTML+MathML named entities in string to UTF-8 # Converts XHTML+MathML named entities in string to UTF-8
# #
# :call-seq: # :call-seq:
@ -2320,9 +2326,7 @@ class String
# Substitution is done in-place. # Substitution is done in-place.
# #
def to_utf8! def to_utf8!
pieces = self.split(/&([a-zA-Z0-9]+);/) self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
self.replace pieces.join
end end
protected protected