From f8e74e53bd80b8ef313590926eb5e81f52dd7de2 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Sun, 18 May 2008 13:22:38 -0500 Subject: [PATCH] Rollback The "optimization" of using arrays instead of regexps to implement to_utf8 and is_utf8? (and their brethren) is actually no faster. Go back to the logically-clearer implementation. --- lib/sanitize.rb | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 00af8297..3727bb08 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -131,11 +131,16 @@ class String #-- def is_utf8? #expand NCRs to utf-8 - pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} - pieces = pieces.join.split(/&#(\d+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} - text = pieces.join + text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') } + text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') } + + # You might think this is faster, but it isn't + #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) + #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} + #pieces = pieces.join.split(/&#(\d+);/) + #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} + #text = pieces.join + #ensure the resulting string of bytes is valid utf-8 text =~ /\A( [\x09\x0A\x0D\x20-\x7E] # ASCII @@ -2283,9 +2288,7 @@ class String # string.to_ncr -> string # def to_ncr - pieces = self.split(/&([a-zA-Z0-9]+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr} - pieces.join + self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr} end # Converts XHTML+MathML named entities in string to Numeric Character References @@ -2296,9 +2299,7 @@ class String # Substitution is done in-place. # def to_ncr! - pieces = self.split(/&([a-zA-Z0-9]+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr} - self.replace pieces.join + self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr} end # Converts XHTML+MathML named entities in string to UTF-8 @@ -2306,12 +2307,17 @@ class String # :call-seq: # string.to_utf8 -> string # +#-- def to_utf8 - pieces = self.split(/&([a-zA-Z0-9]+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} - pieces.join + self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8} + + # You might think this is faster, but it isn't + # pieces = self.split(/&([a-zA-Z0-9]+);/) + # 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} + # pieces.join end - + +#++ # Converts XHTML+MathML named entities in string to UTF-8 # # :call-seq: @@ -2320,9 +2326,7 @@ class String # Substitution is done in-place. # def to_utf8! - pieces = self.split(/&([a-zA-Z0-9]+);/) - 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} - self.replace pieces.join + self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8} end protected