From f8e74e53bd80b8ef313590926eb5e81f52dd7de2 Mon Sep 17 00:00:00 2001
From: Jacques Distler <distler@golem.ph.utexas.edu>
Date: Sun, 18 May 2008 13:22:38 -0500
Subject: [PATCH] Rollback: the "optimization" of using arrays instead of
 regexps to implement to_utf8 and is_utf8? (and their brethren) is actually no
 faster. Go back to the logically clearer implementation.

---
 lib/sanitize.rb | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index 00af8297..3727bb08 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -131,11 +131,16 @@ class String
 #--
    def is_utf8?
      #expand NCRs to utf-8
-     pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
-     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
-     pieces = pieces.join.split(/&#(\d+);/)
-     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
-     text = pieces.join     
+     text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
+     text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') }
+     
+     # You might think the split-based version below is faster, but it isn't
+     #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
+     #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
+     #pieces = pieces.join.split(/&#(\d+);/)
+     #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
+     #text = pieces.join
+          
      #ensure the resulting string of bytes is valid utf-8
      text =~  /\A(
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
@@ -2283,9 +2288,7 @@ class String
 #     string.to_ncr  -> string
 #
     def to_ncr
-       pieces = self.split(/&([a-zA-Z0-9]+);/)
-       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
-       pieces.join
+       self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
     end
 
 # Converts XHTML+MathML named entities in string to Numeric Character References
@@ -2296,9 +2299,7 @@ class String
 # Substitution is done in-place.
 #
     def to_ncr!
-       pieces = self.split(/&([a-zA-Z0-9]+);/)
-       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
-       self.replace pieces.join
+       self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr}
     end
 
 # Converts XHTML+MathML named entities in string to UTF-8
@@ -2306,12 +2307,17 @@ class String
 #  :call-seq:
 #     string.to_utf8  -> string
 #
+#--
     def to_utf8
-       pieces = self.split(/&([a-zA-Z0-9]+);/)
-       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
-       pieces.join
+      self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
+      
+      # You might think the split-based version below is faster, but it isn't
+      # pieces = self.split(/&([a-zA-Z0-9]+);/)
+      # 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
+      # pieces.join
     end
-
+    
+#++
 # Converts XHTML+MathML named entities in string to UTF-8
 #
 #  :call-seq:
@@ -2320,9 +2326,7 @@ class String
 # Substitution is done in-place.
 #
     def to_utf8!
-       pieces = self.split(/&([a-zA-Z0-9]+);/)
-       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
-       self.replace pieces.join
+       self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8}
     end
 
   protected