Efficiency: Entity handling

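Previously, a regexp-based gsub was used to find and convert named entities in the content. Now the string is split on the entity pattern, the captured entity names are converted in place, and the pieces are joined back together, which is more efficient. A similar change is made for expanding NCRs (numeric character references) before checking whether text is valid UTF-8.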
2008-05-17 01:43:11 -05:00 · 2008-05-17 01:43:11 -05:00 · 41346bf8bd
commit 41346bf8bd
parent 5ca0760f7c
7 changed files with 50 additions and 29 deletions
--- a/app/controllers/wiki_controller.rb
+++ b/app/controllers/wiki_controller.rb
@ -13,8 +13,6 @@ class WikiController < ApplicationController
  layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
  include Sanitize
  def index
    if @web_name
      redirect_home
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -1,6 +1,8 @@
 $: << File.dirname(__FILE__) + "../../lib"
 require_dependency 'chunks/chunk'
 require 'sanitize'
 # The markup engines are Chunks that call one of RedCloth
 # or RDoc to convert text. This markup occurs when the chunk is required
@ -40,13 +42,13 @@ module Engines
      # If the request is for S5, call Maruku accordingly (without math)
      if @content.options[:mode] == :s5
-        my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false,
+        my_content = Maruku.new(@content.delete("\r").to_utf8,
-                            :content_only => true,
+                           {:math_enabled => false, :content_only => true,
                            :author => @content.options[:engine_opts][:author],
                            :title => @content.options[:engine_opts][:title]})
        @content.options[:renderer].s5_theme = my_content.s5_theme
      else
-        html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html
+        html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html
        html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
      end
@ -60,7 +62,8 @@ module Engines
      # If the request is for S5, call Maruku accordingly
      if @content.options[:mode] == :s5
-        my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true,
+        my_content = Maruku.new(@content.delete("\r").to_utf8,
                           {:math_enabled => true,
                            :math_numbered => ['\\[','\\begin{equation}'],
                            :content_only => true,
                            :author => @content.options[:engine_opts][:author],
@ -68,7 +71,7 @@ module Engines
        @content.options[:renderer].s5_theme = my_content.s5_theme
        my_content.to_s5
      else
-        html = Maruku.new(@content.delete("\r"),
+        html = Maruku.new(@content.delete("\r").to_utf8,
             {:math_enabled => true,
              :math_numbered => ['\\[','\\begin{equation}']}).to_html
        html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
--- a/lib/chunks/nowiki.rb
+++ b/lib/chunks/nowiki.rb
@ -14,9 +14,6 @@ require 'chunks/chunk'
 # Author: Mark Reid <mark at threewordslong dot com>
 # Created: 8th June 2004
 require 'sanitize'
 include Sanitize
 class NoWiki < Chunk::Abstract
  NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract
  def initialize(match_data, content)
    super
-    @plain_text = @unmask_text = sanitize_xhtml(match_data[1])
+    @plain_text = @unmask_text = match_data[1]
  end
 end
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -131,8 +131,12 @@ class String
 #--
   def is_utf8?
     #expand NCRs to utf-8
-     text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
+     pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
-     text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
+     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
     text = pieces.join
     pieces = text.split(/&#(\d+);/)
     1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
     text = pieces.join     
     #ensure the resulting string of bytes is valid utf-8
     text =~  /\A(
         [\x09\x0A\x0D\x20-\x7E]            # ASCII
@ -2280,7 +2284,9 @@ class String
 #     string.to_ncr  -> string
 #
    def to_ncr
-       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
       pieces.join
    end
 # Converts XHTML+MathML named entities in string to Numeric Character References
@ -2291,7 +2297,9 @@ class String
 # Substitution is done in-place.
 #
    def to_ncr!
-       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
       self.replace pieces.join
    end
 # Converts XHTML+MathML named entities in string to UTF-8
@ -2300,7 +2308,9 @@ class String
 #     string.to_utf8  -> string
 #
    def to_utf8
-       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
       pieces.join
    end
 # Converts XHTML+MathML named entities in string to UTF-8
@ -2311,21 +2321,31 @@ class String
 # Substitution is done in-place.
 #
    def to_utf8!
-       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
+       pieces = self.split(/&([a-zA-Z0-9]+);/)
       1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
       self.replace pieces.join
    end
  protected
    def convert_to_ncr #:nodoc:
-      self =~ /^&([a-zA-Z0-9]+);$/
+      if self =~ /^(lt|gt|amp|quot|apos)$/
-      name = $1
+        self.replace "&" + self + ";"
-      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";"
+      elsif MATHML_ENTITIES.has_key?(self)
        self.replace MATHML_ENTITIES[self]
      else
        self.replace "&amp;" + self + ";"
      end
    end
    def convert_to_utf8 #:nodoc:
-      self =~ /^&([a-zA-Z0-9]+);$/
+      if self =~ /^(lt|gt|amp|quot|apos)$/
-      name = $1
+        self.replace "&" + self + ";"
-      return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&amp;" + name + ";"
+      elsif MATHML_ENTITIES.has_key?(self)         
        self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*')
      else
        self.replace "&amp;" + self + ";"
      end
    end
--- a/lib/wiki_content.rb
+++ b/lib/wiki_content.rb
@ -7,6 +7,8 @@ require_dependency 'chunks/literal'
 require 'chunks/nowiki'
 require 'sanitize'
 include Sanitize
 # Wiki content is just a string that can process itself with a chain of
 # actions. The actions can modify wiki content so that certain parts of
 # it are protected from being rendered by later actions.
--- a/test/unit/chunks/nowiki_test.rb
+++ b/test/unit/chunks/nowiki_test.rb
@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase
 	)
  end
  def test_sanitized_nowiki
 	match(NoWiki, 'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");</script></nowiki>. Do not touch!',
 		:plain_text => '<span>a &amp; b</span> &lt;script&gt;alert("XSS!");&lt;/script&gt;'
 	)
  end
 end
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase
        "</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
  end
  def test_nowiki_sanitization
    assert_markup_parsed_as('<p>This sentence contains <span>a &amp; b</span> ' +
     '&lt;script&gt;alert("XSS!");&lt;/script&gt;. Do not touch!</p>',
      'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");' +
      '</script></nowiki>. Do not touch!')
  end
  def test_link_to_file
    assert_markup_parsed_as( 
      "<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",