From 41346bf8bd918de83a562e35758189b5d6a21138 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Sat, 17 May 2008 01:43:11 -0500 Subject: [PATCH] Efficiency: Entity handling Previously, used a regexp to find and convert named entities in the content. Now use a more efficient algorithm. Similar tweak for converting NCRs before checking whether text is valid utf-8. --- app/controllers/wiki_controller.rb | 2 -- lib/chunks/engines.rb | 13 +++++---- lib/chunks/nowiki.rb | 5 +--- lib/sanitize.rb | 44 ++++++++++++++++++++++-------- lib/wiki_content.rb | 2 ++ test/unit/chunks/nowiki_test.rb | 6 ---- test/unit/page_renderer_test.rb | 7 +++++ 7 files changed, 50 insertions(+), 29 deletions(-) diff --git a/app/controllers/wiki_controller.rb b/app/controllers/wiki_controller.rb index 6d69b554..80bd4a8e 100644 --- a/app/controllers/wiki_controller.rb +++ b/app/controllers/wiki_controller.rb @@ -13,8 +13,6 @@ class WikiController < ApplicationController layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html] - include Sanitize - def index if @web_name redirect_home diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb index 2576ad14..0f5300c6 100644 --- a/lib/chunks/engines.rb +++ b/lib/chunks/engines.rb @@ -1,6 +1,8 @@ $: << File.dirname(__FILE__) + "../../lib" require_dependency 'chunks/chunk' +require 'sanitize' + # The markup engines are Chunks that call the one of RedCloth # or RDoc to convert text. This markup occurs when the chunk is required @@ -40,13 +42,13 @@ module Engines # If the request is for S5, call Maruku accordingly (without math) if @content.options[:mode] == :s5 - my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false, - :content_only => true, + my_content = Maruku.new(@content.delete("\r").to_utf8, + {:math_enabled => false, :content_only => true, :author => @content.options[:engine_opts][:author], :title => @content.options[:engine_opts][:title]}) @content.options[:renderer].s5_theme = my_content.s5_theme else - html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html + html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html html.gsub(/\A
\n?(.*?)\n?<\/div>\Z/m, '\1') end @@ -60,7 +62,8 @@ module Engines # If the request is for S5, call Maruku accordingly if @content.options[:mode] == :s5 - my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true, + my_content = Maruku.new(@content.delete("\r").to_utf8, + {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true, :author => @content.options[:engine_opts][:author], @@ -68,7 +71,7 @@ module Engines @content.options[:renderer].s5_theme = my_content.s5_theme my_content.to_s5 else - html = Maruku.new(@content.delete("\r"), + html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html html.gsub(/\A
\n?(.*?)\n?<\/div>\Z/m, '\1') diff --git a/lib/chunks/nowiki.rb b/lib/chunks/nowiki.rb index dc1f1109..db67c847 100644 --- a/lib/chunks/nowiki.rb +++ b/lib/chunks/nowiki.rb @@ -14,9 +14,6 @@ require 'chunks/chunk' # Author: Mark Reid # Created: 8th June 2004 -require 'sanitize' -include Sanitize - class NoWiki < Chunk::Abstract NOWIKI_PATTERN = Regexp.new('(.*?)', Regexp::MULTILINE) @@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract def initialize(match_data, content) super - @plain_text = @unmask_text = sanitize_xhtml(match_data[1]) + @plain_text = @unmask_text = match_data[1] end end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 76da1779..8d2e0595 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -131,8 +131,12 @@ class String #-- def is_utf8? #expand NCRs to utf-8 - text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') } - text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') } + pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} + text = pieces.join + pieces = text.split(/&#(\d+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} + text = pieces.join #ensure the resulting string of bytes is valid utf-8 text =~ /\A( [\x09\x0A\x0D\x20-\x7E] # ASCII @@ -2280,7 +2284,9 @@ class String # string.to_ncr -> string # def to_ncr - self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr} + pieces = self.split(/&([a-zA-Z0-9]+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr} + pieces.join end # Converts XHTML+MathML named entities in string to Numeric Character References @@ -2291,7 +2297,9 @@ class String # Substitution is done in-place. # def to_ncr! - self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr} + pieces = self.split(/&([a-zA-Z0-9]+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr} + self.replace pieces.join end # Converts XHTML+MathML named entities in string to UTF-8 @@ -2300,7 +2308,9 @@ class String # string.to_utf8 -> string # def to_utf8 - self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} + pieces = self.split(/&([a-zA-Z0-9]+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} + pieces.join end # Converts XHTML+MathML named entities in string to UTF-8 @@ -2311,21 +2321,31 @@ class String # Substitution is done in-place. # def to_utf8! - self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} + pieces = self.split(/&([a-zA-Z0-9]+);/) + 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} + self.replace pieces.join end protected def convert_to_ncr #:nodoc: - self =~ /^&([a-zA-Z0-9]+);$/ - name = $1 - return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";" + if self =~ /^(lt|gt|amp|quot|apos)$/ + self.replace "&" + self + ";" + elsif MATHML_ENTITIES.has_key?(self) + self.replace MATHML_ENTITIES[self] + else + self.replace "&" + self + ";" + end end def convert_to_utf8 #:nodoc: - self =~ /^&([a-zA-Z0-9]+);$/ - name = $1 - return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&" + name + ";" + if self =~ /^(lt|gt|amp|quot|apos)$/ + self.replace "&" + self + ";" + elsif MATHML_ENTITIES.has_key?(self) + self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') + else + self.replace "&" + self + ";" + end end diff --git a/lib/wiki_content.rb b/lib/wiki_content.rb index c3e9de1e..75a846e5 100644 --- a/lib/wiki_content.rb +++ b/lib/wiki_content.rb @@ -7,6 +7,8 @@ require_dependency 'chunks/literal' require 'chunks/nowiki' require 'sanitize' +include Sanitize + # Wiki content is just a string that can process itself with a chain of # actions. The actions can modify wiki content so that certain parts of # it are protected from being rendered by later actions. diff --git a/test/unit/chunks/nowiki_test.rb b/test/unit/chunks/nowiki_test.rb index c1010922..8af5a645 100755 --- a/test/unit/chunks/nowiki_test.rb +++ b/test/unit/chunks/nowiki_test.rb @@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase ) end - def test_sanitized_nowiki - match(NoWiki, 'This sentence contains a & b . Do not touch!', - :plain_text => 'a & b <script>alert("XSS!");</script>' - ) - end - end diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb index 943f698d..feef67e0 100644 --- a/test/unit/page_renderer_test.rb +++ b/test/unit/page_renderer_test.rb @@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase " and lovely morning today

", test_renderer(@page.revisions.last).display_diff end + def test_nowiki_sanitization + assert_markup_parsed_as('

This sentence contains a & b ' + + '<script>alert("XSS!");</script>. Do not touch!

', + 'This sentence contains a & b . Do not touch!') + end + def test_link_to_file assert_markup_parsed_as( "

doc.pdf?

",