Efficiency: Entity handling

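Previously, a regexp substitution (gsub) was used to find and convert named entities in the content.
Now a more efficient algorithm is used: split the string on entity references and convert only the captured names.
A similar tweak is applied to converting NCRs before checking whether the text is valid UTF-8.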
This commit is contained in:
Jacques Distler 2008-05-17 01:43:11 -05:00
parent 5ca0760f7c
commit 41346bf8bd
7 changed files with 50 additions and 29 deletions

View file

@ -13,8 +13,6 @@ class WikiController < ApplicationController
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html] layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
include Sanitize
def index def index
if @web_name if @web_name
redirect_home redirect_home

View file

@ -1,6 +1,8 @@
$: << File.dirname(__FILE__) + "../../lib" $: << File.dirname(__FILE__) + "../../lib"
require_dependency 'chunks/chunk' require_dependency 'chunks/chunk'
require 'sanitize'
# The markup engines are Chunks that call the one of RedCloth # The markup engines are Chunks that call the one of RedCloth
# or RDoc to convert text. This markup occurs when the chunk is required # or RDoc to convert text. This markup occurs when the chunk is required
@ -40,13 +42,13 @@ module Engines
# If the request is for S5, call Maruku accordingly (without math) # If the request is for S5, call Maruku accordingly (without math)
if @content.options[:mode] == :s5 if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false, my_content = Maruku.new(@content.delete("\r").to_utf8,
:content_only => true, {:math_enabled => false, :content_only => true,
:author => @content.options[:engine_opts][:author], :author => @content.options[:engine_opts][:author],
:title => @content.options[:engine_opts][:title]}) :title => @content.options[:engine_opts][:title]})
@content.options[:renderer].s5_theme = my_content.s5_theme @content.options[:renderer].s5_theme = my_content.s5_theme
else else
html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1') html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
end end
@ -60,7 +62,8 @@ module Engines
# If the request is for S5, call Maruku accordingly # If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5 if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true, my_content = Maruku.new(@content.delete("\r").to_utf8,
{:math_enabled => true,
:math_numbered => ['\\[','\\begin{equation}'], :math_numbered => ['\\[','\\begin{equation}'],
:content_only => true, :content_only => true,
:author => @content.options[:engine_opts][:author], :author => @content.options[:engine_opts][:author],
@ -68,7 +71,7 @@ module Engines
@content.options[:renderer].s5_theme = my_content.s5_theme @content.options[:renderer].s5_theme = my_content.s5_theme
my_content.to_s5 my_content.to_s5
else else
html = Maruku.new(@content.delete("\r"), html = Maruku.new(@content.delete("\r").to_utf8,
{:math_enabled => true, {:math_enabled => true,
:math_numbered => ['\\[','\\begin{equation}']}).to_html :math_numbered => ['\\[','\\begin{equation}']}).to_html
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1') html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')

View file

@ -14,9 +14,6 @@ require 'chunks/chunk'
# Author: Mark Reid <mark at threewordslong dot com> # Author: Mark Reid <mark at threewordslong dot com>
# Created: 8th June 2004 # Created: 8th June 2004
require 'sanitize'
include Sanitize
class NoWiki < Chunk::Abstract class NoWiki < Chunk::Abstract
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE) NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract
def initialize(match_data, content) def initialize(match_data, content)
super super
@plain_text = @unmask_text = sanitize_xhtml(match_data[1]) @plain_text = @unmask_text = match_data[1]
end end
end end

View file

@ -131,8 +131,12 @@ class String
#-- #--
def is_utf8? def is_utf8?
#expand NCRs to utf-8 #expand NCRs to utf-8
text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') } pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') } 1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
text = pieces.join
pieces = text.split(/&#(\d+);/)
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
text = pieces.join
#ensure the resulting string of bytes is valid utf-8 #ensure the resulting string of bytes is valid utf-8
text =~ /\A( text =~ /\A(
[\x09\x0A\x0D\x20-\x7E] # ASCII [\x09\x0A\x0D\x20-\x7E] # ASCII
@ -2280,7 +2284,9 @@ class String
# string.to_ncr -> string # string.to_ncr -> string
# #
def to_ncr def to_ncr
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr} pieces = self.split(/&([a-zA-Z0-9]+);/)
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
pieces.join
end end
# Converts XHTML+MathML named entities in string to Numeric Character References # Converts XHTML+MathML named entities in string to Numeric Character References
@ -2291,7 +2297,9 @@ class String
# Substitution is done in-place. # Substitution is done in-place.
# #
def to_ncr! def to_ncr!
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr} pieces = self.split(/&([a-zA-Z0-9]+);/)
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
self.replace pieces.join
end end
# Converts XHTML+MathML named entities in string to UTF-8 # Converts XHTML+MathML named entities in string to UTF-8
@ -2300,7 +2308,9 @@ class String
# string.to_utf8 -> string # string.to_utf8 -> string
# #
def to_utf8 def to_utf8
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} pieces = self.split(/&([a-zA-Z0-9]+);/)
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
pieces.join
end end
# Converts XHTML+MathML named entities in string to UTF-8 # Converts XHTML+MathML named entities in string to UTF-8
@ -2311,21 +2321,31 @@ class String
# Substitution is done in-place. # Substitution is done in-place.
# #
def to_utf8! def to_utf8!
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8} pieces = self.split(/&([a-zA-Z0-9]+);/)
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
self.replace pieces.join
end end
protected protected
def convert_to_ncr #:nodoc: def convert_to_ncr #:nodoc:
self =~ /^&([a-zA-Z0-9]+);$/ if self =~ /^(lt|gt|amp|quot|apos)$/
name = $1 self.replace "&" + self + ";"
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&amp;" + name + ";" elsif MATHML_ENTITIES.has_key?(self)
self.replace MATHML_ENTITIES[self]
else
self.replace "&amp;" + self + ";"
end
end end
def convert_to_utf8 #:nodoc: def convert_to_utf8 #:nodoc:
self =~ /^&([a-zA-Z0-9]+);$/ if self =~ /^(lt|gt|amp|quot|apos)$/
name = $1 self.replace "&" + self + ";"
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&amp;" + name + ";" elsif MATHML_ENTITIES.has_key?(self)
self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*')
else
self.replace "&amp;" + self + ";"
end
end end

View file

@ -7,6 +7,8 @@ require_dependency 'chunks/literal'
require 'chunks/nowiki' require 'chunks/nowiki'
require 'sanitize' require 'sanitize'
include Sanitize
# Wiki content is just a string that can process itself with a chain of # Wiki content is just a string that can process itself with a chain of
# actions. The actions can modify wiki content so that certain parts of # actions. The actions can modify wiki content so that certain parts of
# it are protected from being rendered by later actions. # it are protected from being rendered by later actions.

View file

@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase
) )
end end
def test_sanitized_nowiki
match(NoWiki, 'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");</script></nowiki>. Do not touch!',
:plain_text => '<span>a &amp; b</span> &lt;script&gt;alert("XSS!");&lt;/script&gt;'
)
end
end end

View file

@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase
"</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff "</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
end end
def test_nowiki_sanitization
assert_markup_parsed_as('<p>This sentence contains <span>a &amp; b</span> ' +
'&lt;script&gt;alert("XSS!");&lt;/script&gt;. Do not touch!</p>',
'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");' +
'</script></nowiki>. Do not touch!')
end
def test_link_to_file def test_link_to_file
assert_markup_parsed_as( assert_markup_parsed_as(
"<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>", "<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",