Efficiency: Entity handling
Previously, used a regexp to find and convert named entities in the content. Now use a more efficient algorithm. Similar tweak for converting NCRs before checking whether text is valid utf-8.
This commit is contained in:
parent
5ca0760f7c
commit
41346bf8bd
7 changed files with 50 additions and 29 deletions
|
@ -13,8 +13,6 @@ class WikiController < ApplicationController
|
|||
|
||||
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
|
||||
|
||||
include Sanitize
|
||||
|
||||
def index
|
||||
if @web_name
|
||||
redirect_home
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
$: << File.dirname(__FILE__) + "../../lib"
|
||||
|
||||
require_dependency 'chunks/chunk'
|
||||
require 'sanitize'
|
||||
|
||||
|
||||
# The markup engines are Chunks that call the one of RedCloth
|
||||
# or RDoc to convert text. This markup occurs when the chunk is required
|
||||
|
@ -40,13 +42,13 @@ module Engines
|
|||
|
||||
# If the request is for S5, call Maruku accordingly (without math)
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false,
|
||||
:content_only => true,
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
{:math_enabled => false, :content_only => true,
|
||||
:author => @content.options[:engine_opts][:author],
|
||||
:title => @content.options[:engine_opts][:title]})
|
||||
@content.options[:renderer].s5_theme = my_content.s5_theme
|
||||
else
|
||||
html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html
|
||||
html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html
|
||||
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
||||
end
|
||||
|
||||
|
@ -60,7 +62,8 @@ module Engines
|
|||
|
||||
# If the request is for S5, call Maruku accordingly
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true,
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
{:math_enabled => true,
|
||||
:math_numbered => ['\\[','\\begin{equation}'],
|
||||
:content_only => true,
|
||||
:author => @content.options[:engine_opts][:author],
|
||||
|
@ -68,7 +71,7 @@ module Engines
|
|||
@content.options[:renderer].s5_theme = my_content.s5_theme
|
||||
my_content.to_s5
|
||||
else
|
||||
html = Maruku.new(@content.delete("\r"),
|
||||
html = Maruku.new(@content.delete("\r").to_utf8,
|
||||
{:math_enabled => true,
|
||||
:math_numbered => ['\\[','\\begin{equation}']}).to_html
|
||||
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
||||
|
|
|
@ -14,9 +14,6 @@ require 'chunks/chunk'
|
|||
# Author: Mark Reid <mark at threewordslong dot com>
|
||||
# Created: 8th June 2004
|
||||
|
||||
require 'sanitize'
|
||||
include Sanitize
|
||||
|
||||
class NoWiki < Chunk::Abstract
|
||||
|
||||
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
|
||||
|
@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract
|
|||
|
||||
def initialize(match_data, content)
|
||||
super
|
||||
@plain_text = @unmask_text = sanitize_xhtml(match_data[1])
|
||||
@plain_text = @unmask_text = match_data[1]
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -131,8 +131,12 @@ class String
|
|||
#--
|
||||
def is_utf8?
|
||||
#expand NCRs to utf-8
|
||||
text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
|
||||
text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
|
||||
pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
|
||||
text = pieces.join
|
||||
pieces = text.split(/&#(\d+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
|
||||
text = pieces.join
|
||||
#ensure the resulting string of bytes is valid utf-8
|
||||
text =~ /\A(
|
||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||
|
@ -2280,7 +2284,9 @@ class String
|
|||
# string.to_ncr -> string
|
||||
#
|
||||
def to_ncr
|
||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
||||
pieces.join
|
||||
end
|
||||
|
||||
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||
|
@ -2291,7 +2297,9 @@ class String
|
|||
# Substitution is done in-place.
|
||||
#
|
||||
def to_ncr!
|
||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
||||
self.replace pieces.join
|
||||
end
|
||||
|
||||
# Converts XHTML+MathML named entities in string to UTF-8
|
||||
|
@ -2300,7 +2308,9 @@ class String
|
|||
# string.to_utf8 -> string
|
||||
#
|
||||
def to_utf8
|
||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
|
||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
||||
pieces.join
|
||||
end
|
||||
|
||||
# Converts XHTML+MathML named entities in string to UTF-8
|
||||
|
@ -2311,21 +2321,31 @@ class String
|
|||
# Substitution is done in-place.
|
||||
#
|
||||
def to_utf8!
|
||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
|
||||
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
||||
self.replace pieces.join
|
||||
end
|
||||
|
||||
protected
|
||||
|
||||
def convert_to_ncr #:nodoc:
|
||||
self =~ /^&([a-zA-Z0-9]+);$/
|
||||
name = $1
|
||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
||||
if self =~ /^(lt|gt|amp|quot|apos)$/
|
||||
self.replace "&" + self + ";"
|
||||
elsif MATHML_ENTITIES.has_key?(self)
|
||||
self.replace MATHML_ENTITIES[self]
|
||||
else
|
||||
self.replace "&" + self + ";"
|
||||
end
|
||||
end
|
||||
|
||||
def convert_to_utf8 #:nodoc:
|
||||
self =~ /^&([a-zA-Z0-9]+);$/
|
||||
name = $1
|
||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&" + name + ";"
|
||||
if self =~ /^(lt|gt|amp|quot|apos)$/
|
||||
self.replace "&" + self + ";"
|
||||
elsif MATHML_ENTITIES.has_key?(self)
|
||||
self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*')
|
||||
else
|
||||
self.replace "&" + self + ";"
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
|
|
@ -7,6 +7,8 @@ require_dependency 'chunks/literal'
|
|||
require 'chunks/nowiki'
|
||||
require 'sanitize'
|
||||
|
||||
include Sanitize
|
||||
|
||||
# Wiki content is just a string that can process itself with a chain of
|
||||
# actions. The actions can modify wiki content so that certain parts of
|
||||
# it are protected from being rendered by later actions.
|
||||
|
|
|
@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase
|
|||
)
|
||||
end
|
||||
|
||||
def test_sanitized_nowiki
|
||||
match(NoWiki, 'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");</script></nowiki>. Do not touch!',
|
||||
:plain_text => '<span>a & b</span> <script>alert("XSS!");</script>'
|
||||
)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase
|
|||
"</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
|
||||
end
|
||||
|
||||
def test_nowiki_sanitization
|
||||
assert_markup_parsed_as('<p>This sentence contains <span>a & b</span> ' +
|
||||
'<script>alert("XSS!");</script>. Do not touch!</p>',
|
||||
'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");' +
|
||||
'</script></nowiki>. Do not touch!')
|
||||
end
|
||||
|
||||
def test_link_to_file
|
||||
assert_markup_parsed_as(
|
||||
"<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",
|
||||
|
|
Loading…
Add table
Reference in a new issue