Efficiency: Entity handling
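Previously, a regexp substitution (gsub with a block) was used to find and convert named entities in the content. Now a more efficient algorithm is used: the string is split on the entity pattern and only the captured pieces are converted before rejoining. A similar tweak is applied to converting NCRs before checking whether the text is valid UTF-8.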
This commit is contained in:
parent
5ca0760f7c
commit
41346bf8bd
|
@ -13,8 +13,6 @@ class WikiController < ApplicationController
|
||||||
|
|
||||||
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
|
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
|
||||||
|
|
||||||
include Sanitize
|
|
||||||
|
|
||||||
def index
|
def index
|
||||||
if @web_name
|
if @web_name
|
||||||
redirect_home
|
redirect_home
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
$: << File.dirname(__FILE__) + "../../lib"
|
$: << File.dirname(__FILE__) + "../../lib"
|
||||||
|
|
||||||
require_dependency 'chunks/chunk'
|
require_dependency 'chunks/chunk'
|
||||||
|
require 'sanitize'
|
||||||
|
|
||||||
|
|
||||||
# The markup engines are Chunks that call the one of RedCloth
|
# The markup engines are Chunks that call the one of RedCloth
|
||||||
# or RDoc to convert text. This markup occurs when the chunk is required
|
# or RDoc to convert text. This markup occurs when the chunk is required
|
||||||
|
@ -40,13 +42,13 @@ module Engines
|
||||||
|
|
||||||
# If the request is for S5, call Maruku accordingly (without math)
|
# If the request is for S5, call Maruku accordingly (without math)
|
||||||
if @content.options[:mode] == :s5
|
if @content.options[:mode] == :s5
|
||||||
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => false,
|
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||||
:content_only => true,
|
{:math_enabled => false, :content_only => true,
|
||||||
:author => @content.options[:engine_opts][:author],
|
:author => @content.options[:engine_opts][:author],
|
||||||
:title => @content.options[:engine_opts][:title]})
|
:title => @content.options[:engine_opts][:title]})
|
||||||
@content.options[:renderer].s5_theme = my_content.s5_theme
|
@content.options[:renderer].s5_theme = my_content.s5_theme
|
||||||
else
|
else
|
||||||
html = Maruku.new(@content.delete("\r"), {:math_enabled => false}).to_html
|
html = Maruku.new(@content.delete("\r").to_utf8, {:math_enabled => false}).to_html
|
||||||
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -60,7 +62,8 @@ module Engines
|
||||||
|
|
||||||
# If the request is for S5, call Maruku accordingly
|
# If the request is for S5, call Maruku accordingly
|
||||||
if @content.options[:mode] == :s5
|
if @content.options[:mode] == :s5
|
||||||
my_content = Maruku.new(@content.delete("\r"), {:math_enabled => true,
|
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||||
|
{:math_enabled => true,
|
||||||
:math_numbered => ['\\[','\\begin{equation}'],
|
:math_numbered => ['\\[','\\begin{equation}'],
|
||||||
:content_only => true,
|
:content_only => true,
|
||||||
:author => @content.options[:engine_opts][:author],
|
:author => @content.options[:engine_opts][:author],
|
||||||
|
@ -68,7 +71,7 @@ module Engines
|
||||||
@content.options[:renderer].s5_theme = my_content.s5_theme
|
@content.options[:renderer].s5_theme = my_content.s5_theme
|
||||||
my_content.to_s5
|
my_content.to_s5
|
||||||
else
|
else
|
||||||
html = Maruku.new(@content.delete("\r"),
|
html = Maruku.new(@content.delete("\r").to_utf8,
|
||||||
{:math_enabled => true,
|
{:math_enabled => true,
|
||||||
:math_numbered => ['\\[','\\begin{equation}']}).to_html
|
:math_numbered => ['\\[','\\begin{equation}']}).to_html
|
||||||
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
|
||||||
|
|
|
@ -14,9 +14,6 @@ require 'chunks/chunk'
|
||||||
# Author: Mark Reid <mark at threewordslong dot com>
|
# Author: Mark Reid <mark at threewordslong dot com>
|
||||||
# Created: 8th June 2004
|
# Created: 8th June 2004
|
||||||
|
|
||||||
require 'sanitize'
|
|
||||||
include Sanitize
|
|
||||||
|
|
||||||
class NoWiki < Chunk::Abstract
|
class NoWiki < Chunk::Abstract
|
||||||
|
|
||||||
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
|
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
|
||||||
|
@ -26,7 +23,7 @@ class NoWiki < Chunk::Abstract
|
||||||
|
|
||||||
def initialize(match_data, content)
|
def initialize(match_data, content)
|
||||||
super
|
super
|
||||||
@plain_text = @unmask_text = sanitize_xhtml(match_data[1])
|
@plain_text = @unmask_text = match_data[1]
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -131,8 +131,12 @@ class String
|
||||||
#--
|
#--
|
||||||
def is_utf8?
|
def is_utf8?
|
||||||
#expand NCRs to utf-8
|
#expand NCRs to utf-8
|
||||||
text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
|
pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
||||||
text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
|
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
|
||||||
|
text = pieces.join
|
||||||
|
pieces = text.split(/&#(\d+);/)
|
||||||
|
1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
|
||||||
|
text = pieces.join
|
||||||
#ensure the resulting string of bytes is valid utf-8
|
#ensure the resulting string of bytes is valid utf-8
|
||||||
text =~ /\A(
|
text =~ /\A(
|
||||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||||
|
@ -2280,7 +2284,9 @@ class String
|
||||||
# string.to_ncr -> string
|
# string.to_ncr -> string
|
||||||
#
|
#
|
||||||
def to_ncr
|
def to_ncr
|
||||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||||
|
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
||||||
|
pieces.join
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities in string to Numeric Character References
|
# Converts XHTML+MathML named entities in string to Numeric Character References
|
||||||
|
@ -2291,7 +2297,9 @@ class String
|
||||||
# Substitution is done in-place.
|
# Substitution is done in-place.
|
||||||
#
|
#
|
||||||
def to_ncr!
|
def to_ncr!
|
||||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||||
|
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_ncr}
|
||||||
|
self.replace pieces.join
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities in string to UTF-8
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
|
@ -2300,7 +2308,9 @@ class String
|
||||||
# string.to_utf8 -> string
|
# string.to_utf8 -> string
|
||||||
#
|
#
|
||||||
def to_utf8
|
def to_utf8
|
||||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
|
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||||
|
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
||||||
|
pieces.join
|
||||||
end
|
end
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities in string to UTF-8
|
# Converts XHTML+MathML named entities in string to UTF-8
|
||||||
|
@ -2311,21 +2321,31 @@ class String
|
||||||
# Substitution is done in-place.
|
# Substitution is done in-place.
|
||||||
#
|
#
|
||||||
def to_utf8!
|
def to_utf8!
|
||||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_utf8}
|
pieces = self.split(/&([a-zA-Z0-9]+);/)
|
||||||
|
1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
|
||||||
|
self.replace pieces.join
|
||||||
end
|
end
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
||||||
def convert_to_ncr #:nodoc:
|
def convert_to_ncr #:nodoc:
|
||||||
self =~ /^&([a-zA-Z0-9]+);$/
|
if self =~ /^(lt|gt|amp|quot|apos)$/
|
||||||
name = $1
|
self.replace "&" + self + ";"
|
||||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
elsif MATHML_ENTITIES.has_key?(self)
|
||||||
|
self.replace MATHML_ENTITIES[self]
|
||||||
|
else
|
||||||
|
self.replace "&" + self + ";"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def convert_to_utf8 #:nodoc:
|
def convert_to_utf8 #:nodoc:
|
||||||
self =~ /^&([a-zA-Z0-9]+);$/
|
if self =~ /^(lt|gt|amp|quot|apos)$/
|
||||||
name = $1
|
self.replace "&" + self + ";"
|
||||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') : "&" + name + ";"
|
elsif MATHML_ENTITIES.has_key?(self)
|
||||||
|
self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*')
|
||||||
|
else
|
||||||
|
self.replace "&" + self + ";"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,8 @@ require_dependency 'chunks/literal'
|
||||||
require 'chunks/nowiki'
|
require 'chunks/nowiki'
|
||||||
require 'sanitize'
|
require 'sanitize'
|
||||||
|
|
||||||
|
include Sanitize
|
||||||
|
|
||||||
# Wiki content is just a string that can process itself with a chain of
|
# Wiki content is just a string that can process itself with a chain of
|
||||||
# actions. The actions can modify wiki content so that certain parts of
|
# actions. The actions can modify wiki content so that certain parts of
|
||||||
# it are protected from being rendered by later actions.
|
# it are protected from being rendered by later actions.
|
||||||
|
|
|
@ -12,10 +12,4 @@ class NoWikiTest < Test::Unit::TestCase
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_sanitized_nowiki
|
|
||||||
match(NoWiki, 'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");</script></nowiki>. Do not touch!',
|
|
||||||
:plain_text => '<span>a & b</span> <script>alert("XSS!");</script>'
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -344,6 +344,13 @@ class PageRendererTest < Test::Unit::TestCase
|
||||||
"</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
|
"</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_nowiki_sanitization
|
||||||
|
assert_markup_parsed_as('<p>This sentence contains <span>a & b</span> ' +
|
||||||
|
'<script>alert("XSS!");</script>. Do not touch!</p>',
|
||||||
|
'This sentence contains <nowiki><span>a & b</span> <script>alert("XSS!");' +
|
||||||
|
'</script></nowiki>. Do not touch!')
|
||||||
|
end
|
||||||
|
|
||||||
def test_link_to_file
|
def test_link_to_file
|
||||||
assert_markup_parsed_as(
|
assert_markup_parsed_as(
|
||||||
"<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",
|
"<p><span class='newWikiWord'>doc.pdf<a href='../file/doc.pdf'>?</a></span></p>",
|
||||||
|
|
Loading…
Reference in a new issue