Clean, rather than Complain

Previously, if the user tried to submit content which was
malformed utf-8, Instiki would complain loudly to him.

A slightly more user-friendly approach was suggested by
the latest Rails 2.3.4, and a conversation with Sam Ruby
(who suggested some improvements).

Now, instead of complaining, we remove the offending bytes,
leaving a well-formed utf-8 string, which we pretend is what
the user meant to submit.
This commit is contained in:
Jacques Distler 2009-09-07 16:02:36 -05:00
parent f029aae60e
commit c79fef9c01
9 changed files with 96 additions and 75 deletions

View file

@ -140,7 +140,7 @@ class ApplicationController < ActionController::Base
<html xmlns="http://www.w3.org/1999/xhtml"><body>
<h2>Internal Error</h2>
<p>An application error occurred while processing your request.</p>
<!-- \n#{exception.to_s.is_utf8? ? exception.to_s.gsub!(/-{2,}/, '- -') : ''}\n#{exception.backtrace.join("\n")}\n -->
<!-- \n#{exception.to_s.purify.gsub!(/-{2,}/, '- -') }\n#{exception.backtrace.join("\n")}\n -->
</body></html>
EOL
end

View file

@ -124,14 +124,10 @@ class FileController < ApplicationController
zip = Zip::ZipInputStream.open(archive)
while (entry = zip.get_next_entry) do
ext_length = File.extname(entry.name).length
page_name = entry.name[0..-(ext_length + 1)]
page_content = entry.get_input_stream.read
page_name = entry.name[0..-(ext_length + 1)].purify
page_content = entry.get_input_stream.read.purify
logger.info "Processing page '#{page_name}'"
begin
if !page_content.is_utf8?
logger.info "Page '#{page_name}' contains non-utf8 character data. Skipping."
next
end
existing_page = @wiki.read_page(@web.address, page_name)
if existing_page
if existing_page.content == page_content

View file

@ -162,8 +162,7 @@ class WikiController < ApplicationController
end
def search
@query = params['query']
render(:text => "Your query string was not valid utf-8", :layout => 'error', :status => 400) and return unless @query.is_utf8?
@query = params['query'].purify
@title_results = @web.select { |page| page.name =~ /#{@query}/i }.sort
@results = @web.select { |page| page.content =~ /#{@query}/i }.sort
all_pages_found = (@results + @title_results).uniq
@ -180,7 +179,7 @@ class WikiController < ApplicationController
end
def edit
if @page.nil? or not @page_name.is_utf8?
if @page.nil?
redirect_home
elsif @page.locked?(Time.now) and not params['break_lock']
redirect_to :web => @web_name, :action => 'locked', :id => @page_name
@ -190,12 +189,10 @@ class WikiController < ApplicationController
end
def locked
render(:text => 'Page name is not valid utf-8.', :status => 400, :layout => 'error') unless @page_name.is_utf8?
# to template
end
def new
render(:text => 'Page name is not valid utf-8.', :status => 400, :layout => 'error') unless @page_name.is_utf8?
# to template
end
@ -258,32 +255,22 @@ class WikiController < ApplicationController
end
def save
render(:status => 404, :text => 'Undefined page name', :layout => 'error') and return if @page_name.nil? or not @page_name.is_utf8?
render(:status => 404, :text => 'Undefined page name', :layout => 'error') and return if @page_name.nil?
unless (request.post? || ENV["RAILS_ENV"] == "test")
headers['Allow'] = 'POST'
render(:status => 405, :text => 'You must use an HTTP POST', :layout => 'error')
return
end
author_name = params['author']
author_name = params['author'].purify
author_name = 'AnonymousCoward' if author_name =~ /^\s*$/
begin
raise Instiki::ValidationError.new('Your name was not valid utf-8') unless author_name.is_utf8?
raise Instiki::ValidationError.new('Your name cannot contain a "."') if author_name.include? '.'
cookies['author'] = { :value => author_name, :expires => Time.utc(2030) }
the_content = params['content']
the_content = params['content'].purify
filter_spam(the_content)
unless the_content.is_utf8?
if @page
the_content = @page.content
else
the_content = ''
end
raise Instiki::ValidationError.new('Your content was not valid utf-8.')
end
if @page
new_name = params['new_name'] || @page_name
raise Instiki::ValidationError.new('Your new title was not valid utf-8.') unless new_name.is_utf8?
new_name = params['new_name'] ? params['new_name'].purify : @page_name
raise Instiki::ValidationError.new('Your new title cannot contain a "."') if new_name.include? '.'
raise Instiki::ValidationError.new('A page named "' + new_name.escapeHTML + '" already exists.') if @page_name != new_name && @web.has_page?(new_name)
wiki.revise_page(@web_name, @page_name, new_name, the_content, Time.now,
@ -325,7 +312,7 @@ class WikiController < ApplicationController
end
end
else
if not @page_name.nil? and @page_name.is_utf8? and not @page_name.empty?
if not @page_name.nil? and not @page_name.empty?
real_page = WikiReference.page_that_redirects_for(@web, @page_name)
if real_page
flash[:info] = "Redirected from \"#{@page_name}\"."
@ -354,7 +341,7 @@ class WikiController < ApplicationController
end
render :action => 'history'
else
if not @page_name.nil? and @page_name.is_utf8? and not @page_name.empty?
if not @page_name.nil? and not @page_name.empty?
redirect_to :web => @web_name, :action => 'new', :id => @page_name
else
render :text => 'Page name is not specified', :status => 404, :layout => 'error'
@ -397,7 +384,7 @@ class WikiController < ApplicationController
end
def load_page
@page_name = params['id']
@page_name = params['id'] ? params['id'].purify : nil
@page = @wiki.read_page(@web_name, @page_name) if @page_name
end

View file

@ -14,7 +14,7 @@
'accept-charset' => 'utf-8' }) do %>
<div>
<textarea name="content" id="content" rows="24" cols="60"><%= h(flash[:content] ||
((params['content'] && params['content'].is_utf8?) ? params['content'] : @page.content).purify) %></textarea>
(params['content'] ? params['content'] : @page.content).purify) %></textarea>
<% if @page_name != 'HomePage' -%>
<p>
<%= check_box_tag :alter_title, value = "1", checked=false,

View file

@ -14,7 +14,7 @@
{ 'id' => 'editForm', 'method' => 'post', 'onsubmit' => 'cleanAuthorName();', 'accept-charset' => 'utf-8' }) do %>
<textarea name="content" id="content" rows="24" cols="60"><%= h(flash[:content] ||
( (params['content'] && params['content'].is_utf8?) ? params['content'] : '').purify ) %></textarea>
params['content'] ? params['content'].purify : '' ) %></textarea>
<div id="editFormButtons">
<input type="submit" value="Submit" accesskey="s"/> as
<%= text_field_tag :author, @author,

View file

@ -1,4 +1,4 @@
<%- @title = "Search results for \"#{h params["query"]}\"" -%>
<%- @title = "Search results for \"#{h @query}\"" -%>
<%- unless @title_results.empty? -%>
<h2><%= @title_results.length %> page(s) containing search string in the page name:</h2>
@ -24,7 +24,7 @@
<%- end -%>
<%- if (@results + @title_results).empty? -%>
<h2>No pages contain "<%= h params["query"] %>" </h2>
<h2>No pages contain "<%= h @query %>" </h2>
<p>
Perhaps you should try expanding your query. Remember that Instiki searches for entire
phrases, so if you search for "all that jazz" it will not match pages that contain these
@ -36,6 +36,6 @@
"[a-z]*Leet?RegExpSkill(s|z)"
</p>
<p>
<b>Create a new page, named:</b> "<span class='newWikiWord'><%= link_to h(params[:query]), :web => @web.address, :action => 'new', :id => params[:query] %></span>"
<b>Create a new page, named:</b> "<span class='newWikiWord'><%= link_to h(@query), :web => @web.address, :action => 'new', :id => @query %></span>"
</p>
<%- end -%>

View file

@ -2,27 +2,25 @@
class String
# Check whether a string is valid utf-8
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
#
# :call-seq:
# string.is_utf8? -> boolean
# string.purify -> new_string
#
# returns true if the sequence of bytes in string is valid utf-8
# returns a valid utf-8 string, purged of any subsequences of illegal bytes.
#--
def is_utf8?
#expand NCRs to utf-8
text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') }
def purify
text = expand_ncrs
text.split(//u).grep(UTF8_REGEX).join
end
# You might think this is faster, but it isn't
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
#pieces = pieces.join.split(/&#(\d+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
#text = pieces.join
def expand_ncrs
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') }
text
end
#ensure the resulting string of bytes is valid utf-8
text =~ /\A(
UTF8_REGEX = /\A(
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
@ -33,13 +31,31 @@ class String
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\Z/nx;
end
)+\Z/nx;
#++
def purify
delete("\x01-\x08\x0B\x0C\x0E-\x1F", "\ufffe\uffff")
end
# Check whether a string is valid utf-8
#
# :call-seq:
# string.is_utf8? -> boolean
#
# returns true if the sequence of bytes in string is valid utf-8
#--
def is_utf8?
#expand NCRs to utf-8
text = self.expand_ncrs
# You might think this is faster, but it isn't
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
#pieces = pieces.join.split(/&#(\d+);/)
#1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
#text = pieces.join
#ensure the resulting string of bytes is valid utf-8
text =~ UTF8_REGEX
end
#:stopdoc:
MATHML_ENTITIES = {
'Alpha' => '&#x0391;',

View file

@ -607,30 +607,39 @@ class WikiControllerTest < ActionController::TestCase
end
def test_save_not_utf8
r = process 'save', 'web' => 'wiki1', 'id' => 'NewPage', 'content' => "Contents of a new page\r\n\000",
r = process 'save', 'web' => 'wiki1', 'id' => 'NewPage', 'content' => "Cont\000ents of a new page\r\n\000",
'author' => 'AuthorOfNewPage'
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'new', :id => 'NewPage', :content => ''
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'show', :id => 'NewPage'
assert_equal 'AuthorOfNewPage', r.cookies['author']
assert_match @eternity, r.headers["Set-Cookie"][0]
new_page = @wiki.read_page('wiki1', 'NewPage')
assert_equal "Contents of a new page\r\n", new_page.content
assert_equal 'AuthorOfNewPage', new_page.author
end
def test_save_not_utf8_ncr
r = process 'save', 'web' => 'wiki1', 'id' => 'NewPage', 'content' => "Contents of a new page\r\n&#xfffe;",
'author' => 'AuthorOfNewPage'
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'new', :id => 'NewPage', :content => ''
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'show', :id => 'NewPage'
assert_equal 'AuthorOfNewPage', r.cookies['author']
assert_match @eternity, r.headers["Set-Cookie"][0]
new_page = @wiki.read_page('wiki1', 'NewPage')
assert_equal "Contents of a new page\r\n", new_page.content
assert_equal 'AuthorOfNewPage', new_page.author
end
def test_save_not_utf8_dec_ncr
r = process 'save', 'web' => 'wiki1', 'id' => 'NewPage', 'content' => "Contents of a new page\r\n&#65535;",
'author' => 'AuthorOfNewPage'
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'new', :id => 'NewPage', :content => ''
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'show', :id => 'NewPage'
assert_equal 'AuthorOfNewPage', r.cookies['author']
assert_match @eternity, r.headers["Set-Cookie"][0]
new_page = @wiki.read_page('wiki1', 'NewPage')
assert_equal "Contents of a new page\r\n", new_page.content
assert_equal 'AuthorOfNewPage', new_page.author
end
def test_save_new_revision_of_existing_page
@ -653,17 +662,15 @@ class WikiControllerTest < ActionController::TestCase
@home.lock(Time.now, 'Batman')
current_revisions = @home.revisions.size
r = process 'save', 'web' => 'wiki1', 'id' => 'HomePage', 'content' => "Revised HomePage\000",
r = process 'save', 'web' => 'wiki1', 'id' => 'HomePage', 'content' => "Newly rev\000ised HomePage",
'author' => 'Batman'
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'edit', :id => 'HomePage',
:content => 'HisWay would be MyWay $\sin(x)\begin{svg}<svg/>\end{svg}\includegraphics[width' +
'=3em]{foo}$ in kinda ThatWay in HisWay though MyWay \OverThere -- see SmartEng' +
'ine in that SmartEngineGUI'
assert_redirected_to :web => 'wiki1', :controller => 'wiki', :action => 'show', :id => 'HomePage'
assert_equal 'Batman', r.cookies['author']
home_page = @wiki.read_page('wiki1', 'HomePage')
assert_equal current_revisions, home_page.revisions.size
assert_equal 'DavidHeinemeierHansson', home_page.author
assert_equal current_revisions+1, home_page.revisions.size
assert_equal 'Newly revised HomePage', home_page.content
assert_equal 'Batman', home_page.author
assert !home_page.locked?(Time.now)
end
@ -750,11 +757,20 @@ class WikiControllerTest < ActionController::TestCase
assert_redirected_to :action => 'new', :controller => 'wiki', :web => 'wiki1', :id => 'NewPage'
assert r.flash[:error].to_s == 'Your name cannot contain a "."'
r = process 'save', 'web' => 'wiki1', 'id' => 'NewPage', 'content' => 'Contents of a new page',
'author' => "Fu\000Manchu"
assert_redirected_to :action => 'show', :controller => 'wiki', :web => 'wiki1', :id => 'NewPage'
new_page = @wiki.read_page('wiki1', 'NewPage')
assert_equal 'FuManchu', new_page.author
r = process 'save', 'web' => 'wiki1', 'id' => 'AnotherPage', 'content' => 'Contents of a new page',
'author' => "\000"
assert_redirected_to :action => 'new', :controller => 'wiki', :web => 'wiki1', :id => 'AnotherPage'
assert r.flash[:error].to_s == "Your name was not valid utf-8"
assert_redirected_to :action => 'show', :controller => 'wiki', :web => 'wiki1', :id => 'AnotherPage'
new_page = @wiki.read_page('wiki1', 'AnotherPage')
assert_equal 'AnonymousCoward', new_page.author
end
def test_search
@ -794,17 +810,24 @@ class WikiControllerTest < ActionController::TestCase
end
def test_search_null_in_query
r = process 'search', 'web' => 'wiki1', 'query' => "\x00"
r = process 'search', 'web' => 'wiki1', 'query' => "non-existant\x00"
assert_response(400)
assert_match /Your query string was not valid utf-8/, r.body
assert_response(:success)
assert_match /No pages contain \"non-existant\"/, r.body
end
def test_search_FFFF_in_query
r = process 'search', 'web' => 'wiki1', 'query' => "\xEF\xBF\xBF"
assert_response(400)
assert_match /Your query string was not valid utf-8/, r.body
assert_response(:success)
assert_match /9 page\(s\) containing search string in the page name:/, r.body
r = process 'search', 'web' => 'wiki1', 'query' => "Al\357\277\277l about"
assert_response(:success)
assert_equal 'All about', r.template_objects['query']
assert_equal [@elephant, @oak], r.template_objects['results']
assert_equal [], r.template_objects['title_results']
end
def test_search_FFFD_in_query

View file

@ -1 +0,0 @@
performance.sql