Don't Expand NCRs

That operation is not idempotent (among other defects).
Instead, just check that the NCRs correspond to valid utf-8.
(Reported by Andrew Stacey)
This commit is contained in:
Jacques Distler 2009-09-09 09:16:00 -05:00
parent d51b04d3db
commit 3ff68ef42f
2 changed files with 11 additions and 5 deletions

View file

@ -10,13 +10,13 @@ class String
# returns a valid utf-8 string, purged of any subsequences of illegal bytes. # returns a valid utf-8 string, purged of any subsequences of illegal bytes.
#-- #--
def purify def purify
text = expand_ncrs text = check_ncrs
text.split(//u).grep(UTF8_REGEX).join text.split(//u).grep(UTF8_REGEX).join
end end
def expand_ncrs def check_ncrs
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') } text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') =~ UTF8_REGEX ? m : '' }
text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') } text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') =~ UTF8_REGEX ? m : '' }
text text
end end
@ -43,7 +43,7 @@ class String
#-- #--
def is_utf8? def is_utf8?
#expand NCRs to utf-8 #expand NCRs to utf-8
text = self.expand_ncrs text = self.check_ncrs
# You might think this is faster, but it isn't # You might think this is faster, but it isn't
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)

View file

@ -771,6 +771,12 @@ class WikiControllerTest < ActionController::TestCase
new_page = @wiki.read_page('wiki1', 'AnotherPage') new_page = @wiki.read_page('wiki1', 'AnotherPage')
assert_equal 'AnonymousCoward', new_page.author assert_equal 'AnonymousCoward', new_page.author
r = process 'save', 'web' => 'wiki1', 'id' => 'AnotherPage', 'content' => 'Revised contents of a new page',
'author' => "G&#xfffe;eo&#2147483647;rge &#38; June"
assert_redirected_to :action => 'show', :controller => 'wiki', :web => 'wiki1', :id => 'AnotherPage'
new_page = @wiki.read_page('wiki1', 'AnotherPage')
assert_equal 'George &#38; June', new_page.author
end end
def test_search def test_search