Performance
My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string.
This commit is contained in:
parent
0eb1ab56b0
commit
198d7847bd
5 changed files with 121 additions and 16 deletions
32
test/unit/sanitize_test.rb
Normal file
32
test/unit/sanitize_test.rb
Normal file
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
|
||||
require 'sanitize'
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
|
||||
def setup
|
||||
|
||||
end
|
||||
|
||||
def rexml_doc(string)
|
||||
REXML::Document.new(
|
||||
"<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>")
|
||||
end
|
||||
|
||||
def my_rex(string)
|
||||
sanitize_rexml(rexml_doc(string)).gsub(/\A<div xmlns="http:\/\/www.w3.org\/1999\/xhtml">(.*)<\/div>\Z/m, '\1')
|
||||
end
|
||||
|
||||
def test_sanitize_named_entities
|
||||
input = '<p>Greek φ, double-struck 𝔸, numeric 𝔸 ⁗</p>'
|
||||
output = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
|
||||
output2 = "<p>Greek \317\225, double-struck \360\235\224\270, numeric 𝔸 ⁗</p>"
|
||||
assert_equal(output, sanitize_xhtml(input))
|
||||
assert_equal(output, sanitize_html(input))
|
||||
assert_equal(output, my_rex(input))
|
||||
assert_equal(output2, input.to_utf8)
|
||||
end
|
||||
|
||||
|
||||
end
|
Loading…
Add table
Add a link
Reference in a new issue