198d7847bd
My REXML::Element.to_ncr (and REXML::Element.to_utf8) is horribly slow. For long documents, it proves more efficient to serialize to a string, apply String.to_ncr (or String.to_utf8) and then Sanitize the string.
33 lines
951 B
Ruby
33 lines
951 B
Ruby
#!/usr/bin/env ruby

# Load the shared test helper (one directory up), then the library under test.
require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
require 'sanitize'

# Tests for entity handling in the sanitizer: character references in the
# input are expected to come out as raw UTF-8 bytes.
#
# The helpers sanitize_xhtml, sanitize_html, sanitize_rexml and
# String#to_utf8 are defined outside this file (presumably by the
# 'sanitize' library or the test helper -- verify).
class SanitizeTest < Test::Unit::TestCase
  # Matches the XHTML wrapper <div> added by #rexml_doc and captures whatever
  # sits between the opening and closing tags (multi-line).
  WRAPPER_DIV = %r{\A<div xmlns="http://www\.w3\.org/1999/xhtml">(.*)</div>\Z}m

  # No fixtures are needed; kept as an explicit no-op hook.
  def setup
  end

  # Wraps an XHTML fragment in a namespaced <div> and parses it.
  #
  # string - the markup fragment to embed.
  #
  # Returns the REXML::Document built from the wrapped fragment.
  def rexml_doc(string)
    wrapped = "<div xmlns='http://www.w3.org/1999/xhtml'>#{string}</div>"
    REXML::Document.new(wrapped)
  end

  # Runs a fragment through the REXML-based sanitizer and strips the wrapper
  # <div> again, so the result is comparable with the string-based sanitizers.
  #
  # string - the markup fragment to sanitize.
  #
  # Returns the sanitized fragment. Assumes sanitize_rexml returns the
  # serialized document as a String -- TODO confirm.
  def my_rex(string)
    serialized = sanitize_rexml(rexml_doc(string))
    # The pattern is anchored at both ends, so at most one match exists and
    # a single substitution is sufficient.
    serialized.sub(WRAPPER_DIV, '\1')
  end

  # All three sanitizer entry points must resolve named and numeric character
  # references to UTF-8, while String#to_utf8 is expected to convert only the
  # named ones and leave numeric references untouched.
  def test_sanitize_named_entities
    # NOTE(review): the previous revision of these literals held the already
    # decoded characters, which made `output` and `output2` identical and the
    # input inconsistent with the expected bytes (U+03C6 vs. U+03D5). They
    # are restored as entity references here; the exact spellings (`&phi;`,
    # hexadecimal numeric form) are a reconstruction -- confirm against the
    # upstream repository.
    input = '<p>Greek &phi;, double-struck &Aopf;, numeric &#x1D538; &#x2057;</p>'
    output = "<p>Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227</p>"
    output2 = "<p>Greek \317\225, double-struck \360\235\224\270, numeric &#x1D538; &#x2057;</p>"

    assert_equal(output, sanitize_xhtml(input))
    assert_equal(output, sanitize_html(input))
    assert_equal(output, my_rex(input))
    assert_equal(output2, input.to_utf8)
  end
end