diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 10e481d0..d2cfcd74 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -436,7 +436,7 @@ class String 'plussim' => '⨦', 'plustwo' => '⨧', 'prod' => '∏', - 'race' => '⧚', + 'race' => '∽̱', 'roplus' => '⨮', 'rotimes' => '⨵', 'rthree' => '⋌', @@ -624,7 +624,7 @@ class String 'iiota' => '℩', 'image' => 'ℑ', 'imath' => 'ı', - 'jmath' => 'j', + 'jmath' => 'ȷ', 'laemptyv' => '⦴', 'lltri' => '◺', 'lrtri' => '⊿', @@ -638,7 +638,7 @@ class String 'range' => '⦥', 'real' => 'ℜ', 'tbrk' => '⎴', - 'trpezium' => '�', + 'trpezium' => '⏢', 'ultri' => '◸', 'urtri' => '◹', 'vzigzag' => '⦚', @@ -844,8 +844,10 @@ class String 'nu' => 'ν', 'Omega' => 'Ω', 'omega' => 'ω', + 'phgr' => 'φ', 'Phi' => 'Φ', - 'phi' => 'ϕ', + 'phi' => 'φ', + 'phis' => 'ϕ', 'phiv' => 'φ', 'Pi' => 'Π', 'pi' => 'π', @@ -1006,7 +1008,7 @@ class String 'andv' => '⩚', 'angrt' => '∟', 'angsph' => '∢', - 'angst' => 'Å', + 'angst' => 'Å', 'ap' => '≈', 'apacir' => '⩯', 'awconint' => '∳', @@ -1036,7 +1038,7 @@ class String 'dsol' => '⧶', 'dtdot' => '⋱', 'dwangle' => '⦦', - 'elinters' => '�', + 'elinters' => '⏧', 'epar' => '⋕', 'eparsl' => '⧣', 'equiv' => '≡', @@ -1063,13 +1065,13 @@ class String 'isinsv' => '⋳', 'isinv' => '∈', 'lagran' => 'ℒ', - 'Lang' => '《', - 'lang' => '〈', + 'Lang' => '⟪', + 'lang' => '⟨', 'lArr' => '⇐', - 'lbbrk' => '〔', + 'lbbrk' => '❲', 'le' => '≤', - 'loang' => '〘', - 'lobrk' => '〚', + 'loang' => '⟬', + 'lobrk' => '⟦', 'lopar' => '⦅', 'lowast' => '∗', 'minus' => '−', @@ -1123,12 +1125,12 @@ class String 'qprime' => '⁗', 'quatint' => '⨖', 'radic' => '√', - 'Rang' => '》', - 'rang' => '〉', + 'Rang' => '⟫', + 'rang' => '⟩', 'rArr' => '⇒', - 'rbbrk' => '〕', - 'roang' => '〙', - 'robrk' => '〛', + 'rbbrk' => '❳', + 'roang' => '⟭', + 'robrk' => '⟧', 'ropar' => '⦆', 'rppolint' => '⨒', 'scpolint' => '⨓', @@ -1529,7 +1531,7 @@ class String 'nbsp' => ' ', 'not' => '¬', 'num' => '#', - 'ohm' => 'Ω', + 'ohm' => 'Ω', 'ordf' => 'ª', 'ordm' => 'º', 'para' => '¶', @@ -1590,6 +1592,7 @@ class String 'fflig' => 'ff', 'ffllig' => 'ffl', 'filig' => 'fi', + 'fjlig' => 'fj', 'flat' => '♭', 'fllig' => 'fl', 'frac13' => '⅓', @@ -1714,9 +1717,9 @@ class String 'NotSucceedsTilde' => '≿̸', 'oopf' => '𝕠', 'OverBar' => '¯', - 'OverBrace' => '︷', + 'OverBrace' => '⏞', 'OverBracket' => '⎴', - 'OverParenthesis' => '︵', + 'OverParenthesis' => '⏜', 'planckh' => 'ℎ', 'popf' => '𝕡', 'Product' => '∏', @@ -1738,9 +1741,9 @@ class String 'ThickSpace' => '   ', 'topf' => '𝕥', 'UnderBar' => '̲', - 'UnderBrace' => '︸', + 'UnderBrace' => '⏟', 'UnderBracket' => '⎵', - 'UnderParenthesis' => '︶', + 'UnderParenthesis' => '⏝', 'uopf' => '𝕦', 'UpArrowBar' => '⤒', 'Upsilon' => 'Υ', @@ -1944,11 +1947,11 @@ class String 'intprod' => '⨼', 'InvisibleComma' => '⁣', 'InvisibleTimes' => '⁢', - 'langle' => '〈', + 'langle' => '⟨', 'Laplacetrf' => 'ℒ', 'lbrace' => '{', 'lbrack' => '[', - 'LeftAngleBracket' => '〈', + 'LeftAngleBracket' => '⟨', 'LeftArrow' => '←', 'Leftarrow' => '⇐', 'leftarrow' => '←', @@ -1956,7 +1959,7 @@ class String 'LeftArrowRightArrow' => '⇆', 'leftarrowtail' => '↢', 'LeftCeiling' => '⌈', - 'LeftDoubleBracket' => '〚', + 'LeftDoubleBracket' => '⟦', 'LeftDownVector' => '⇃', 'LeftFloor' => '⌊', 'leftharpoondown' => '↽', @@ -2136,7 +2139,7 @@ class String 'propto' => '∝', 'quaternions' => 'ℍ', 'questeq' => '≟', - 'rangle' => '〉', + 'rangle' => '⟩', 'rationals' => 'ℚ', 'rbrace' => '}', 'rbrack' => ']', @@ -2147,7 +2150,7 @@ class String 'ReverseElement' => '∋', 'ReverseEquilibrium' => '⇋', 'ReverseUpEquilibrium' => '⥯', - 'RightAngleBracket' => '〉', + 'RightAngleBracket' => '⟩', 'RightArrow' => '→', 'Rightarrow' => '⇒', 'rightarrow' => '→', @@ -2155,7 +2158,7 @@ class String 'RightArrowLeftArrow' => '⇄', 'rightarrowtail' => '↣', 'RightCeiling' => '⌉', - 'RightDoubleBracket' => '〛', + 'RightDoubleBracket' => '⟧', 'RightDownVector' => '⇂', 'RightFloor' => '⌋', 'rightharpoondown' => '⇁', @@ -2299,7 +2302,14 @@ class String 'wedge' => '∧', 'wp' => '℘', 'wr' => '≀', - 'zeetrf' => 'ℨ' + 'zeetrf' => 'ℨ', + 'AMP' => '&', + 'COPY' => '©', + 'GT' => '>', + 'LT' => '<', + 'QUOT' => '"', + 'REG' => '®', + 'TRADE' => '™' } unless const_defined? "MATHML_ENTITIES" #:startdoc: @@ -2363,8 +2373,8 @@ class String end def convert_to_utf8 #:nodoc: - if self =~ /^(lt|gt|amp|quot|apos)$/ - self.replace "&" + self + ";" + if self =~ /^(lt|gt|amp|quot|apos)$/i + self.replace "&" + self.downcase + ";" elsif MATHML_ENTITIES.has_key?(self) self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') else diff --git a/lib/stringsupport.rb b/lib/stringsupport.rb index 8349ae26..a16aeda8 100644 --- a/lib/stringsupport.rb +++ b/lib/stringsupport.rb @@ -2295,8 +2295,8 @@ class String end def convert_to_utf8 #:nodoc: - if self =~ /^(lt|gt|amp|quot|apos)$/ - self.replace "&" + self + ";" + if self =~ /^(lt|gt|amp|quot|apos)$/i + self.replace "&" + self.downcase + ";" elsif MATHML_ENTITIES.has_key?(self) self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') else diff --git a/test/unit/sanitize_test.rb b/test/unit/sanitize_test.rb index fb897b6c..90c0050e 100644 --- a/test/unit/sanitize_test.rb +++ b/test/unit/sanitize_test.rb @@ -27,13 +27,13 @@ class SanitizeTest < Test::Unit::TestCase end def my_rex(string) - sanitize_rexml(rexml_doc(string)).gsub(/\A
(.*)<\/div>\Z/m, '\1') + sanitize_rexml(rexml_doc(string.to_utf8)).gsub(/\A
(.*)<\/div>\Z/m, '\1') end def test_sanitize_named_entities - input = '

Greek φ, double-struck 𝔸, numeric 𝔸 ⁗

' - output = "

Greek \317\225, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227

" - output2 = "

Greek \317\225, double-struck \360\235\224\270, numeric 𝔸 ⁗

" + input = '

Greek &phis; φ, double-struck 𝔸, numeric 𝔸 ⁗, uppercase ™ <

' + output = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227, uppercase \342\204\242 <

" + output2 = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric 𝔸 ⁗, uppercase \342\204\242 <

" assert_equal(output, sanitize_xhtml(input)) assert_equal(output, sanitize_html(input)) assert_equal(output, my_rex(input)) diff --git a/test/unit/sanitizer_test.rb b/test/unit/sanitizer_test.rb index 11f865a4..3e94bf01 100644 --- a/test/unit/sanitizer_test.rb +++ b/test/unit/sanitizer_test.rb @@ -22,6 +22,14 @@ class SanitizerTest < Test::Unit::TestCase assert_equal xhtmloutput, do_sanitize_xhtml(input) end + def test_sanitize_named_entities + input = '

Greek &phis; φ, double-struck 𝔸, numeric 𝔸 ⁗, uppercase ™ <

' + output = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric \360\235\224\270 \342\201\227, uppercase \342\204\242 <

" + output2 = "

Greek \317\225 \317\206, double-struck \360\235\224\270, numeric 𝔸 ⁗, uppercase \342\204\242 <

" + check_sanitization(input, output, output, output) + assert_equal(output2, input.to_utf8) + end + Sanitizer::ALLOWED_ELEMENTS.each do |tag_name| define_method "test_should_allow_#{tag_name}_tag" do input = "<#{tag_name} title='1'>foo bar baz"