Ensure the_content REALLY is utf-8

Our check that the_content was valid utf-8 was rather busted.
This one works right. In particular, we needed to expand numeric character references (NCRs) before checking.
This commit is contained in:
Jacques Distler 2008-01-03 15:27:03 -06:00
parent c89aeb6665
commit ebc409e1a0
3 changed files with 55 additions and 11 deletions

View file

@ -128,8 +128,13 @@ class String
# string.is_utf8? -> boolean
#
# returns true if the sequence of bytes in string is valid utf-8
#--
def is_utf8?
self =~ /^(
#expand NCRs to utf-8
text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
#ensure the resulting string of bytes is valid utf-8
text =~ /\A(
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
@ -140,8 +145,9 @@ class String
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*$/x;
)*\Z/x;
end
#++
#:stopdoc:
MATHML_ENTITIES = {