Ensure the_content REALLY is utf-8
Our check that the_content was valid UTF-8 was rather busted. This one works right. In particular, we needed to expand numeric character references (NCRs) before checking.
This commit is contained in:
parent
c89aeb6665
commit
ebc409e1a0
3 changed files with 55 additions and 11 deletions
|
@ -128,8 +128,13 @@ class String
|
|||
# string.is_utf8? -> boolean
|
||||
#
|
||||
# returns true if the sequence of bytes in string is valid utf-8
|
||||
#--
|
||||
def is_utf8?
|
||||
self =~ /^(
|
||||
#expand NCRs to utf-8
|
||||
text = self.gsub(/&#x([a-fA-F0-9]+);/) {|m| [$1.hex].pack('U*') }
|
||||
text.gsub!(/&#(\d+);/) {|m| [$1.to_i].pack('U*') }
|
||||
#ensure the resulting string of bytes is valid utf-8
|
||||
text =~ /\A(
|
||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||
|
@ -140,8 +145,9 @@ class String
|
|||
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
||||
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
||||
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
||||
)*$/x;
|
||||
)*\Z/x;
|
||||
end
|
||||
#++
|
||||
|
||||
#:stopdoc:
|
||||
MATHML_ENTITIES = {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue