instiki/lib/sanitize.rb
Jacques Distler dfe22be5ff Minor tweak
This is slightly better.
2008-05-17 02:32:20 -05:00

2472 lines
66 KiB
Ruby

# == Introduction
#
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers.
#
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitizes a REXML tree, returning a string
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# == License
#
# Ruby License
module Sanitize

  require 'html5/html5parser'
  require 'html5/liberalxmlparser'
  require 'html5/treewalkers'
  require 'html5/treebuilders'
  require 'html5/serializer'
  require 'html5/sanitizer'

  include HTML5

  # Sanitize a string, parsed using XHTML parsing rules.
  #
  # :call-seq:
  #    sanitize_xhtml(string)                    -> string
  #    sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # Recognised options (string or symbol keys; anything else is ignored):
  # :encoding::    encoding of the input string (default 'utf-8')
  # :treebuilder:: name handed to TreeBuilders.get_tree_builder (default: the REXML tree builder)
  # :to_tree::     when true, return the parsed tree instead of a string
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_xhtml(html, options = {})
    read_sanitize_options(options)
    # XHTML is case-sensitive, so the parser must not fold element/attribute names.
    parse_sanitized_fragment(XHTMLParser, html,
      :lowercase_element_name => false, :lowercase_attr_name => false)
  end

  # Sanitize a string, parsed using HTML parsing rules.
  #
  # :call-seq:
  #    sanitize_html( string )                    -> string
  #    sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # Accepts the same options as sanitize_xhtml (:encoding, :treebuilder, :to_tree).
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_html(html, options = {})
    read_sanitize_options(options)
    parse_sanitized_fragment(HTMLParser, html)
  end

  # Sanitize a REXML tree. The output is a string.
  #
  # :call-seq:
  #    sanitize_rexml(tree)                    -> string
  #
  # Walks the tree with the 'rexml2' tree walker and serializes the resulting
  # token stream with XHTMLSerializer, with its :sanitize option switched on.
  def sanitize_rexml(tree)
    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
      :sanitize => true})
  end

  private

  # Resets the per-call settings to their defaults and then applies the
  # recognised entries of +options+. The settings are kept in instance
  # variables of the including object (@encoding, @treebuilder, @to_tree),
  # so every call overwrites the values left by the previous one.
  def read_sanitize_options(options)
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
    @to_tree = false
    options.each do |name, value|
      case name.to_s
      when 'treebuilder'
        # Resolved through the included HTML5 namespace, like every other
        # TreeBuilders/TreeWalkers reference in this module.
        @treebuilder = TreeBuilders.get_tree_builder(value)
      when 'encoding'
        @encoding = value
      when 'to_tree'
        @to_tree = value
      end
    end
  end

  # Parses +html+ as a fragment with +parser+ (XHTMLParser or HTMLParser),
  # using HTMLSanitizer as the tokenizer, and returns either the tree or its
  # string form depending on @to_tree. +parser_options+ are merged into the
  # options handed to the parser.
  def parse_sanitized_fragment(parser, html, parser_options = {})
    # String#to_utf8 / String#to_ncr are defined outside this module
    # (not visible here); the original code picks between them on the
    # exact string 'utf-8'.
    source = (@encoding == 'utf-8') ? html.to_utf8 : html.to_ncr
    parsed = parser.parse_fragment(source, {:tokenizer => HTMLSanitizer,
      :encoding => @encoding, :tree => @treebuilder}.merge(parser_options))
    @to_tree ? parsed : parsed.to_s
  end

end
# Some useful additions to the String class
class String
# Check whether a string is valid utf-8
#
# :call-seq:
# string.is_utf8? -> boolean
#
# returns true if the sequence of bytes in string is valid utf-8
#--
def is_utf8?
  # The check below is byte-oriented. On Ruby 1.9+ strings carry an encoding
  # tag, so every string we touch is retagged as binary; on Ruby 1.8 strings
  # are already plain bytes and are left alone. We work on a copy of self.
  as_bytes = lambda { |s| s.respond_to?(:force_encoding) ? s.force_encoding('ASCII-8BIT') : s }
  text = as_bytes.call(dup)
  #expand NCRs to utf-8 (hexadecimal references first, then decimal ones)
  text = text.gsub(/&#[xX]([a-fA-F0-9]+);/) { as_bytes.call([Regexp.last_match(1).hex].pack('U')) }
  text = text.gsub(/&#(\d+);/) { as_bytes.call([Regexp.last_match(1).to_i].pack('U')) }
  #ensure the resulting string of bytes is valid utf-8
  # (/n makes the pattern match raw bytes regardless of $KCODE / source encoding)
  valid = text =~ /\A(?:
     [\x09\x0A\x0D\x20-\x7E]            # ASCII
   | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
   | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
   | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
   | \xEF[\x80-\xBE][\x80-\xBF]         # U+F000 - U+FFBF
   | \xEF\xBF[\x80-\xBD]                # excluding U+fffe and U+ffff
   | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
   | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
   | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
   | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
  )*\z/nx
  valid ? true : false
rescue RangeError
  # pack('U') raises RangeError for a numeric character reference whose value
  # is too large to encode; such input cannot expand to valid utf-8.
  false
end
#++
#:stopdoc:
MATHML_ENTITIES = {
'Alpha' => 'Α',
'Beta' => 'Β',
'Epsilon' => 'Ε',
'Zeta' => 'Ζ',
'Eta' => 'Η',
'Iota' => 'Ι',
'Kappa' => 'Κ',
'Mu' => 'Μ',
'Nu' => 'Ν',
'Omicron' => 'Ο',
'Rho' => 'Ρ',
'Tau' => 'Τ',
'Chi' => 'Χ',
'epsilon' => 'ε',
'zeta' => 'ζ',
'omicron' => 'ο',
'sigmaf' => 'ς',
'thetasym' => 'ϑ',
'upsih' => 'ϒ',
'oline' => '‾',
'frasl' => '⁄',
'alefsym' => 'ℵ',
'crarr' => '↵',
'empty' => '∅',
'amp' => '&',
'lt' => '<',
'zwnj' => '‌',
'zwj' => '‍',
'lrm' => '‎',
'rlm' => '‏',
'sbquo' => '‚',
'bdquo' => '„',
'lsaquo' => '‹',
'rsaquo' => '›',
'euro' => '€',
'angzarr' => '⍼',
'cirmid' => '⫯',
'cudarrl' => '⤸',
'cudarrr' => '⤵',
'cularr' => '↶',
'cularrp' => '⤽',
'curarr' => '↷',
'curarrm' => '⤼',
'Darr' => '↡',
'dArr' => '⇓',
'ddarr' => '⇊',
'DDotrahd' => '⤑',
'dfisht' => '⥿',
'dHar' => '⥥',
'dharl' => '⇃',
'dharr' => '⇂',
'duarr' => '⇵',
'duhar' => '⥯',
'dzigrarr' => '⟿',
'erarr' => '⥱',
'hArr' => '⇔',
'harr' => '↔',
'harrcir' => '⥈',
'harrw' => '↭',
'hoarr' => '⇿',
'imof' => '⊷',
'lAarr' => '⇚',
'Larr' => '↞',
'larrbfs' => '⤟',
'larrfs' => '⤝',
'larrhk' => '↩',
'larrlp' => '↫',
'larrpl' => '⤹',
'larrsim' => '⥳',
'larrtl' => '↢',
'lAtail' => '⤛',
'latail' => '⤙',
'lBarr' => '⤎',
'lbarr' => '⤌',
'ldca' => '⤶',
'ldrdhar' => '⥧',
'ldrushar' => '⥋',
'ldsh' => '↲',
'lfisht' => '⥼',
'lHar' => '⥢',
'lhard' => '↽',
'lharu' => '↼',
'lharul' => '⥪',
'llarr' => '⇇',
'llhard' => '⥫',
'loarr' => '⇽',
'lrarr' => '⇆',
'lrhar' => '⇋',
'lrhard' => '⥭',
'lsh' => '↰',
'lurdshar' => '⥊',
'luruhar' => '⥦',
'Map' => '⤅',
'map' => '↦',
'midcir' => '⫰',
'mumap' => '⊸',
'nearhk' => '⤤',
'neArr' => '⇗',
'nearr' => '↗',
'nesear' => '⤨',
'nhArr' => '⇎',
'nharr' => '↮',
'nlArr' => '⇍',
'nlarr' => '↚',
'nrArr' => '⇏',
'nrarr' => '↛',
'nrarrc' => '⤳̸',
'nrarrw' => '↝̸',
'nvHarr' => '⤄',
'nvlArr' => '⤂',
'nvrArr' => '⤃',
'nwarhk' => '⤣',
'nwArr' => '⇖',
'nwarr' => '↖',
'nwnear' => '⤧',
'olarr' => '↺',
'orarr' => '↻',
'origof' => '⊶',
'rAarr' => '⇛',
'Rarr' => '↠',
'rarrap' => '⥵',
'rarrbfs' => '⤠',
'rarrc' => '⤳',
'rarrfs' => '⤞',
'rarrhk' => '↪',
'rarrlp' => '↬',
'rarrpl' => '⥅',
'rarrsim' => '⥴',
'Rarrtl' => '⤖',
'rarrtl' => '↣',
'rarrw' => '↝',
'rAtail' => '⤜',
'ratail' => '⤚',
'RBarr' => '⤐',
'rBarr' => '⤏',
'rbarr' => '⤍',
'rdca' => '⤷',
'rdldhar' => '⥩',
'rdsh' => '↳',
'rfisht' => '⥽',
'rHar' => '⥤',
'rhard' => '⇁',
'rharu' => '⇀',
'rharul' => '⥬',
'rlarr' => '⇄',
'rlhar' => '⇌',
'roarr' => '⇾',
'rrarr' => '⇉',
'rsh' => '↱',
'ruluhar' => '⥨',
'searhk' => '⤥',
'seArr' => '⇘',
'searr' => '↘',
'seswar' => '⤩',
'simrarr' => '⥲',
'slarr' => '←',
'srarr' => '→',
'swarhk' => '⤦',
'swArr' => '⇙',
'swarr' => '↙',
'swnwar' => '⤪',
'Uarr' => '↟',
'uArr' => '⇑',
'Uarrocir' => '⥉',
'udarr' => '⇅',
'udhar' => '⥮',
'ufisht' => '⥾',
'uHar' => '⥣',
'uharl' => '↿',
'uharr' => '↾',
'uuarr' => '⇈',
'vArr' => '⇕',
'varr' => '↕',
'xhArr' => '⟺',
'xharr' => '⟷',
'xlArr' => '⟸',
'xlarr' => '⟵',
'xmap' => '⟼',
'xrArr' => '⟹',
'xrarr' => '⟶',
'zigrarr' => '⇝',
'ac' => '∾',
'acE' => '∾̳',
'amalg' => '⨿',
'barvee' => '⊽',
'Barwed' => '⌆',
'barwed' => '⌅',
'bsolb' => '⧅',
'Cap' => '⋒',
'capand' => '⩄',
'capbrcup' => '⩉',
'capcap' => '⩋',
'capcup' => '⩇',
'capdot' => '⩀',
'caps' => '∩︀',
'ccaps' => '⩍',
'ccups' => '⩌',
'ccupssm' => '⩐',
'coprod' => '∐',
'Cup' => '⋓',
'cupbrcap' => '⩈',
'cupcap' => '⩆',
'cupcup' => '⩊',
'cupdot' => '⊍',
'cupor' => '⩅',
'cups' => '∪︀',
'cuvee' => '⋎',
'cuwed' => '⋏',
'Dagger' => '‡',
'dagger' => '†',
'diam' => '⋄',
'divonx' => '⋇',
'eplus' => '⩱',
'hercon' => '⊹',
'intcal' => '⊺',
'iprod' => '⨼',
'loplus' => '⨭',
'lotimes' => '⨴',
'lthree' => '⋋',
'ltimes' => '⋉',
'midast' => '*',
'minusb' => '⊟',
'minusd' => '∸',
'minusdu' => '⨪',
'ncap' => '⩃',
'ncup' => '⩂',
'oast' => '⊛',
'ocir' => '⊚',
'odash' => '⊝',
'odiv' => '⨸',
'odot' => '⊙',
'odsold' => '⦼',
'ofcir' => '⦿',
'ogt' => '⧁',
'ohbar' => '⦵',
'olcir' => '⦾',
'olt' => '⧀',
'omid' => '⦶',
'ominus' => '⊖',
'opar' => '⦷',
'operp' => '⦹',
'oplus' => '⊕',
'osol' => '⊘',
'Otimes' => '⨷',
'otimes' => '⊗',
'otimesas' => '⨶',
'ovbar' => '⌽',
'plusacir' => '⨣',
'plusb' => '⊞',
'pluscir' => '⨢',
'plusdo' => '∔',
'plusdu' => '⨥',
'pluse' => '⩲',
'plussim' => '⨦',
'plustwo' => '⨧',
'prod' => '∏',
'race' => '⧚',
'roplus' => '⨮',
'rotimes' => '⨵',
'rthree' => '⋌',
'rtimes' => '⋊',
'sdot' => '⋅',
'sdotb' => '⊡',
'setmn' => '∖',
'simplus' => '⨤',
'smashp' => '⨳',
'solb' => '⧄',
'sqcap' => '⊓',
'sqcaps' => '⊓︀',
'sqcup' => '⊔',
'sqcups' => '⊔︀',
'ssetmn' => '∖',
'sstarf' => '⋆',
'subdot' => '⪽',
'sum' => '∑',
'supdot' => '⪾',
'timesb' => '⊠',
'timesbar' => '⨱',
'timesd' => '⨰',
'tridot' => '◬',
'triminus' => '⨺',
'triplus' => '⨹',
'trisb' => '⧍',
'tritime' => '⨻',
'uplus' => '⊎',
'veebar' => '⊻',
'wedbar' => '⩟',
'wreath' => '≀',
'xcap' => '⋂',
'xcirc' => '◯',
'xcup' => '⋃',
'xdtri' => '▽',
'xodot' => '⨀',
'xoplus' => '⨁',
'xotime' => '⨂',
'xsqcup' => '⨆',
'xuplus' => '⨄',
'xutri' => '△',
'xvee' => '⋁',
'xwedge' => '⋀',
'dlcorn' => '⌞',
'drcorn' => '⌟',
'gtlPar' => '⦕',
'langd' => '⦑',
'lbrke' => '⦋',
'lbrksld' => '⦏',
'lbrkslu' => '⦍',
'lceil' => '⌈',
'lfloor' => '⌊',
'lmoust' => '⎰',
'lparlt' => '⦓',
'ltrPar' => '⦖',
'rangd' => '⦒',
'rbrke' => '⦌',
'rbrksld' => '⦎',
'rbrkslu' => '⦐',
'rceil' => '⌉',
'rfloor' => '⌋',
'rmoust' => '⎱',
'rpargt' => '⦔',
'ulcorn' => '⌜',
'urcorn' => '⌝',
'gnap' => '⪊',
'gnE' => '≩',
'gne' => '⪈',
'gnsim' => '⋧',
'gvnE' => '≩︀',
'lnap' => '⪉',
'lnE' => '≨',
'lne' => '⪇',
'lnsim' => '⋦',
'lvnE' => '≨︀',
'nap' => '≉',
'napE' => '⩰̸',
'napid' => '≋̸',
'ncong' => '≇',
'ncongdot' => '⩭̸',
'nequiv' => '≢',
'ngE' => '≧̸',
'nge' => '≱',
'nges' => '⩾̸',
'nGg' => '⋙̸',
'ngsim' => '≵',
'nGt' => '≫⃒',
'ngt' => '≯',
'nGtv' => '≫̸',
'nlE' => '≦̸',
'nle' => '≰',
'nles' => '⩽̸',
'nLl' => '⋘̸',
'nlsim' => '≴',
'nLt' => '≪⃒',
'nlt' => '≮',
'nltri' => '⋪',
'nltrie' => '⋬',
'nLtv' => '≪̸',
'nmid' => '∤',
'npar' => '∦',
'npr' => '⊀',
'nprcue' => '⋠',
'npre' => '⪯̸',
'nrtri' => '⋫',
'nrtrie' => '⋭',
'nsc' => '⊁',
'nsccue' => '⋡',
'nsce' => '⪰̸',
'nsim' => '≁',
'nsime' => '≄',
'nsmid' => '∤',
'nspar' => '∦',
'nsqsube' => '⋢',
'nsqsupe' => '⋣',
'nsub' => '⊄',
'nsubE' => '⫅̸',
'nsube' => '⊈',
'nsup' => '⊅',
'nsupE' => '⫆̸',
'nsupe' => '⊉',
'ntgl' => '≹',
'ntlg' => '≸',
'nvap' => '≍⃒',
'nVDash' => '⊯',
'nVdash' => '⊮',
'nvDash' => '⊭',
'nvdash' => '⊬',
'nvge' => '≥⃒',
'nvgt' => '>⃒',
'nvle' => '≤⃒',
'nvltrie' => '⊴⃒',
'nvrtrie' => '⊵⃒',
'nvsim' => '∼⃒',
'parsim' => '⫳',
'prnap' => '⪹',
'prnE' => '⪵',
'prnsim' => '⋨',
'rnmid' => '⫮',
'scnap' => '⪺',
'scnE' => '⪶',
'scnsim' => '⋩',
'simne' => '≆',
'solbar' => '⌿',
'subnE' => '⫋',
'subne' => '⊊',
'supnE' => '⫌',
'supne' => '⊋',
'vnsub' => '⊂⃒',
'vnsup' => '⊃⃒',
'vsubnE' => '⫋︀',
'vsubne' => '⊊︀',
'vsupnE' => '⫌︀',
'vsupne' => '⊋︀',
'ang' => '∠',
'ange' => '⦤',
'angmsd' => '∡',
'angmsdaa' => '⦨',
'angmsdab' => '⦩',
'angmsdac' => '⦪',
'angmsdad' => '⦫',
'angmsdae' => '⦬',
'angmsdaf' => '⦭',
'angmsdag' => '⦮',
'angmsdah' => '⦯',
'angrtvb' => '⊾',
'angrtvbd' => '⦝',
'bbrk' => '⎵',
'bbrktbrk' => '⎶',
'bemptyv' => '⦰',
'beth' => 'ℶ',
'boxbox' => '⧉',
'bprime' => '‵',
'bsemi' => '⁏',
'cemptyv' => '⦲',
'cirE' => '⧃',
'cirscir' => '⧂',
'comp' => '∁',
'daleth' => 'ℸ',
'demptyv' => '⦱',
'ell' => 'ℓ',
'empty' => '∅',
'emptyv' => '∅',
'gimel' => 'ℷ',
'iiota' => '℩',
'image' => 'ℑ',
'imath' => 'ı',
'jmath' => 'j',
'laemptyv' => '⦴',
'lltri' => '◺',
'lrtri' => '⊿',
'mho' => '℧',
'nang' => '∠⃒',
'nexist' => '∄',
'oS' => 'Ⓢ',
'planck' => 'ℏ',
'plankv' => 'ℏ',
'raemptyv' => '⦳',
'range' => '⦥',
'real' => 'ℜ',
'tbrk' => '⎴',
'trpezium' => '�',
'ultri' => '◸',
'urtri' => '◹',
'vzigzag' => '⦚',
'weierp' => '℘',
'apE' => '⩰',
'ape' => '≊',
'apid' => '≋',
'asymp' => '≈',
'Barv' => '⫧',
'bcong' => '≌',
'bepsi' => '϶',
'bowtie' => '⋈',
'bsim' => '∽',
'bsime' => '⋍',
'bsolhsub' => '\⊂',
'bump' => '≎',
'bumpE' => '⪮',
'bumpe' => '≏',
'cire' => '≗',
'Colon' => '∷',
'Colone' => '⩴',
'colone' => '≔',
'congdot' => '⩭',
'csub' => '⫏',
'csube' => '⫑',
'csup' => '⫐',
'csupe' => '⫒',
'cuepr' => '⋞',
'cuesc' => '⋟',
'Dashv' => '⫤',
'dashv' => '⊣',
'easter' => '⩮',
'ecir' => '≖',
'ecolon' => '≕',
'eDDot' => '⩷',
'eDot' => '≑',
'efDot' => '≒',
'eg' => '⪚',
'egs' => '⪖',
'egsdot' => '⪘',
'el' => '⪙',
'els' => '⪕',
'elsdot' => '⪗',
'equest' => '≟',
'equivDD' => '⩸',
'erDot' => '≓',
'esdot' => '≐',
'Esim' => '⩳',
'esim' => '≂',
'fork' => '⋔',
'forkv' => '⫙',
'frown' => '⌢',
'gap' => '⪆',
'gE' => '≧',
'gEl' => '⪌',
'gel' => '⋛',
'ges' => '⩾',
'gescc' => '⪩',
'gesdot' => '⪀',
'gesdoto' => '⪂',
'gesdotol' => '⪄',
'gesl' => '⋛︀',
'gesles' => '⪔',
'Gg' => '⋙',
'gl' => '≷',
'gla' => '⪥',
'glE' => '⪒',
'glj' => '⪤',
'gsim' => '≳',
'gsime' => '⪎',
'gsiml' => '⪐',
'Gt' => '≫',
'gtcc' => '⪧',
'gtcir' => '⩺',
'gtdot' => '⋗',
'gtquest' => '⩼',
'gtrarr' => '⥸',
'homtht' => '∻',
'lap' => '⪅',
'lat' => '⪫',
'late' => '⪭',
'lates' => '⪭︀',
'lE' => '≦',
'lEg' => '⪋',
'leg' => '⋚',
'les' => '⩽',
'lescc' => '⪨',
'lesdot' => '⩿',
'lesdoto' => '⪁',
'lesdotor' => '⪃',
'lesg' => '⋚︀',
'lesges' => '⪓',
'lg' => '≶',
'lgE' => '⪑',
'Ll' => '⋘',
'lsim' => '≲',
'lsime' => '⪍',
'lsimg' => '⪏',
'Lt' => '≪',
'ltcc' => '⪦',
'ltcir' => '⩹',
'ltdot' => '⋖',
'ltlarr' => '⥶',
'ltquest' => '⩻',
'ltrie' => '⊴',
'mcomma' => '⨩',
'mDDot' => '∺',
'mid' => '∣',
'mlcp' => '⫛',
'models' => '⊧',
'mstpos' => '∾',
'Pr' => '⪻',
'pr' => '≺',
'prap' => '⪷',
'prcue' => '≼',
'prE' => '⪳',
'pre' => '⪯',
'prsim' => '≾',
'prurel' => '⊰',
'ratio' => '∶',
'rtrie' => '⊵',
'rtriltri' => '⧎',
'Sc' => '⪼',
'sc' => '≻',
'scap' => '⪸',
'sccue' => '≽',
'scE' => '⪴',
'sce' => '⪰',
'scsim' => '≿',
'sdote' => '⩦',
'sfrown' => '⌢',
'simg' => '⪞',
'simgE' => '⪠',
'siml' => '⪝',
'simlE' => '⪟',
'smid' => '∣',
'smile' => '⌣',
'smt' => '⪪',
'smte' => '⪬',
'smtes' => '⪬︀',
'spar' => '∥',
'sqsub' => '⊏',
'sqsube' => '⊑',
'sqsup' => '⊐',
'sqsupe' => '⊒',
'ssmile' => '⌣',
'Sub' => '⋐',
'subE' => '⫅',
'subedot' => '⫃',
'submult' => '⫁',
'subplus' => '⪿',
'subrarr' => '⥹',
'subsim' => '⫇',
'subsub' => '⫕',
'subsup' => '⫓',
'Sup' => '⋑',
'supdsub' => '⫘',
'supE' => '⫆',
'supedot' => '⫄',
'suphsol' => '⊃/',
'suphsub' => '⫗',
'suplarr' => '⥻',
'supmult' => '⫂',
'supplus' => '⫀',
'supsim' => '⫈',
'supsub' => '⫔',
'supsup' => '⫖',
'thkap' => '≈',
'thksim' => '∼',
'topfork' => '⫚',
'trie' => '≜',
'twixt' => '≬',
'Vbar' => '⫫',
'vBar' => '⫨',
'vBarv' => '⫩',
'VDash' => '⊫',
'Vdash' => '⊩',
'vDash' => '⊨',
'vdash' => '⊢',
'Vdashl' => '⫦',
'vltri' => '⊲',
'vprop' => '∝',
'vrtri' => '⊳',
'Vvdash' => '⊪',
'alpha' => 'α',
'beta' => 'β',
'chi' => 'χ',
'Delta' => 'Δ',
'delta' => 'δ',
'epsi' => 'ϵ',
'epsiv' => 'ε',
'eta' => 'η',
'Gamma' => 'Γ',
'gamma' => 'γ',
'Gammad' => 'Ϝ',
'gammad' => 'ϝ',
'iota' => 'ι',
'kappa' => 'κ',
'kappav' => 'ϰ',
'Lambda' => 'Λ',
'lambda' => 'λ',
'mu' => 'μ',
'nu' => 'ν',
'Omega' => 'Ω',
'omega' => 'ω',
'Phi' => 'Φ',
'phi' => 'ϕ',
'phiv' => 'φ',
'Pi' => 'Π',
'pi' => 'π',
'piv' => 'ϖ',
'Psi' => 'Ψ',
'psi' => 'ψ',
'rho' => 'ρ',
'rhov' => 'ϱ',
'Sigma' => 'Σ',
'sigma' => 'σ',
'sigmav' => 'ς',
'tau' => 'τ',
'Theta' => 'Θ',
'theta' => 'θ',
'thetav' => 'ϑ',
'Upsi' => 'ϒ',
'upsi' => 'υ',
'Xi' => 'Ξ',
'xi' => 'ξ',
'zeta' => 'ζ',
'Afr' => '𝔄',
'afr' => '𝔞',
'Bfr' => '𝔅',
'bfr' => '𝔟',
'Cfr' => 'ℭ',
'cfr' => '𝔠',
'Dfr' => '𝔇',
'dfr' => '𝔡',
'Efr' => '𝔈',
'efr' => '𝔢',
'Ffr' => '𝔉',
'ffr' => '𝔣',
'Gfr' => '𝔊',
'gfr' => '𝔤',
'Hfr' => 'ℌ',
'hfr' => '𝔥',
'Ifr' => 'ℑ',
'ifr' => '𝔦',
'Jfr' => '𝔍',
'jfr' => '𝔧',
'Kfr' => '𝔎',
'kfr' => '𝔨',
'Lfr' => '𝔏',
'lfr' => '𝔩',
'Mfr' => '𝔐',
'mfr' => '𝔪',
'Nfr' => '𝔑',
'nfr' => '𝔫',
'Ofr' => '𝔒',
'ofr' => '𝔬',
'Pfr' => '𝔓',
'pfr' => '𝔭',
'Qfr' => '𝔔',
'qfr' => '𝔮',
'Rfr' => 'ℜ',
'rfr' => '𝔯',
'Sfr' => '𝔖',
'sfr' => '𝔰',
'Tfr' => '𝔗',
'tfr' => '𝔱',
'Ufr' => '𝔘',
'ufr' => '𝔲',
'Vfr' => '𝔙',
'vfr' => '𝔳',
'Wfr' => '𝔚',
'wfr' => '𝔴',
'Xfr' => '𝔛',
'xfr' => '𝔵',
'Yfr' => '𝔜',
'yfr' => '𝔶',
'Zfr' => 'ℨ',
'zfr' => '𝔷',
'Aopf' => '𝔸',
'Bopf' => '𝔹',
'Copf' => 'ℂ',
'Dopf' => '𝔻',
'Eopf' => '𝔼',
'Fopf' => '𝔽',
'Gopf' => '𝔾',
'Hopf' => 'ℍ',
'Iopf' => '𝕀',
'Jopf' => '𝕁',
'Kopf' => '𝕂',
'Lopf' => '𝕃',
'Mopf' => '𝕄',
'Nopf' => 'ℕ',
'Oopf' => '𝕆',
'Popf' => 'ℙ',
'Qopf' => 'ℚ',
'Ropf' => 'ℝ',
'Sopf' => '𝕊',
'Topf' => '𝕋',
'Uopf' => '𝕌',
'Vopf' => '𝕍',
'Wopf' => '𝕎',
'Xopf' => '𝕏',
'Yopf' => '𝕐',
'Zopf' => 'ℤ',
'Ascr' => '𝒜',
'ascr' => '𝒶',
'Bscr' => 'ℬ',
'bscr' => '𝒷',
'Cscr' => '𝒞',
'cscr' => '𝒸',
'Dscr' => '𝒟',
'dscr' => '𝒹',
'Escr' => 'ℰ',
'escr' => 'ℯ',
'Fscr' => 'ℱ',
'fscr' => '𝒻',
'Gscr' => '𝒢',
'gscr' => 'ℊ',
'Hscr' => 'ℋ',
'hscr' => '𝒽',
'Iscr' => 'ℐ',
'iscr' => '𝒾',
'Jscr' => '𝒥',
'jscr' => '𝒿',
'Kscr' => '𝒦',
'kscr' => '𝓀',
'Lscr' => 'ℒ',
'lscr' => '𝓁',
'Mscr' => 'ℳ',
'mscr' => '𝓂',
'Nscr' => '𝒩',
'nscr' => '𝓃',
'Oscr' => '𝒪',
'oscr' => 'ℴ',
'Pscr' => '𝒫',
'pscr' => '𝓅',
'Qscr' => '𝒬',
'qscr' => '𝓆',
'Rscr' => 'ℛ',
'rscr' => '𝓇',
'Sscr' => '𝒮',
'sscr' => '𝓈',
'Tscr' => '𝒯',
'tscr' => '𝓉',
'Uscr' => '𝒰',
'uscr' => '𝓊',
'Vscr' => '𝒱',
'vscr' => '𝓋',
'Wscr' => '𝒲',
'wscr' => '𝓌',
'Xscr' => '𝒳',
'xscr' => '𝓍',
'Yscr' => '𝒴',
'yscr' => '𝓎',
'Zscr' => '𝒵',
'zscr' => '𝓏',
'acd' => '∿',
'aleph' => 'ℵ',
'And' => '⩓',
'and' => '∧',
'andand' => '⩕',
'andd' => '⩜',
'andslope' => '⩘',
'andv' => '⩚',
'angrt' => '∟',
'angsph' => '∢',
'angst' => 'Å',
'ap' => '≈',
'apacir' => '⩯',
'awconint' => '∳',
'awint' => '⨑',
'becaus' => '∵',
'bernou' => 'ℬ',
'bne' => '=⃥',
'bnequiv' => '≡⃥',
'bNot' => '⫭',
'bnot' => '⌐',
'bottom' => '⊥',
'cap' => '∩',
'Cconint' => '∰',
'cirfnint' => '⨐',
'compfn' => '∘',
'cong' => '≅',
'Conint' => '∯',
'conint' => '∮',
'ctdot' => '⋯',
'cup' => '∪',
'cwconint' => '∲',
'cwint' => '∱',
'cylcty' => '⌭',
'disin' => '⋲',
'Dot' => '¨',
'DotDot' => '⃜',
'dsol' => '⧶',
'dtdot' => '⋱',
'dwangle' => '⦦',
'elinters' => '�',
'epar' => '⋕',
'eparsl' => '⧣',
'equiv' => '≡',
'eqvparsl' => '⧥',
'exist' => '∃',
'fltns' => '▱',
'fnof' => 'ƒ',
'forall' => '∀',
'fpartint' => '⨍',
'ge' => '≥',
'hamilt' => 'ℋ',
'iff' => '⇔',
'iinfin' => '⧜',
'imped' => 'Ƶ',
'infin' => '∞',
'infintie' => '⧝',
'Int' => '∬',
'int' => '∫',
'intlarhk' => '⨗',
'isin' => '∈',
'isindot' => '⋵',
'isinE' => '⋹',
'isins' => '⋴',
'isinsv' => '⋳',
'isinv' => '∈',
'lagran' => 'ℒ',
'Lang' => '《',
'lang' => '〈',
'lArr' => '⇐',
'lbbrk' => '〔',
'le' => '≤',
'loang' => '〘',
'lobrk' => '〚',
'lopar' => '⦅',
'lowast' => '∗',
'minus' => '−',
'mnplus' => '∓',
'nabla' => '∇',
'ne' => '≠',
'nedot' => '≐̸',
'nhpar' => '⫲',
'ni' => '∋',
'nis' => '⋼',
'nisd' => '⋺',
'niv' => '∋',
'Not' => '⫬',
'notin' => '∉',
'notindot' => '⋵̸',
'notinE' => '⋹̸',
'notinva' => '∉',
'notinvb' => '⋷',
'notinvc' => '⋶',
'notni' => '∌',
'notniva' => '∌',
'notnivb' => '⋾',
'notnivc' => '⋽',
'nparsl' => '⫽⃥',
'npart' => '∂̸',
'npolint' => '⨔',
'nvinfin' => '⧞',
'olcross' => '⦻',
'Or' => '⩔',
'or' => '∨',
'ord' => '⩝',
'order' => 'ℴ',
'oror' => '⩖',
'orslope' => '⩗',
'orv' => '⩛',
'par' => '∥',
'parsl' => '⫽',
'part' => '∂',
'permil' => '‰',
'perp' => '⊥',
'pertenk' => '‱',
'phmmat' => 'ℳ',
'pointint' => '⨕',
'Prime' => '″',
'prime' => '′',
'profalar' => '⌮',
'profline' => '⌒',
'profsurf' => '⌓',
'prop' => '∝',
'qint' => '⨌',
'qprime' => '⁗',
'quatint' => '⨖',
'radic' => '√',
'Rang' => '》',
'rang' => '〉',
'rArr' => '⇒',
'rbbrk' => '〕',
'roang' => '〙',
'robrk' => '〛',
'ropar' => '⦆',
'rppolint' => '⨒',
'scpolint' => '⨓',
'sim' => '∼',
'simdot' => '⩪',
'sime' => '≃',
'smeparsl' => '⧤',
'square' => '□',
'squarf' => '▪',
'strns' => '¯',
'sub' => '⊂',
'sube' => '⊆',
'sup' => '⊃',
'supe' => '⊇',
'tdot' => '⃛',
'there4' => '∴',
'tint' => '∭',
'top' => '⊤',
'topbot' => '⌶',
'topcir' => '⫱',
'tprime' => '‴',
'utdot' => '⋰',
'uwangle' => '⦧',
'vangrt' => '⦜',
'veeeq' => '≚',
'Verbar' => '‖',
'wedgeq' => '≙',
'xnis' => '⋻',
'boxDL' => '╗',
'boxDl' => '╖',
'boxdL' => '╕',
'boxdl' => '┐',
'boxDR' => '╔',
'boxDr' => '╓',
'boxdR' => '╒',
'boxdr' => '┌',
'boxH' => '═',
'boxh' => '─',
'boxHD' => '╦',
'boxHd' => '╤',
'boxhD' => '╥',
'boxhd' => '┬',
'boxHU' => '╩',
'boxHu' => '╧',
'boxhU' => '╨',
'boxhu' => '┴',
'boxUL' => '╝',
'boxUl' => '╜',
'boxuL' => '╛',
'boxul' => '┘',
'boxUR' => '╚',
'boxUr' => '╙',
'boxuR' => '╘',
'boxur' => '└',
'boxV' => '║',
'boxv' => '│',
'boxVH' => '╬',
'boxVh' => '╫',
'boxvH' => '╪',
'boxvh' => '┼',
'boxVL' => '╣',
'boxVl' => '╢',
'boxvL' => '╡',
'boxvl' => '┤',
'boxVR' => '╠',
'boxVr' => '╟',
'boxvR' => '╞',
'boxvr' => '├',
'Acy' => 'А',
'acy' => 'а',
'Bcy' => 'Б',
'bcy' => 'б',
'CHcy' => 'Ч',
'chcy' => 'ч',
'Dcy' => 'Д',
'dcy' => 'д',
'Ecy' => 'Э',
'ecy' => 'э',
'Fcy' => 'Ф',
'fcy' => 'ф',
'Gcy' => 'Г',
'gcy' => 'г',
'HARDcy' => 'Ъ',
'hardcy' => 'ъ',
'Icy' => 'И',
'icy' => 'и',
'IEcy' => 'Е',
'iecy' => 'е',
'IOcy' => 'Ё',
'iocy' => 'ё',
'Jcy' => 'Й',
'jcy' => 'й',
'Kcy' => 'К',
'kcy' => 'к',
'KHcy' => 'Х',
'khcy' => 'х',
'Lcy' => 'Л',
'lcy' => 'л',
'Mcy' => 'М',
'mcy' => 'м',
'Ncy' => 'Н',
'ncy' => 'н',
'numero' => '№',
'Ocy' => 'О',
'ocy' => 'о',
'Pcy' => 'П',
'pcy' => 'п',
'Rcy' => 'Р',
'rcy' => 'р',
'Scy' => 'С',
'scy' => 'с',
'SHCHcy' => 'Щ',
'shchcy' => 'щ',
'SHcy' => 'Ш',
'shcy' => 'ш',
'SOFTcy' => 'Ь',
'softcy' => 'ь',
'Tcy' => 'Т',
'tcy' => 'т',
'TScy' => 'Ц',
'tscy' => 'ц',
'Ucy' => 'У',
'ucy' => 'у',
'Vcy' => 'В',
'vcy' => 'в',
'YAcy' => 'Я',
'yacy' => 'я',
'Ycy' => 'Ы',
'ycy' => 'ы',
'YUcy' => 'Ю',
'yucy' => 'ю',
'Zcy' => 'З',
'zcy' => 'з',
'ZHcy' => 'Ж',
'zhcy' => 'ж',
'DJcy' => 'Ђ',
'djcy' => 'ђ',
'DScy' => 'Ѕ',
'dscy' => 'ѕ',
'DZcy' => 'Џ',
'dzcy' => 'џ',
'GJcy' => 'Ѓ',
'gjcy' => 'ѓ',
'Iukcy' => 'І',
'iukcy' => 'і',
'Jsercy' => 'Ј',
'jsercy' => 'ј',
'Jukcy' => 'Є',
'jukcy' => 'є',
'KJcy' => 'Ќ',
'kjcy' => 'ќ',
'LJcy' => 'Љ',
'ljcy' => 'љ',
'NJcy' => 'Њ',
'njcy' => 'њ',
'TSHcy' => 'Ћ',
'tshcy' => 'ћ',
'Ubrcy' => 'Ў',
'ubrcy' => 'ў',
'YIcy' => 'Ї',
'yicy' => 'ї',
'acute' => '´',
'breve' => '˘',
'caron' => 'ˇ',
'cedil' => '¸',
'circ' => 'ˆ',
'dblac' => '˝',
'die' => '¨',
'dot' => '˙',
'grave' => '`',
'macr' => '¯',
'ogon' => '˛',
'ring' => '˚',
'tilde' => '˜',
'uml' => '¨',
'Aacute' => 'Á',
'aacute' => 'á',
'Acirc' => 'Â',
'acirc' => 'â',
'AElig' => 'Æ',
'aelig' => 'æ',
'Agrave' => 'À',
'agrave' => 'à',
'Aring' => 'Å',
'aring' => 'å',
'Atilde' => 'Ã',
'atilde' => 'ã',
'Auml' => 'Ä',
'auml' => 'ä',
'Ccedil' => 'Ç',
'ccedil' => 'ç',
'Eacute' => 'É',
'eacute' => 'é',
'Ecirc' => 'Ê',
'ecirc' => 'ê',
'Egrave' => 'È',
'egrave' => 'è',
'ETH' => 'Ð',
'eth' => 'ð',
'Euml' => 'Ë',
'euml' => 'ë',
'Iacute' => 'Í',
'iacute' => 'í',
'Icirc' => 'Î',
'icirc' => 'î',
'Igrave' => 'Ì',
'igrave' => 'ì',
'Iuml' => 'Ï',
'iuml' => 'ï',
'Ntilde' => 'Ñ',
'ntilde' => 'ñ',
'Oacute' => 'Ó',
'oacute' => 'ó',
'Ocirc' => 'Ô',
'ocirc' => 'ô',
'Ograve' => 'Ò',
'ograve' => 'ò',
'Oslash' => 'Ø',
'oslash' => 'ø',
'Otilde' => 'Õ',
'otilde' => 'õ',
'Ouml' => 'Ö',
'ouml' => 'ö',
'szlig' => 'ß',
'THORN' => 'Þ',
'thorn' => 'þ',
'Uacute' => 'Ú',
'uacute' => 'ú',
'Ucirc' => 'Û',
'ucirc' => 'û',
'Ugrave' => 'Ù',
'ugrave' => 'ù',
'Uuml' => 'Ü',
'uuml' => 'ü',
'Yacute' => 'Ý',
'yacute' => 'ý',
'yuml' => 'ÿ',
'Abreve' => 'Ă',
'abreve' => 'ă',
'Amacr' => 'Ā',
'amacr' => 'ā',
'Aogon' => 'Ą',
'aogon' => 'ą',
'Cacute' => 'Ć',
'cacute' => 'ć',
'Ccaron' => 'Č',
'ccaron' => 'č',
'Ccirc' => 'Ĉ',
'ccirc' => 'ĉ',
'Cdot' => 'Ċ',
'cdot' => 'ċ',
'Dcaron' => 'Ď',
'dcaron' => 'ď',
'Dstrok' => 'Đ',
'dstrok' => 'đ',
'Ecaron' => 'Ě',
'ecaron' => 'ě',
'Edot' => 'Ė',
'edot' => 'ė',
'Emacr' => 'Ē',
'emacr' => 'ē',
'ENG' => 'Ŋ',
'eng' => 'ŋ',
'Eogon' => 'Ę',
'eogon' => 'ę',
'gacute' => 'ǵ',
'Gbreve' => 'Ğ',
'gbreve' => 'ğ',
'Gcedil' => 'Ģ',
'Gcirc' => 'Ĝ',
'gcirc' => 'ĝ',
'Gdot' => 'Ġ',
'gdot' => 'ġ',
'Hcirc' => 'Ĥ',
'hcirc' => 'ĥ',
'Hstrok' => 'Ħ',
'hstrok' => 'ħ',
'Idot' => 'İ',
'IJlig' => 'IJ',
'ijlig' => 'ij',
'Imacr' => 'Ī',
'imacr' => 'ī',
'inodot' => 'ı',
'Iogon' => 'Į',
'iogon' => 'į',
'Itilde' => 'Ĩ',
'itilde' => 'ĩ',
'Jcirc' => 'Ĵ',
'jcirc' => 'ĵ',
'Kcedil' => 'Ķ',
'kcedil' => 'ķ',
'kgreen' => 'ĸ',
'Lacute' => 'Ĺ',
'lacute' => 'ĺ',
'Lcaron' => 'Ľ',
'lcaron' => 'ľ',
'Lcedil' => 'Ļ',
'lcedil' => 'ļ',
'Lmidot' => 'Ŀ',
'lmidot' => 'ŀ',
'Lstrok' => 'Ł',
'lstrok' => 'ł',
'Nacute' => 'Ń',
'nacute' => 'ń',
'napos' => 'ʼn',
'Ncaron' => 'Ň',
'ncaron' => 'ň',
'Ncedil' => 'Ņ',
'ncedil' => 'ņ',
'Odblac' => 'Ő',
'odblac' => 'ő',
'OElig' => 'Œ',
'oelig' => 'œ',
'Omacr' => 'Ō',
'omacr' => 'ō',
'Racute' => 'Ŕ',
'racute' => 'ŕ',
'Rcaron' => 'Ř',
'rcaron' => 'ř',
'Rcedil' => 'Ŗ',
'rcedil' => 'ŗ',
'Sacute' => 'Ś',
'sacute' => 'ś',
'Scaron' => 'Š',
'scaron' => 'š',
'Scedil' => 'Ş',
'scedil' => 'ş',
'Scirc' => 'Ŝ',
'scirc' => 'ŝ',
'Tcaron' => 'Ť',
'tcaron' => 'ť',
'Tcedil' => 'Ţ',
'tcedil' => 'ţ',
'Tstrok' => 'Ŧ',
'tstrok' => 'ŧ',
'Ubreve' => 'Ŭ',
'ubreve' => 'ŭ',
'Udblac' => 'Ű',
'udblac' => 'ű',
'Umacr' => 'Ū',
'umacr' => 'ū',
'Uogon' => 'Ų',
'uogon' => 'ų',
'Uring' => 'Ů',
'uring' => 'ů',
'Utilde' => 'Ũ',
'utilde' => 'ũ',
'Wcirc' => 'Ŵ',
'wcirc' => 'ŵ',
'Ycirc' => 'Ŷ',
'ycirc' => 'ŷ',
'Yuml' => 'Ÿ',
'Zacute' => 'Ź',
'zacute' => 'ź',
'Zcaron' => 'Ž',
'zcaron' => 'ž',
'Zdot' => 'Ż',
'zdot' => 'ż',
'apos' => ''',
'ast' => '*',
'brvbar' => '¦',
'bsol' => '\',
'cent' => '¢',
'colon' => ':',
'comma' => ',',
'commat' => '@',
'copy' => '©',
'curren' => '¤',
'darr' => '↓',
'deg' => '°',
'divide' => '÷',
'dollar' => '$',
'equals' => '=',
'excl' => '!',
'frac12' => '½',
'frac14' => '¼',
'frac18' => '⅛',
'frac34' => '¾',
'frac38' => '⅜',
'frac58' => '⅝',
'frac78' => '⅞',
'gt' => '>',
'half' => '½',
'horbar' => '―',
'hyphen' => '‐',
'iexcl' => '¡',
'iquest' => '¿',
'laquo' => '«',
'larr' => '←',
'lcub' => '{',
'ldquo' => '“',
'lowbar' => '_',
'lpar' => '(',
'lsqb' => '[',
'lsquo' => '‘',
'micro' => 'µ',
'middot' => '·',
'nbsp' => ' ',
'not' => '¬',
'num' => '#',
'ohm' => 'Ω',
'ordf' => 'ª',
'ordm' => 'º',
'para' => '¶',
'percnt' => '%',
'period' => '.',
'plus' => '+',
'plusmn' => '±',
'pound' => '£',
'quest' => '?',
'quot' => '"',
'raquo' => '»',
'rarr' => '→',
'rcub' => '}',
'rdquo' => '”',
'reg' => '®',
'rpar' => ')',
'rsqb' => ']',
'rsquo' => '’',
'sect' => '§',
'semi' => '&#x0003B;',
'shy' => '­',
'sol' => '/',
'sung' => '♪',
'sup1' => '¹',
'sup2' => '²',
'sup3' => '³',
'times' => '×',
'trade' => '™',
'uarr' => '↑',
'verbar' => '|',
'yen' => '¥',
'blank' => '␣',
'blk12' => '▒',
'blk14' => '░',
'blk34' => '▓',
'block' => '█',
'bull' => '•',
'caret' => '⁁',
'check' => '✓',
'cir' => '○',
'clubs' => '♣',
'copysr' => '℗',
'cross' => '✗',
'Dagger' => '‡',
'dagger' => '†',
'dash' => '‐',
'diams' => '♦',
'dlcrop' => '⌍',
'drcrop' => '⌌',
'dtri' => '▿',
'dtrif' => '▾',
'emsp' => ' ',
'emsp13' => ' ',
'emsp14' => ' ',
'ensp' => ' ',
'female' => '♀',
'ffilig' => 'ffi',
'fflig' => 'ff',
'ffllig' => 'ffl',
'filig' => 'fi',
'flat' => '♭',
'fllig' => 'fl',
'frac13' => '⅓',
'frac15' => '⅕',
'frac16' => '⅙',
'frac23' => '⅔',
'frac25' => '⅖',
'frac35' => '⅗',
'frac45' => '⅘',
'frac56' => '⅚',
'hairsp' => ' ',
'hearts' => '♥',
'hellip' => '…',
'hybull' => '⁃',
'incare' => '℅',
'ldquor' => '„',
'lhblk' => '▄',
'loz' => '◊',
'lozf' => '⧫',
'lsquor' => '‚',
'ltri' => '◃',
'ltrif' => '◂',
'male' => '♂',
'malt' => '✠',
'marker' => '▮',
'mdash' => '—',
'mldr' => '…',
'natur' => '♮',
'ndash' => '–',
'nldr' => '‥',
'numsp' => ' ',
'phone' => '☎',
'puncsp' => ' ',
'rdquor' => '”',
'rect' => '▭',
'rsquor' => '’',
'rtri' => '▹',
'rtrif' => '▸',
'rx' => '℞',
'sext' => '✶',
'sharp' => '♯',
'spades' => '♠',
'squ' => '□',
'squf' => '▪',
'star' => '☆',
'starf' => '★',
'target' => '⌖',
'telrec' => '⌕',
'thinsp' => ' ',
'uhblk' => '▀',
'ulcrop' => '⌏',
'urcrop' => '⌎',
'utri' => '▵',
'utrif' => '▴',
'vellip' => '⋮',
'af' => '⁡',
'aopf' => '𝕒',
'asympeq' => '≍',
'bopf' => '𝕓',
'copf' => '𝕔',
'Cross' => '⨯',
'DD' => 'ⅅ',
'dd' => 'ⅆ',
'dopf' => '𝕕',
'DownArrowBar' => '⤓',
'DownBreve' => '̑',
'DownLeftRightVector' => '⥐',
'DownLeftTeeVector' => '⥞',
'DownLeftVectorBar' => '⥖',
'DownRightTeeVector' => '⥟',
'DownRightVectorBar' => '⥗',
'ee' => 'ⅇ',
'EmptySmallSquare' => '◻',
'EmptyVerySmallSquare' => '▫',
'eopf' => '𝕖',
'Equal' => '⩵',
'FilledSmallSquare' => '◼',
'FilledVerySmallSquare' => '▪',
'fopf' => '𝕗',
'gopf' => '𝕘',
'GreaterGreater' => '⪢',
'Hat' => '^',
'hopf' => '𝕙',
'HorizontalLine' => '─',
'ic' => '⁣',
'ii' => 'ⅈ',
'iopf' => '𝕚',
'it' => '⁢',
'jopf' => '𝕛',
'kopf' => '𝕜',
'larrb' => '⇤',
'LeftDownTeeVector' => '⥡',
'LeftDownVectorBar' => '⥙',
'LeftRightVector' => '⥎',
'LeftTeeVector' => '⥚',
'LeftTriangleBar' => '⧏',
'LeftUpDownVector' => '⥑',
'LeftUpTeeVector' => '⥠',
'LeftUpVectorBar' => '⥘',
'LeftVectorBar' => '⥒',
'LessLess' => '⪡',
'lopf' => '𝕝',
'mapstodown' => '↧',
'mapstoleft' => '↤',
'mapstoup' => '↥',
'MediumSpace' => ' ',
'mopf' => '𝕞',
'nbump' => '≎̸',
'nbumpe' => '≏̸',
'nesim' => '≂̸',
'NewLine' => '
',
'NoBreak' => '⁠',
'nopf' => '𝕟',
'NotCupCap' => '≭',
'NotHumpEqual' => '≏̸',
'NotLeftTriangleBar' => '⧏̸',
'NotNestedGreaterGreater' => '⪢̸',
'NotNestedLessLess' => '⪡̸',
'NotRightTriangleBar' => '⧐̸',
'NotSquareSubset' => '⊏̸',
'NotSquareSuperset' => '⊐̸',
'NotSucceedsTilde' => '≿̸',
'oopf' => '𝕠',
'OverBar' => '¯',
'OverBrace' => '︷',
'OverBracket' => '⎴',
'OverParenthesis' => '︵',
'planckh' => 'ℎ',
'popf' => '𝕡',
'Product' => '∏',
'qopf' => '𝕢',
'rarrb' => '⇥',
'RightDownTeeVector' => '⥝',
'RightDownVectorBar' => '⥕',
'RightTeeVector' => '⥛',
'RightTriangleBar' => '⧐',
'RightUpDownVector' => '⥏',
'RightUpTeeVector' => '⥜',
'RightUpVectorBar' => '⥔',
'RightVectorBar' => '⥓',
'ropf' => '𝕣',
'RoundImplies' => '⥰',
'RuleDelayed' => '⧴',
'sopf' => '𝕤',
'Tab' => '	',
'ThickSpace' => '   ',
'topf' => '𝕥',
'UnderBar' => '̲',
'UnderBrace' => '︸',
'UnderBracket' => '⎵',
'UnderParenthesis' => '︶',
'uopf' => '𝕦',
'UpArrowBar' => '⤒',
'Upsilon' => 'Υ',
'VerticalLine' => '|',
'VerticalSeparator' => '❘',
'vopf' => '𝕧',
'wopf' => '𝕨',
'xopf' => '𝕩',
'yopf' => '𝕪',
'ZeroWidthSpace' => '​',
'zopf' => '𝕫',
'angle' => '∠',
'ApplyFunction' => '⁡',
'approx' => '≈',
'approxeq' => '≊',
'Assign' => '≔',
'backcong' => '≌',
'backepsilon' => '϶',
'backprime' => '‵',
'backsim' => '∽',
'backsimeq' => '⋍',
'Backslash' => '∖',
'barwedge' => '⌅',
'Because' => '∵',
'because' => '∵',
'Bernoullis' => 'ℬ',
'between' => '≬',
'bigcap' => '⋂',
'bigcirc' => '◯',
'bigcup' => '⋃',
'bigodot' => '⨀',
'bigoplus' => '⨁',
'bigotimes' => '⨂',
'bigsqcup' => '⨆',
'bigstar' => '★',
'bigtriangledown' => '▽',
'bigtriangleup' => '△',
'biguplus' => '⨄',
'bigvee' => '⋁',
'bigwedge' => '⋀',
'bkarow' => '⤍',
'blacklozenge' => '⧫',
'blacksquare' => '▪',
'blacktriangle' => '▴',
'blacktriangledown' => '▾',
'blacktriangleleft' => '◂',
'blacktriangleright' => '▸',
'bot' => '⊥',
'boxminus' => '⊟',
'boxplus' => '⊞',
'boxtimes' => '⊠',
'Breve' => '˘',
'bullet' => '•',
'Bumpeq' => '≎',
'bumpeq' => '≏',
'CapitalDifferentialD' => 'ⅅ',
'Cayleys' => 'ℭ',
'Cedilla' => '¸',
'CenterDot' => '·',
'centerdot' => '·',
'checkmark' => '✓',
'circeq' => '≗',
'circlearrowleft' => '↺',
'circlearrowright' => '↻',
'circledast' => '⊛',
'circledcirc' => '⊚',
'circleddash' => '⊝',
'CircleDot' => '⊙',
'circledR' => '®',
'circledS' => 'Ⓢ',
'CircleMinus' => '⊖',
'CirclePlus' => '⊕',
'CircleTimes' => '⊗',
'ClockwiseContourIntegral' => '∲',
'CloseCurlyDoubleQuote' => '”',
'CloseCurlyQuote' => '’',
'clubsuit' => '♣',
'coloneq' => '≔',
'complement' => '∁',
'complexes' => 'ℂ',
'Congruent' => '≡',
'ContourIntegral' => '∮',
'Coproduct' => '∐',
'CounterClockwiseContourIntegral' => '∳',
'CupCap' => '≍',
'curlyeqprec' => '⋞',
'curlyeqsucc' => '⋟',
'curlyvee' => '⋎',
'curlywedge' => '⋏',
'curvearrowleft' => '↶',
'curvearrowright' => '↷',
'dbkarow' => '⤏',
'ddagger' => '‡',
'ddotseq' => '⩷',
'Del' => '∇',
'DiacriticalAcute' => '´',
'DiacriticalDot' => '˙',
'DiacriticalDoubleAcute' => '˝',
'DiacriticalGrave' => '`',
'DiacriticalTilde' => '˜',
'Diamond' => '⋄',
'diamond' => '⋄',
'diamondsuit' => '♦',
'DifferentialD' => 'ⅆ',
'digamma' => 'ϝ',
'div' => '÷',
'divideontimes' => '⋇',
'doteq' => '≐',
'doteqdot' => '≑',
'DotEqual' => '≐',
'dotminus' => '∸',
'dotplus' => '∔',
'dotsquare' => '⊡',
'doublebarwedge' => '⌆',
'DoubleContourIntegral' => '∯',
'DoubleDot' => '¨',
'DoubleDownArrow' => '⇓',
'DoubleLeftArrow' => '⇐',
'DoubleLeftRightArrow' => '⇔',
'DoubleLeftTee' => '⫤',
'DoubleLongLeftArrow' => '⟸',
'DoubleLongLeftRightArrow' => '⟺',
'DoubleLongRightArrow' => '⟹',
'DoubleRightArrow' => '⇒',
'DoubleRightTee' => '⊨',
'DoubleUpArrow' => '⇑',
'DoubleUpDownArrow' => '⇕',
'DoubleVerticalBar' => '∥',
'DownArrow' => '↓',
'Downarrow' => '⇓',
'downarrow' => '↓',
'DownArrowUpArrow' => '⇵',
'downdownarrows' => '⇊',
'downharpoonleft' => '⇃',
'downharpoonright' => '⇂',
'DownLeftVector' => '↽',
'DownRightVector' => '⇁',
'DownTee' => '⊤',
'DownTeeArrow' => '↧',
'drbkarow' => '⤐',
'Element' => '∈',
'emptyset' => '∅',
'eqcirc' => '≖',
'eqcolon' => '≕',
'eqsim' => '≂',
'eqslantgtr' => '⪖',
'eqslantless' => '⪕',
'EqualTilde' => '≂',
'Equilibrium' => '⇌',
'Exists' => '∃',
'expectation' => 'ℰ',
'ExponentialE' => 'ⅇ',
'exponentiale' => 'ⅇ',
'fallingdotseq' => '≒',
'ForAll' => '∀',
'Fouriertrf' => 'ℱ',
'geq' => '≥',
'geqq' => '≧',
'geqslant' => '⩾',
'gg' => '≫',
'ggg' => '⋙',
'gnapprox' => '⪊',
'gneq' => '⪈',
'gneqq' => '≩',
'GreaterEqual' => '≥',
'GreaterEqualLess' => '⋛',
'GreaterFullEqual' => '≧',
'GreaterLess' => '≷',
'GreaterSlantEqual' => '⩾',
'GreaterTilde' => '≳',
'gtrapprox' => '⪆',
'gtrdot' => '⋗',
'gtreqless' => '⋛',
'gtreqqless' => '⪌',
'gtrless' => '≷',
'gtrsim' => '≳',
'gvertneqq' => '≩︀',
'Hacek' => 'ˇ',
'hbar' => 'ℏ',
'heartsuit' => '♥',
'HilbertSpace' => 'ℋ',
'hksearow' => '⤥',
'hkswarow' => '⤦',
'hookleftarrow' => '↩',
'hookrightarrow' => '↪',
'hslash' => 'ℏ',
'HumpDownHump' => '≎',
'HumpEqual' => '≏',
'iiiint' => '⨌',
'iiint' => '∭',
'Im' => 'ℑ',
'ImaginaryI' => 'ⅈ',
'imagline' => 'ℐ',
'imagpart' => 'ℑ',
'Implies' => '⇒',
'in' => '∈',
'integers' => 'ℤ',
'Integral' => '∫',
'intercal' => '⊺',
'Intersection' => '⋂',
'intprod' => '⨼',
'InvisibleComma' => '⁣',
'InvisibleTimes' => '⁢',
'langle' => '〈',
'Laplacetrf' => 'ℒ',
'lbrace' => '{',
'lbrack' => '[',
'LeftAngleBracket' => '〈',
'LeftArrow' => '←',
'Leftarrow' => '⇐',
'leftarrow' => '←',
'LeftArrowBar' => '⇤',
'LeftArrowRightArrow' => '⇆',
'leftarrowtail' => '↢',
'LeftCeiling' => '⌈',
'LeftDoubleBracket' => '〚',
'LeftDownVector' => '⇃',
'LeftFloor' => '⌊',
'leftharpoondown' => '↽',
'leftharpoonup' => '↼',
'leftleftarrows' => '⇇',
'LeftRightArrow' => '↔',
'Leftrightarrow' => '⇔',
'leftrightarrow' => '↔',
'leftrightarrows' => '⇆',
'leftrightharpoons' => '⇋',
'leftrightsquigarrow' => '↭',
'LeftTee' => '⊣',
'LeftTeeArrow' => '↤',
'leftthreetimes' => '⋋',
'LeftTriangle' => '⊲',
'LeftTriangleEqual' => '⊴',
'LeftUpVector' => '↿',
'LeftVector' => '↼',
'leq' => '≤',
'leqq' => '≦',
'leqslant' => '⩽',
'lessapprox' => '⪅',
'lessdot' => '⋖',
'lesseqgtr' => '⋚',
'lesseqqgtr' => '⪋',
'LessEqualGreater' => '⋚',
'LessFullEqual' => '≦',
'LessGreater' => '≶',
'lessgtr' => '≶',
'lesssim' => '≲',
'LessSlantEqual' => '⩽',
'LessTilde' => '≲',
'll' => '≪',
'llcorner' => '⌞',
'Lleftarrow' => '⇚',
'lmoustache' => '⎰',
'lnapprox' => '⪉',
'lneq' => '⪇',
'lneqq' => '≨',
'LongLeftArrow' => '⟵',
'Longleftarrow' => '⟸',
'longleftarrow' => '⟵',
'LongLeftRightArrow' => '⟷',
'Longleftrightarrow' => '⟺',
'longleftrightarrow' => '⟷',
'longmapsto' => '⟼',
'LongRightArrow' => '⟶',
'Longrightarrow' => '⟹',
'longrightarrow' => '⟶',
'looparrowleft' => '↫',
'looparrowright' => '↬',
'LowerLeftArrow' => '↙',
'LowerRightArrow' => '↘',
'lozenge' => '◊',
'lrcorner' => '⌟',
'Lsh' => '↰',
'lvertneqq' => '≨︀',
'maltese' => '✠',
'mapsto' => '↦',
'measuredangle' => '∡',
'Mellintrf' => 'ℳ',
'MinusPlus' => '∓',
'mp' => '∓',
'multimap' => '⊸',
'napprox' => '≉',
'natural' => '♮',
'naturals' => 'ℕ',
'nearrow' => '↗',
'NegativeMediumSpace' => '​',
'NegativeThickSpace' => '​',
'NegativeThinSpace' => '​',
'NegativeVeryThinSpace' => '​',
'NestedGreaterGreater' => '≫',
'NestedLessLess' => '≪',
'nexists' => '∄',
'ngeq' => '≱',
'ngeqq' => '≧̸',
'ngeqslant' => '⩾̸',
'ngtr' => '≯',
'nLeftarrow' => '⇍',
'nleftarrow' => '↚',
'nLeftrightarrow' => '⇎',
'nleftrightarrow' => '↮',
'nleq' => '≰',
'nleqq' => '≦̸',
'nleqslant' => '⩽̸',
'nless' => '≮',
'NonBreakingSpace' => ' ',
'NotCongruent' => '≢',
'NotDoubleVerticalBar' => '∦',
'NotElement' => '∉',
'NotEqual' => '≠',
'NotEqualTilde' => '≂̸',
'NotExists' => '∄',
'NotGreater' => '≯',
'NotGreaterEqual' => '≱',
'NotGreaterFullEqual' => '≦̸',
'NotGreaterGreater' => '≫̸',
'NotGreaterLess' => '≹',
'NotGreaterSlantEqual' => '⩾̸',
'NotGreaterTilde' => '≵',
'NotHumpDownHump' => '≎̸',
'NotLeftTriangle' => '⋪',
'NotLeftTriangleEqual' => '⋬',
'NotLess' => '≮',
'NotLessEqual' => '≰',
'NotLessGreater' => '≸',
'NotLessLess' => '≪̸',
'NotLessSlantEqual' => '⩽̸',
'NotLessTilde' => '≴',
'NotPrecedes' => '⊀',
'NotPrecedesEqual' => '⪯̸',
'NotPrecedesSlantEqual' => '⋠',
'NotReverseElement' => '∌',
'NotRightTriangle' => '⋫',
'NotRightTriangleEqual' => '⋭',
'NotSquareSubsetEqual' => '⋢',
'NotSquareSupersetEqual' => '⋣',
'NotSubset' => '⊂⃒',
'NotSubsetEqual' => '⊈',
'NotSucceeds' => '⊁',
'NotSucceedsEqual' => '⪰̸',
'NotSucceedsSlantEqual' => '⋡',
'NotSuperset' => '⊃⃒',
'NotSupersetEqual' => '⊉',
'NotTilde' => '≁',
'NotTildeEqual' => '≄',
'NotTildeFullEqual' => '≇',
'NotTildeTilde' => '≉',
'NotVerticalBar' => '∤',
'nparallel' => '∦',
'nprec' => '⊀',
'npreceq' => '⪯̸',
'nRightarrow' => '⇏',
'nrightarrow' => '↛',
'nshortmid' => '∤',
'nshortparallel' => '∦',
'nsimeq' => '≄',
'nsubset' => '⊂⃒',
'nsubseteq' => '⊈',
'nsubseteqq' => '⫅̸',
'nsucc' => '⊁',
'nsucceq' => '⪰̸',
'nsupset' => '⊃⃒',
'nsupseteq' => '⊉',
'nsupseteqq' => '⫆̸',
'ntriangleleft' => '⋪',
'ntrianglelefteq' => '⋬',
'ntriangleright' => '⋫',
'ntrianglerighteq' => '⋭',
'nwarrow' => '↖',
'oint' => '∮',
'OpenCurlyDoubleQuote' => '“',
'OpenCurlyQuote' => '‘',
'orderof' => 'ℴ',
'parallel' => '∥',
'PartialD' => '∂',
'pitchfork' => '⋔',
'PlusMinus' => '±',
'pm' => '±',
'Poincareplane' => 'ℌ',
'prec' => '≺',
'precapprox' => '⪷',
'preccurlyeq' => '≼',
'Precedes' => '≺',
'PrecedesEqual' => '⪯',
'PrecedesSlantEqual' => '≼',
'PrecedesTilde' => '≾',
'preceq' => '⪯',
'precnapprox' => '⪹',
'precneqq' => '⪵',
'precnsim' => '⋨',
'precsim' => '≾',
'primes' => 'ℙ',
'Proportion' => '∷',
'Proportional' => '∝',
'propto' => '∝',
'quaternions' => 'ℍ',
'questeq' => '≟',
'rangle' => '〉',
'rationals' => 'ℚ',
'rbrace' => '}',
'rbrack' => ']',
'Re' => 'ℜ',
'realine' => 'ℛ',
'realpart' => 'ℜ',
'reals' => 'ℝ',
'ReverseElement' => '∋',
'ReverseEquilibrium' => '⇋',
'ReverseUpEquilibrium' => '⥯',
'RightAngleBracket' => '〉',
'RightArrow' => '→',
'Rightarrow' => '⇒',
'rightarrow' => '→',
'RightArrowBar' => '⇥',
'RightArrowLeftArrow' => '⇄',
'rightarrowtail' => '↣',
'RightCeiling' => '⌉',
'RightDoubleBracket' => '〛',
'RightDownVector' => '⇂',
'RightFloor' => '⌋',
'rightharpoondown' => '⇁',
'rightharpoonup' => '⇀',
'rightleftarrows' => '⇄',
'rightleftharpoons' => '⇌',
'rightrightarrows' => '⇉',
'rightsquigarrow' => '↝',
'RightTee' => '⊢',
'RightTeeArrow' => '↦',
'rightthreetimes' => '⋌',
'RightTriangle' => '⊳',
'RightTriangleEqual' => '⊵',
'RightUpVector' => '↾',
'RightVector' => '⇀',
'risingdotseq' => '≓',
'rmoustache' => '⎱',
'Rrightarrow' => '⇛',
'Rsh' => '↱',
'searrow' => '↘',
'setminus' => '∖',
'ShortDownArrow' => '↓',
'ShortLeftArrow' => '←',
'shortmid' => '∣',
'shortparallel' => '∥',
'ShortRightArrow' => '→',
'ShortUpArrow' => '↑',
'simeq' => '≃',
'SmallCircle' => '∘',
'smallsetminus' => '∖',
'spadesuit' => '♠',
'Sqrt' => '√',
'sqsubset' => '⊏',
'sqsubseteq' => '⊑',
'sqsupset' => '⊐',
'sqsupseteq' => '⊒',
'Square' => '□',
'SquareIntersection' => '⊓',
'SquareSubset' => '⊏',
'SquareSubsetEqual' => '⊑',
'SquareSuperset' => '⊐',
'SquareSupersetEqual' => '⊒',
'SquareUnion' => '⊔',
'Star' => '⋆',
'straightepsilon' => 'ϵ',
'straightphi' => 'ϕ',
'Subset' => '⋐',
'subset' => '⊂',
'subseteq' => '⊆',
'subseteqq' => '⫅',
'SubsetEqual' => '⊆',
'subsetneq' => '⊊',
'subsetneqq' => '⫋',
'succ' => '≻',
'succapprox' => '⪸',
'succcurlyeq' => '≽',
'Succeeds' => '≻',
'SucceedsEqual' => '⪰',
'SucceedsSlantEqual' => '≽',
'SucceedsTilde' => '≿',
'succeq' => '⪰',
'succnapprox' => '⪺',
'succneqq' => '⪶',
'succnsim' => '⋩',
'succsim' => '≿',
'SuchThat' => '∋',
'Sum' => '∑',
'Superset' => '⊃',
'SupersetEqual' => '⊇',
'Supset' => '⋑',
'supset' => '⊃',
'supseteq' => '⊇',
'supseteqq' => '⫆',
'supsetneq' => '⊋',
'supsetneqq' => '⫌',
'swarrow' => '↙',
'Therefore' => '∴',
'therefore' => '∴',
'thickapprox' => '≈',
'thicksim' => '∼',
'ThinSpace' => ' ',
'Tilde' => '∼',
'TildeEqual' => '≃',
'TildeFullEqual' => '≅',
'TildeTilde' => '≈',
'toea' => '⤨',
'tosa' => '⤩',
'triangle' => '▵',
'triangledown' => '▿',
'triangleleft' => '◃',
'trianglelefteq' => '⊴',
'triangleq' => '≜',
'triangleright' => '▹',
'trianglerighteq' => '⊵',
'TripleDot' => '⃛',
'twoheadleftarrow' => '↞',
'twoheadrightarrow' => '↠',
'ulcorner' => '⌜',
'Union' => '⋃',
'UnionPlus' => '⊎',
'UpArrow' => '↑',
'Uparrow' => '⇑',
'uparrow' => '↑',
'UpArrowDownArrow' => '⇅',
'UpDownArrow' => '↕',
'Updownarrow' => '⇕',
'updownarrow' => '↕',
'UpEquilibrium' => '⥮',
'upharpoonleft' => '↿',
'upharpoonright' => '↾',
'UpperLeftArrow' => '↖',
'UpperRightArrow' => '↗',
'upsilon' => 'υ',
'UpTee' => '⊥',
'UpTeeArrow' => '↥',
'upuparrows' => '⇈',
'urcorner' => '⌝',
'varepsilon' => 'ε',
'varkappa' => 'ϰ',
'varnothing' => '∅',
'varphi' => 'φ',
'varpi' => 'ϖ',
'varpropto' => '∝',
'varrho' => 'ϱ',
'varsigma' => 'ς',
'varsubsetneq' => '⊊︀',
'varsubsetneqq' => '⫋︀',
'varsupsetneq' => '⊋︀',
'varsupsetneqq' => '⫌︀',
'vartheta' => 'ϑ',
'vartriangleleft' => '⊲',
'vartriangleright' => '⊳',
'Vee' => '⋁',
'vee' => '∨',
'Vert' => '‖',
'vert' => '|',
'VerticalBar' => '∣',
'VerticalTilde' => '≀',
'VeryThinSpace' => ' ',
'Wedge' => '⋀',
'wedge' => '∧',
'wp' => '℘',
'wr' => '≀',
'zeetrf' => 'ℨ'
}
#:startdoc:
# Converts XHTML+MathML named entities in string to Numeric Character References
#
# :call-seq:
# string.to_ncr -> string
#
def to_ncr
  # Splitting on the entity pattern with a capture group yields alternating
  # pieces: [text, name, text, name, ..., text].
  segments = split(/&([a-zA-Z0-9]+);/)
  # Rewrite every captured entity name in place; the text around it is untouched.
  segments.each_slice(2) do |_text, entity_name|
    entity_name.convert_to_ncr if entity_name
  end
  segments.join
end
# Converts XHTML+MathML named entities in string to Numeric Character References
#
# :call-seq:
# string.to_ncr! -> str
#
# Substitution is done in-place.
#
def to_ncr!
  # Pieces alternate between literal text and captured entity names.
  segments = split(/&([a-zA-Z0-9]+);/)
  segments.each_slice(2) do |_text, entity_name|
    entity_name.convert_to_ncr if entity_name
  end
  # Overwrite the receiver with the converted text; replace returns the receiver.
  replace(segments.join)
end
# Converts XHTML+MathML named entities in string to UTF-8
#
# :call-seq:
# string.to_utf8 -> string
#
def to_utf8
  # Splitting on the entity pattern with a capture group yields alternating
  # pieces: [text, name, text, name, ..., text].
  segments = split(/&([a-zA-Z0-9]+);/)
  # Rewrite every captured entity name in place; the text around it is untouched.
  segments.each_slice(2) do |_text, entity_name|
    entity_name.convert_to_utf8 if entity_name
  end
  segments.join
end
# Converts XHTML+MathML named entities in string to UTF-8
#
# :call-seq:
# string.to_utf8! -> str
#
# Substitution is done in-place.
#
def to_utf8!
  # Pieces alternate between literal text and captured entity names.
  segments = split(/&([a-zA-Z0-9]+);/)
  segments.each_slice(2) do |_text, entity_name|
    entity_name.convert_to_utf8 if entity_name
  end
  # Overwrite the receiver with the converted text; replace returns the receiver.
  replace(segments.join)
end
protected
def convert_to_ncr #:nodoc:
  # The receiver is a bare entity name (no "&" or ";"). It is replaced in place by
  # its MATHML_ENTITIES value, except that the five predefined XML entities are
  # never looked up: they, and names the table does not contain, are rewritten
  # as "&name;".
  # NOTE(review): predefined and unknown names produce the same output here --
  # confirm that unknown names were not meant to be escaped differently.
  converted =
    if self !~ /^(lt|gt|amp|quot|apos)$/ && MATHML_ENTITIES.has_key?(self)
      MATHML_ENTITIES[self]
    else
      "&" + self + ";"
    end
  replace(converted)
end
def convert_to_utf8 #:nodoc:
  # The receiver is a bare entity name (no "&" or ";") and is replaced in place.
  # Returns the receiver.
  #
  # The five predefined XML entities are never expanded, and names missing from
  # MATHML_ENTITIES cannot be expanded; both are rewritten as "&name;".
  if self =~ /\A(lt|gt|amp|quot|apos)\z/ || !MATHML_ENTITIES.has_key?(self)
    replace("&" + self + ";")
  else
    # Decode each hexadecimal character reference (&#xHHHH;) in the table value
    # to its UTF-8 character. Any other text in the value is kept as-is, so an
    # entry that already holds literal characters (or uses lower-case hex digits)
    # passes through correctly instead of collapsing to U+0000.
    replace(MATHML_ENTITIES[self].gsub(/&#x([0-9A-Fa-f]+);/) { [$1.hex].pack('U') })
  end
end
end
require 'rexml/element'
module REXML #:nodoc:
  class Element
    # Replace XHTML+MathML named entities with Numeric Character References in
    # the elements below this one.
    #
    # :call-seq:
    #    tree.to_ncr -> REXML::Element
    #
    # Every element yielded by each_element has its text nodes and attribute
    # values rewritten with String#to_ncr, recursing into elements that have
    # children of their own. The receiver's own text nodes and attributes are
    # not visited. The receiver is returned.
    #
    # REXML, typically, converts NCRs to utf-8 characters, which is what you'll
    # see when you access the resulting REXML document.
    #
    # Walking the whole tree like this can be SLOW. It will often be faster to
    # serialize to a string and then use String.to_ncr instead.
    #
    def to_ncr
      each_element do |child|
        child.texts.each do |text_node|
          text_node.value = text_node.to_s.to_ncr
        end
        attrs = child.attributes
        attrs.each do |attr_name, attr_value|
          attrs[attr_name] = attr_value.to_ncr
        end
        child.to_ncr if child.has_elements?
      end
      self
    end

    # Replace XHTML+MathML named entities with UTF-8 characters in the elements
    # below this one.
    #
    # :call-seq:
    #    tree.to_utf8 -> REXML::Element
    #
    # Every element yielded by each_element has its text nodes and attribute
    # values rewritten with String#to_utf8, recursing into elements that have
    # children of their own. The receiver's own text nodes and attributes are
    # not visited. The receiver is returned.
    #
    # Walking the whole tree like this can be SLOW. It will often be faster to
    # serialize to a string and then use String.to_utf8 instead.
    #
    def to_utf8
      each_element do |child|
        child.texts.each do |text_node|
          text_node.value = text_node.to_s.to_utf8
        end
        attrs = child.attributes
        attrs.each do |attr_name, attr_value|
          attrs[attr_name] = attr_value.to_utf8
        end
        child.to_utf8 if child.has_elements?
      end
      self
    end
  end
end
module HTML5 #:nodoc: all
  module TreeWalkers
    private

    class << self
      # Look up a tree walker class by name. The name is compared after
      # to_s.downcase, so symbols and mixed case are accepted.
      #
      # 'rexml'  requires html5/treewalkers/rexml and returns REXML::TreeWalker
      # 'rexml2' returns the REXML2::TreeWalker defined below
      #
      # Any other name raises a RuntimeError.
      def [](name)
        key = name.to_s.downcase
        return REXML2::TreeWalker if key == 'rexml2'
        raise "Unknown TreeWalker #{name}" unless key == 'rexml'

        require 'html5/treewalkers/rexml'
        REXML::TreeWalker
      end
      alias_method :get_tree_walker, :[]
    end

    module REXML2
      # Tree walker over REXML nodes that passes text and attribute values
      # through String#to_utf8 when describing a node.
      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
        private

        # Describe +node+ as an array whose first entry is a type tag, followed
        # by type-specific details.
        # (Presumably consumed by NonRecursiveTreeWalker, which is defined
        # outside this file -- confirm there.)
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            # A nameless element is reported as a document fragment.
            return [:DOCUMENT_FRAGMENT] unless node.name

            attrs = node.attributes.map do |attr_name, attr_value|
              [attr_name, attr_value.to_utf8]
            end
            [:ELEMENT, node.name, attrs, node.has_elements? || node.has_text?]
          when ::REXML::Text
            [:TEXT, node.value.to_utf8]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name, node.public, node.system]
          when ::REXML::XMLDecl
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First entry of +node+'s children (nil when there are none).
        def first_child(node)
          node.children.first
        end

        # Delegates to +node+'s own next_sibling.
        def next_sibling(node)
          node.next_sibling
        end

        # Delegates to +node+'s own parent.
        def parent(node)
          node.parent
        end
      end
    end
  end
end