diff --git a/app/controllers/file_controller.rb b/app/controllers/file_controller.rb index 21cde860..5c04b9e7 100644 --- a/app/controllers/file_controller.rb +++ b/app/controllers/file_controller.rb @@ -1,7 +1,7 @@ # Controller responsible for serving files and pictures. require 'zip/zip' -require 'sanitize' +require 'stringsupport' class FileController < ApplicationController diff --git a/app/controllers/wiki_controller.rb b/app/controllers/wiki_controller.rb index 80bd4a8e..e42c3f80 100644 --- a/app/controllers/wiki_controller.rb +++ b/app/controllers/wiki_controller.rb @@ -2,7 +2,7 @@ require 'fileutils' require 'maruku' require 'parsedate' require 'zip/zip' -require 'sanitize' +require 'stringsupport' require 'resolv' class WikiController < ApplicationController diff --git a/lib/chunks/category.rb b/lib/chunks/category.rb index f008c85a..67987a4d 100644 --- a/lib/chunks/category.rb +++ b/lib/chunks/category.rb @@ -1,5 +1,5 @@ require 'chunks/chunk' -require 'sanitize' +require 'stringsupport' # The category chunk looks for "category: news" on a line by # itself and parses the terms after the ':' as categories. 
require 'strscan'

# A small, case-preserving node model for XHTML token streams. Every token
# becomes a Text, CDATA or Tag node; nodes can be serialised back with #to_s
# and queried with #match / #find / #find_all.
module XHTML #:nodoc:

  # A normalised set of matching criteria. Top-level keys are symbols,
  # attribute names are strings, and nested criteria are Conditions too.
  class Conditions < Hash #:nodoc:
    # +hash+ is either a Hash of criteria or a bare value (String, Regexp...),
    # which is shorthand for <tt>{ :content => value }</tt>.
    # Raises RuntimeError for an unknown key.
    def initialize(hash)
      super()
      hash = { :content => hash } unless Hash === hash
      normalized = {}
      keys_to_symbols(hash).each do |key, value|
        normalized[key] =
          case key
          when :tag, :content
            # valid as given, no further processing
            value
          when :attributes
            keys_to_strings(value)
          when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
            Conditions.new(value)
          when :children
            normalize_children(value)
          else
            raise "illegal key #{key.inspect} => #{value.inspect}"
          end
      end
      update normalized
    end

    private

      # Validates the :children sub-hash (:count, :greater_than, :less_than,
      # :only) and wraps :only in Conditions.
      def normalize_children(spec)
        keys_to_symbols(spec).inject({}) do |result, (key, value)|
          case key
          when :count, :greater_than, :less_than
            result[key] = value
          when :only
            result[key] = Conditions.new(value)
          else
            raise "illegal key #{key.inspect} => #{value.inspect}"
          end
          result
        end
      end

      # Copy of +hash+ with every key converted by #to_s.
      def keys_to_strings(hash)
        hash.keys.inject({}) do |h, k|
          h[k.to_s] = hash[k]
          h
        end
      end

      # Copy of +hash+ with every key converted by #to_sym; raises
      # RuntimeError for a key that cannot be symbolised.
      def keys_to_symbols(hash)
        hash.keys.inject({}) do |h, k|
          raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym)
          h[k.to_sym] = hash[k]
          h
        end
      end
  end

  # The base class of all nodes, textual and otherwise, in a document.
  class Node #:nodoc:
    # The array of children of this node. Not all nodes have children.
    attr_reader :children

    # The parent node of this node. All nodes have a parent, except for the
    # root node.
    attr_reader :parent

    # The line number of the input where this node was begun
    attr_reader :line

    # The byte position in the input where this node was begun
    attr_reader :position

    # Create a new node as a child of the given parent. The node is NOT
    # appended to <tt>parent.children</tt>; that is the caller's job.
    def initialize(parent, line=0, pos=0)
      @parent = parent
      @children = []
      @line, @position = line, pos
    end

    # Return a textual representation of the node: the concatenation of the
    # children's representations.
    def to_s
      @children.map { |child| child.to_s }.join
    end

    # Return false (subclasses must override this to provide specific matching
    # behavior.) +conditions+ may be of any type.
    def match(conditions)
      false
    end

    # Search the children of this node for the first node for which #find
    # returns non +nil+. Returns the result of the #find call that succeeded.
    def find(conditions)
      conditions = validate_conditions(conditions)
      @children.each do |child|
        node = child.find(conditions)
        return node if node
      end
      nil
    end

    # Search for all nodes (this one included) that match the given
    # conditions, and return them as an array in document order.
    def find_all(conditions)
      conditions = validate_conditions(conditions)

      matches = []
      matches << self if match(conditions)
      @children.each do |child|
        matches.concat child.find_all(conditions)
      end
      matches
    end

    # Returns +false+. Subclasses may override this if they define a kind of
    # tag.
    def tag?
      false
    end

    # Wraps +conditions+ in a Conditions object unless it already is one.
    def validate_conditions(conditions)
      Conditions === conditions ? conditions : Conditions.new(conditions)
    end

    # Structural equality: same class, same number of children, and all
    # children pairwise equal.
    def ==(node)
      return false unless self.class == node.class && children.size == node.children.size
      children.zip(node.children).all? { |mine, theirs| mine == theirs }
    end

    class << self
      # Builds the node for one token: Text for character data, CDATA for a
      # CDATA section, Tag otherwise. Element and attribute names keep their
      # case. With +strict+ a malformed tag raises RuntimeError; otherwise
      # the parser recovers as best it can.
      #
      # NOTE(review): the middle of this method was destroyed by the
      # extraction that produced this diff. It is reconstructed from the
      # surviving head/tail and from the html-scanner parser this file is
      # derived from -- confirm against the upstream file before merging.
      def parse(parent, line, pos, content, strict=true)
        if content !~ /^<\S/
          Text.new(parent, line, pos, content)
        else
          scanner = StringScanner.new(content)

          unless scanner.skip(/</)
            if strict
              raise "expected <"
            else
              return Text.new(parent, line, pos, content)
            end
          end

          if scanner.skip(/!\[CDATA\[/)
            # an unterminated section keeps whatever text is left
            body = scanner.scan_until(/\]\]>/) ? scanner.pre_match : scanner.rest
            return CDATA.new(parent, line, pos, body.gsub(/<!\[CDATA\[/, ''))
          end

          closing = ( scanner.scan(/\//) ? :close : nil )
          return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:-]+/)

          attributes = nil
          unless closing
            scanner.skip(/\s*/)
            attributes = {}
            while attr = scanner.scan(/[-\w:]+/)
              value = true  # attribute present without a value
              if scanner.scan(/\s*=\s*/)
                if delim = scanner.scan(/['"]/)
                  value = String.new
                  while text = scanner.scan(/[^#{delim}\\]+|./)
                    case text
                    when "\\" then
                      # keep the backslash and whatever it escapes
                      value << text
                      value << scanner.getch.to_s
                    when delim
                      break
                    else
                      value << text
                    end
                  end
                else
                  value = scanner.scan(/[^\s>\/]+/)
                end
              end
              attributes[attr] = value
              scanner.skip(/\s*/)
            end

            closing = ( scanner.scan(/\//) ? :self : nil )
          end

          unless scanner.scan(/\s*>/)
            if strict
              raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
            else
              # throw away all text until we find what we're looking for
              scanner.skip_until(/>/) or scanner.terminate
            end
          end

          Tag.new(parent, line, pos, name, attributes, closing)
        end
      end
    end
  end

  # A node that represents text, rather than markup.
  class Text < Node #:nodoc:

    attr_reader :content

    # Creates a new text node as a child of the given parent, with the given
    # content.
    def initialize(parent, line, pos, content)
      super(parent, line, pos)
      @content = content
    end

    # Returns the content of this node.
    def to_s
      @content
    end

    # Returns +self+ if this node meets the given conditions. Text nodes support
    # conditions of the following kinds:
    #
    # * if +conditions+ is a string, it must equal the node's content
    # * if +conditions+ is a regular expression, it must match the node's
    #   content
    # * if +conditions+ is a hash, it must contain a :content key that
    #   is either a string or a regexp, and which is interpreted as described
    #   above.
    def find(conditions)
      match(conditions) && self
    end

    # Returns a true value if this node meets the given conditions, or a
    # false one otherwise. See the discussion of #find for the valid
    # conditions.
    def match(conditions)
      case conditions
      when String
        @content == conditions
      when Regexp
        @content =~ conditions
      when Hash
        conditions = validate_conditions(conditions)

        # Text nodes only have :content, :parent, :ancestor
        unless (conditions.keys - [:content, :parent, :ancestor]).empty?
          return false
        end

        match(conditions[:content])
      else
        nil
      end
    end

    def ==(node)
      return false unless super
      content == node.content
    end
  end

  # A CDATA node is simply a text node with a specialized way of displaying
  # itself.
  class CDATA < Text #:nodoc:
    # NOTE(review): this literal was blanked by the extraction; restored as
    # the CDATA wrapper that Node.parse strips off -- confirm upstream.
    def to_s
      "<![CDATA[#{@content}]]>"
    end
  end

  # A Tag is any node that represents markup. It may be an opening tag, a
  # closing tag, or a self-closing tag. It has a name, and may have a hash of
  # attributes.
  class Tag < Node #:nodoc:

    # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
    attr_reader :closing

    # Either +nil+, or a hash of attributes for this node.
    attr_reader :attributes

    # The name of this tag.
    attr_reader :name

    # Create a new node as a child of the given parent, from an already
    # parsed name, attribute hash (or +nil+) and closing status.
    def initialize(parent, line, pos, name, attributes, closing)
      super(parent, line, pos)
      @name = name
      @attributes = attributes
      @closing = closing
    end

    # A convenience for obtaining an attribute of the node. Returns +nil+ if
    # the node has no attributes.
    def [](attr)
      @attributes ? @attributes[attr] : nil
    end

    # Returns non-+nil+ if this tag can NOT contain child nodes, i.e. its name
    # is one of the void elements. With +xml+ true, a tag that is not
    # explicitly closed is never childless.
    def childless?(xml = false)
      return false if xml && @closing.nil?
#      !@closing.nil? ||
        @name =~ /^(img|br|hr|link|meta|area|base|basefont|
                    col|frame|input|isindex|param)$/ox
    end

    # Returns a textual representation of the node. Attributes are emitted in
    # name order, single-quoted; attributes without a string value are
    # emitted as a bare name.
    #
    # NOTE(review): the two closing-tag literals were blanked by the
    # extraction and are restored here -- confirm upstream.
    def to_s
      if @closing == :close
        # void elements are written self-closed, so their end tag is dropped
        return self.childless? ? '' : "</#{@name}>"
      end

      s = "<#{@name}"
      # closing tags (and hand-built nodes) may carry no attribute hash
      (@attributes || {}).sort.each do |att|
        s << " #{att[0]}"
        s << "='#{att[1]}'" if String === att[1]
      end
      s << "/" if (@children.empty? && @closing == :self) or self.childless?
      s << ">"
      @children.each { |child| s << child.to_s }
      s << "</#{@name}>" if @closing != :self && !@closing.nil? && !@children.empty?
      s
    end

    # If either the node or any of its children meet the given conditions, the
    # matching node is returned. Otherwise, +nil+ is returned. (See the
    # description of the valid conditions in the +match+ method.)
    def find(conditions)
      match(conditions) && self || super
    end

    # Returns +true+, indicating that this node represents a tag.
    def tag?
      true
    end

    # Returns +true+ if the node meets ALL of the given conditions. The
    # +conditions+ parameter must be a hash of any of the following keys
    # (all are optional):
    #
    # * +:tag+: the node name must match the corresponding value
    # * +:attributes+: a hash. The node's values must match the
    #   corresponding values in the hash.
    # * +:parent+: a hash. The node's parent must match the
    #   corresponding hash.
    # * +:child+: a hash. At least one of the node's immediate children
    #   must meet the criteria described by the hash.
    # * +:ancestor+: a hash. At least one of the node's ancestors must
    #   meet the criteria described by the hash.
    # * +:descendant+: a hash. At least one of the node's descendants
    #   must meet the criteria described by the hash.
    # * +:sibling+: a hash. At least one of the node's siblings must
    #   meet the criteria described by the hash.
    # * +:after+: a hash. At least one sibling that precedes the node must
    #   meet the criteria described by the hash.
    # * +:before+: a hash. At least one sibling that follows the node must
    #   meet the criteria described by the hash.
    # * +:children+: a hash, for counting children of a node. Accepts the
    #   keys:
    # ** +:count+: either a number or a range which must equal (or
    #    include) the number of children that match.
    # ** +:less_than+: the number of matching children must be less than
    #    this number.
    # ** +:greater_than+: the number of matching children must be
    #    greater than this number.
    # ** +:only+: another hash consisting of the keys to use
    #    to match on the children, and only matching children will be
    #    counted.
    #
    # A node without a parent never satisfies +:parent+, +:before+ or
    # +:after+.
    #
    # Values are compared using the following algorithm:
    #
    # * if the condition is a string, it must equal the value.
    # * if the condition is a regexp, it must match the value.
    # * if the condition is a number, the value must match number.to_s.
    # * if the condition is +true+, the value must not be +nil+.
    # * if the condition is +false+ or +nil+, the value must be +nil+.
    #
    # Usage:
    #
    #   # test if the node is a "span" tag
    #   node.match :tag => "span"
    #
    #   # test if the node's parent is a "div"
    #   node.match :parent => { :tag => "div" }
    #
    #   # test if any of the node's ancestors are "table" tags
    #   node.match :ancestor => { :tag => "table" }
    #
    #   # test if any of the node's immediate children are "em" tags
    #   node.match :child => { :tag => "em" }
    #
    #   # test if any of the node's descendants are "strong" tags
    #   node.match :descendant => { :tag => "strong" }
    #
    #   # test if the node has between 2 and 4 span tags as immediate children
    #   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
    #
    #   # get funky: test to see if the node is a "div", has a "ul" ancestor
    #   # and an "li" parent (with "class" = "enum"), and whether or not it has
    #   # a "span" descendant that contains text matching /hello world/:
    #   node.match :tag => "div",
    #              :ancestor => { :tag => "ul" },
    #              :parent => { :tag => "li",
    #                           :attributes => { :class => "enum" } },
    #              :descendant => { :tag => "span",
    #                               :child => /hello world/ }
    def match(conditions)
      conditions = validate_conditions(conditions)

      # check content of child nodes
      if conditions[:content]
        if children.empty?
          return false unless match_condition("", conditions[:content])
        else
          return false unless children.find { |child| child.match(conditions[:content]) }
        end
      end

      # test the name
      return false if conditions[:tag] && !match_condition(@name, conditions[:tag])

      # test attributes
      (conditions[:attributes] || {}).each do |key, value|
        return false unless match_condition(self[key], value)
      end

      # test parent (a root node has none, so it cannot match)
      if conditions[:parent]
        return false unless parent && parent.match(conditions[:parent])
      end

      # test children
      if conditions[:child]
        return false unless children.find { |child| child.match(conditions[:child]) }
      end

      # test ancestors: walk upwards until one matches or we run out
      if conditions[:ancestor]
        ancestor = parent
        ancestor = ancestor.parent until ancestor.nil? || ancestor.match(conditions[:ancestor])
        return false unless ancestor
      end

      # test descendants
      if conditions[:descendant]
        return false unless children.find do |child|
          # test the child
          child.match(conditions[:descendant]) ||
          # test the child's descendants
          child.match(:descendant => conditions[:descendant])
        end
      end

      # count children (tags of THIS module; closing tags of void elements
      # are not counted)
      if opts = conditions[:children]
        matches = children.select do |c|
          c.kind_of?(Tag) && (c.closing == :self || !c.childless?)
        end

        matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
        opts.each do |key, value|
          next if key == :only
          case key
          when :count
            if Integer === value
              return false if matches.length != value
            else
              return false unless value.include?(matches.length)
            end
          when :less_than
            return false unless matches.length < value
          when :greater_than
            return false unless matches.length > value
          else raise "unknown count condition #{key}"
          end
        end
      end

      # test siblings
      if conditions[:sibling] || conditions[:before] || conditions[:after]
        siblings = parent ? parent.children : []
        # locate this very object: == is structural, so Array#index(self)
        # would stop at the first look-alike sibling
        self_index = siblings.index { |s| s.equal?(self) }

        if conditions[:sibling]
          return false unless siblings.detect do |s|
            s != self && s.match(conditions[:sibling])
          end
        end

        if conditions[:before]
          return false unless self_index
          return false unless siblings[self_index+1..-1].detect do |s|
            s != self && s.match(conditions[:before])
          end
        end

        if conditions[:after]
          return false unless self_index
          return false unless siblings[0,self_index].detect do |s|
            s != self && s.match(conditions[:after])
          end
        end
      end

      true
    end

    def ==(node)
      return false unless super
      return false unless closing == node.closing && self.name == node.name
      attributes == node.attributes
    end

    private
      # Match the given value to the given condition.
      def match_condition(value, condition)
        case condition
        when String
          value && value == condition
        when Regexp
          # valueless attributes are stored as +true+ and cannot match a regexp
          String === value && value.match(condition)
        when Numeric
          value == condition.to_s
        when true
          !value.nil?
        when false, nil
          value.nil?
        else
          false
        end
      end
  end
end
+ + require 'html/tokenizer' + require 'node' + require 'stringsupport' + + acceptable_elements = %w[a abbr acronym address area b big blockquote br + button caption center cite code col colgroup dd del dfn dir div dl dt + em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label + legend li map menu ol optgroup option p pre q s samp select small span + strike strong sub sup table tbody td textarea tfoot th thead tr tt u + ul var] + + mathml_elements = %w[annotation annotation-xml maction math merror mfrac + mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot + mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder + munderover none semantics] + + svg_elements = %w[a animate animateColor animateMotion animateTransform + circle defs desc ellipse font-face font-face-name font-face-src + foreignObject g glyph hkern linearGradient line marker metadata + missing-glyph mpath path polygon polyline radialGradient rect set + stop svg switch text title tspan use] + + acceptable_attributes = %w[abbr accept accept-charset accesskey action + align alt axis border cellpadding cellspacing char charoff charset + checked cite class clear cols colspan color compact coords datetime + dir disabled enctype for frame headers height href hreflang hspace id + ismap label lang longdesc maxlength media method multiple name nohref + noshade nowrap prompt readonly rel rev rows rowspan rules scope + selected shape size span src start style summary tabindex target title + type usemap valign value vspace width xml:lang] + + mathml_attributes = %w[actiontype align close columnalign columnalign + columnalign columnlines columnspacing columnspan depth display + displaystyle encoding equalcolumns equalrows fence fontstyle fontweight + frame height linethickness lspace mathbackground mathcolor mathvariant + mathvariant maxsize minsize open other rowalign rowalign rowalign + rowlines rowspacing rowspan rspace scriptlevel selection separator + separators 
stretchy width width xlink:href xlink:show xlink:type xmlns + xmlns:xlink] + + svg_attributes = %w[accent-height accumulate additive alphabetic + arabic-form ascent attributeName attributeType baseProfile bbox begin + by calcMode cap-height class color color-rendering content cx cy d dx + dy descent display dur end fill fill-rule font-family font-size + font-stretch font-style font-variant font-weight from fx fy g1 g2 + glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id + ideographic k keyPoints keySplines keyTimes lang marker-end + marker-mid marker-start markerHeight markerUnits markerWidth + mathematical max min name offset opacity orient origin + overline-position overline-thickness panose-1 path pathLength points + preserveAspectRatio r refX refY repeatCount repeatDur + requiredExtensions requiredFeatures restart rotate rx ry slope stemh + stemv stop-color stop-opacity strikethrough-position + strikethrough-thickness stroke stroke-dasharray stroke-dashoffset + stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity + stroke-width systemLanguage target text-anchor to transform type u1 + u2 underline-position underline-thickness unicode unicode-range + units-per-em values version viewBox visibility width widths x + x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role + xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns + xmlns:xlink y y1 y2 zoomAndPan] + + attr_val_is_uri = %w[href src cite action longdesc xlink:href xml:base] + + SVG_ATTR_VAL_ALLOWS_REF = %w[clip-path color-profile cursor fill + filter marker marker-start marker-mid marker-end mask stroke] + + SVG_ALLOW_LOCAL_HREF = %w[altGlyph animate animateColor animateMotion + animateTransform cursor feImage filter linearGradient pattern + radialGradient textpath tref set use] + + acceptable_css_properties = %w[azimuth background-color + border-bottom-color border-collapse border-color border-left-color + border-right-color border-top-color clear color 
cursor direction + display elevation float font font-family font-size font-style + font-variant font-weight height letter-spacing line-height overflow + pause pause-after pause-before pitch pitch-range richness speak + speak-header speak-numeral speak-punctuation speech-rate stress + text-align text-decoration text-indent unicode-bidi vertical-align + voice-family volume white-space width] + + acceptable_css_keywords = %w[auto aqua black block blue bold both bottom + brown center collapse dashed dotted fuchsia gray green !important + italic left lime maroon medium none navy normal nowrap olive pointer + purple red right solid silver teal top transparent underline white + yellow] + + acceptable_svg_properties = %w[fill fill-opacity fill-rule stroke + stroke-width stroke-linecap stroke-linejoin stroke-opacity] + + acceptable_protocols = %w[ed2k ftp http https irc mailto news gopher nntp + telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs] + + VOID_ELEMENTS = %w[img br hr link meta area base basefont + col frame input isindex param] + + ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS) + ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES) + ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES) + ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS) + ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES) + ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS) + ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI) + + # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all + # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set, + # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. 
+ # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in + # ALLOWED_PROTOCOLS are allowed. + # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. + # + # sanitize_html('') + # => <script> do_nasty_stuff() </script> + # sanitize_html('Click here for $100') + # => Click here for $100 + def sanitize_xhtml(html) + if html.index("<") + tokenizer = HTML::Tokenizer.new(html.to_utf8) + new_text = "" + + while token = tokenizer.next + node = XHTML::Node.parse(nil, 0, 0, token, false) + new_text << case node.tag? + when true + if ALLOWED_ELEMENTS.include?(node.name) + if node.attributes + node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) } + ATTR_VAL_IS_URI.each do |attr| + val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/`|[\000-\040\177\s\200-\240]/,'').downcase + if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) + node.attributes.delete attr + end + end + SVG_ATTR_VAL_ALLOWS_REF.each do |attr| + node.attributes[attr] = node.attributes[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if node.attributes[attr] + end + if SVG_ALLOW_LOCAL_HREF.include?(node.name) && node.attributes['xlink:href'] && node.attributes['xlink:href'] =~ /^\s*[^#\s].*/m + node.attributes.delete 'xlink:href' + end + if node.attributes['style'] + node.attributes['style'] = sanitize_css(node.attributes['style']) + end + node.attributes.each do |attr,val| + if String === val + node.attributes[attr] = CGI.escapeHTML(val.unescapeHTML) + else + node.attributes.delete attr + end + end + end + node.to_s + else + node.to_s.gsub(//, ">") + end + else + CGI.escapeHTML(node.to_s.unescapeHTML) + end + end + + html = new_text + end + html + end + + def sanitize_css(style) + # disallow urls + style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ') + + # gauntlet + return '' unless style =~ 
/^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ + return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/ + + clean = [] + style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val| + next if val.empty? + prop.downcase! + if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop) + clean << "#{prop}: #{val};" + elsif %w[background border margin padding].include?(prop.split('-')[0]) + clean << "#{prop}: #{val};" unless val.split().any? do |keyword| + !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and + keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/ + end + elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop) + clean << "#{prop}: #{val};" + end + end + + style = clean.join(' ') + end +end diff --git a/lib/stringsupport.rb b/lib/stringsupport.rb new file mode 100644 index 00000000..aa076622 --- /dev/null +++ b/lib/stringsupport.rb @@ -0,0 +1,2271 @@ +# Some useful additions to the String class + +class String + +# Check whether a string is valid utf-8 +# +# :call-seq: +# string.is_utf8? -> boolean +# +# returns true if the sequence of bytes in string is valid utf-8 +#-- + def is_utf8? 
+ #expand NCRs to utf-8 + text = self.gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') } + text.gsub!(/&#(\d+);/) { |m| [$1.to_i].pack('U*') } + + # You might think this is faster, but it isn't + #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) + #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} + #pieces = pieces.join.split(/&#(\d+);/) + #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} + #text = pieces.join + + #ensure the resulting string of bytes is valid utf-8 + text =~ /\A( + [\x09\x0A\x0D\x20-\x7E] # ASCII + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte + | \xEF[\x80-\xBE]{2} # + | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )*\Z/x; + end +#++ + +#:stopdoc: + MATHML_ENTITIES = { + 'Alpha' => 'Α', + 'Beta' => 'Β', + 'Epsilon' => 'Ε', + 'Zeta' => 'Ζ', + 'Eta' => 'Η', + 'Iota' => 'Ι', + 'Kappa' => 'Κ', + 'Mu' => 'Μ', + 'Nu' => 'Ν', + 'Omicron' => 'Ο', + 'Rho' => 'Ρ', + 'Tau' => 'Τ', + 'Chi' => 'Χ', + 'epsilon' => 'ε', + 'zeta' => 'ζ', + 'omicron' => 'ο', + 'sigmaf' => 'ς', + 'thetasym' => 'ϑ', + 'upsih' => 'ϒ', + 'oline' => '‾', + 'frasl' => '⁄', + 'alefsym' => 'ℵ', + 'crarr' => '↵', + 'empty' => '∅', + 'amp' => '&', + 'lt' => '<', + 'zwnj' => '‌', + 'zwj' => '‍', + 'lrm' => '‎', + 'rlm' => '‏', + 'sbquo' => '‚', + 'bdquo' => '„', + 'lsaquo' => '‹', + 'rsaquo' => '›', + 'euro' => '€', + 'angzarr' => '⍼', + 'cirmid' => '⫯', + 'cudarrl' => '⤸', + 'cudarrr' => '⤵', + 'cularr' => '↶', + 'cularrp' => '⤽', + 'curarr' => '↷', + 'curarrm' => '⤼', + 'Darr' => '↡', + 'dArr' => '⇓', + 'ddarr' => '⇊', + 'DDotrahd' => '⤑', + 'dfisht' => '⥿', + 'dHar' => '⥥', + 'dharl' => '⇃', + 'dharr' => '⇂', + 'duarr' => '⇵', + 'duhar' => 
'⥯', + 'dzigrarr' => '⟿', + 'erarr' => '⥱', + 'hArr' => '⇔', + 'harr' => '↔', + 'harrcir' => '⥈', + 'harrw' => '↭', + 'hoarr' => '⇿', + 'imof' => '⊷', + 'lAarr' => '⇚', + 'Larr' => '↞', + 'larrbfs' => '⤟', + 'larrfs' => '⤝', + 'larrhk' => '↩', + 'larrlp' => '↫', + 'larrpl' => '⤹', + 'larrsim' => '⥳', + 'larrtl' => '↢', + 'lAtail' => '⤛', + 'latail' => '⤙', + 'lBarr' => '⤎', + 'lbarr' => '⤌', + 'ldca' => '⤶', + 'ldrdhar' => '⥧', + 'ldrushar' => '⥋', + 'ldsh' => '↲', + 'lfisht' => '⥼', + 'lHar' => '⥢', + 'lhard' => '↽', + 'lharu' => '↼', + 'lharul' => '⥪', + 'llarr' => '⇇', + 'llhard' => '⥫', + 'loarr' => '⇽', + 'lrarr' => '⇆', + 'lrhar' => '⇋', + 'lrhard' => '⥭', + 'lsh' => '↰', + 'lurdshar' => '⥊', + 'luruhar' => '⥦', + 'Map' => '⤅', + 'map' => '↦', + 'midcir' => '⫰', + 'mumap' => '⊸', + 'nearhk' => '⤤', + 'neArr' => '⇗', + 'nearr' => '↗', + 'nesear' => '⤨', + 'nhArr' => '⇎', + 'nharr' => '↮', + 'nlArr' => '⇍', + 'nlarr' => '↚', + 'nrArr' => '⇏', + 'nrarr' => '↛', + 'nrarrc' => '⤳̸', + 'nrarrw' => '↝̸', + 'nvHarr' => '⤄', + 'nvlArr' => '⤂', + 'nvrArr' => '⤃', + 'nwarhk' => '⤣', + 'nwArr' => '⇖', + 'nwarr' => '↖', + 'nwnear' => '⤧', + 'olarr' => '↺', + 'orarr' => '↻', + 'origof' => '⊶', + 'rAarr' => '⇛', + 'Rarr' => '↠', + 'rarrap' => '⥵', + 'rarrbfs' => '⤠', + 'rarrc' => '⤳', + 'rarrfs' => '⤞', + 'rarrhk' => '↪', + 'rarrlp' => '↬', + 'rarrpl' => '⥅', + 'rarrsim' => '⥴', + 'Rarrtl' => '⤖', + 'rarrtl' => '↣', + 'rarrw' => '↝', + 'rAtail' => '⤜', + 'ratail' => '⤚', + 'RBarr' => '⤐', + 'rBarr' => '⤏', + 'rbarr' => '⤍', + 'rdca' => '⤷', + 'rdldhar' => '⥩', + 'rdsh' => '↳', + 'rfisht' => '⥽', + 'rHar' => '⥤', + 'rhard' => '⇁', + 'rharu' => '⇀', + 'rharul' => '⥬', + 'rlarr' => '⇄', + 'rlhar' => '⇌', + 'roarr' => '⇾', + 'rrarr' => '⇉', + 'rsh' => '↱', + 'ruluhar' => '⥨', + 'searhk' => '⤥', + 'seArr' => '⇘', + 'searr' => '↘', + 'seswar' => '⤩', + 'simrarr' => '⥲', + 'slarr' => '←', + 'srarr' => '→', + 'swarhk' => '⤦', + 'swArr' => '⇙', + 'swarr' => '↙', + 'swnwar' => '⤪', + 
'Uarr' => '↟', + 'uArr' => '⇑', + 'Uarrocir' => '⥉', + 'udarr' => '⇅', + 'udhar' => '⥮', + 'ufisht' => '⥾', + 'uHar' => '⥣', + 'uharl' => '↿', + 'uharr' => '↾', + 'uuarr' => '⇈', + 'vArr' => '⇕', + 'varr' => '↕', + 'xhArr' => '⟺', + 'xharr' => '⟷', + 'xlArr' => '⟸', + 'xlarr' => '⟵', + 'xmap' => '⟼', + 'xrArr' => '⟹', + 'xrarr' => '⟶', + 'zigrarr' => '⇝', + 'ac' => '∾', + 'acE' => '∾̳', + 'amalg' => '⨿', + 'barvee' => '⊽', + 'Barwed' => '⌆', + 'barwed' => '⌅', + 'bsolb' => '⧅', + 'Cap' => '⋒', + 'capand' => '⩄', + 'capbrcup' => '⩉', + 'capcap' => '⩋', + 'capcup' => '⩇', + 'capdot' => '⩀', + 'caps' => '∩︀', + 'ccaps' => '⩍', + 'ccups' => '⩌', + 'ccupssm' => '⩐', + 'coprod' => '∐', + 'Cup' => '⋓', + 'cupbrcap' => '⩈', + 'cupcap' => '⩆', + 'cupcup' => '⩊', + 'cupdot' => '⊍', + 'cupor' => '⩅', + 'cups' => '∪︀', + 'cuvee' => '⋎', + 'cuwed' => '⋏', + 'Dagger' => '‡', + 'dagger' => '†', + 'diam' => '⋄', + 'divonx' => '⋇', + 'eplus' => '⩱', + 'hercon' => '⊹', + 'intcal' => '⊺', + 'iprod' => '⨼', + 'loplus' => '⨭', + 'lotimes' => '⨴', + 'lthree' => '⋋', + 'ltimes' => '⋉', + 'midast' => '*', + 'minusb' => '⊟', + 'minusd' => '∸', + 'minusdu' => '⨪', + 'ncap' => '⩃', + 'ncup' => '⩂', + 'oast' => '⊛', + 'ocir' => '⊚', + 'odash' => '⊝', + 'odiv' => '⨸', + 'odot' => '⊙', + 'odsold' => '⦼', + 'ofcir' => '⦿', + 'ogt' => '⧁', + 'ohbar' => '⦵', + 'olcir' => '⦾', + 'olt' => '⧀', + 'omid' => '⦶', + 'ominus' => '⊖', + 'opar' => '⦷', + 'operp' => '⦹', + 'oplus' => '⊕', + 'osol' => '⊘', + 'Otimes' => '⨷', + 'otimes' => '⊗', + 'otimesas' => '⨶', + 'ovbar' => '⌽', + 'plusacir' => '⨣', + 'plusb' => '⊞', + 'pluscir' => '⨢', + 'plusdo' => '∔', + 'plusdu' => '⨥', + 'pluse' => '⩲', + 'plussim' => '⨦', + 'plustwo' => '⨧', + 'prod' => '∏', + 'race' => '⧚', + 'roplus' => '⨮', + 'rotimes' => '⨵', + 'rthree' => '⋌', + 'rtimes' => '⋊', + 'sdot' => '⋅', + 'sdotb' => '⊡', + 'setmn' => '∖', + 'simplus' => '⨤', + 'smashp' => '⨳', + 'solb' => '⧄', + 'sqcap' => '⊓', + 'sqcaps' => '⊓︀', + 'sqcup' => '⊔', + 
'sqcups' => '⊔︀', + 'ssetmn' => '∖', + 'sstarf' => '⋆', + 'subdot' => '⪽', + 'sum' => '∑', + 'supdot' => '⪾', + 'timesb' => '⊠', + 'timesbar' => '⨱', + 'timesd' => '⨰', + 'tridot' => '◬', + 'triminus' => '⨺', + 'triplus' => '⨹', + 'trisb' => '⧍', + 'tritime' => '⨻', + 'uplus' => '⊎', + 'veebar' => '⊻', + 'wedbar' => '⩟', + 'wreath' => '≀', + 'xcap' => '⋂', + 'xcirc' => '◯', + 'xcup' => '⋃', + 'xdtri' => '▽', + 'xodot' => '⨀', + 'xoplus' => '⨁', + 'xotime' => '⨂', + 'xsqcup' => '⨆', + 'xuplus' => '⨄', + 'xutri' => '△', + 'xvee' => '⋁', + 'xwedge' => '⋀', + 'dlcorn' => '⌞', + 'drcorn' => '⌟', + 'gtlPar' => '⦕', + 'langd' => '⦑', + 'lbrke' => '⦋', + 'lbrksld' => '⦏', + 'lbrkslu' => '⦍', + 'lceil' => '⌈', + 'lfloor' => '⌊', + 'lmoust' => '⎰', + 'lparlt' => '⦓', + 'ltrPar' => '⦖', + 'rangd' => '⦒', + 'rbrke' => '⦌', + 'rbrksld' => '⦎', + 'rbrkslu' => '⦐', + 'rceil' => '⌉', + 'rfloor' => '⌋', + 'rmoust' => '⎱', + 'rpargt' => '⦔', + 'ulcorn' => '⌜', + 'urcorn' => '⌝', + 'gnap' => '⪊', + 'gnE' => '≩', + 'gne' => '⪈', + 'gnsim' => '⋧', + 'gvnE' => '≩︀', + 'lnap' => '⪉', + 'lnE' => '≨', + 'lne' => '⪇', + 'lnsim' => '⋦', + 'lvnE' => '≨︀', + 'nap' => '≉', + 'napE' => '⩰̸', + 'napid' => '≋̸', + 'ncong' => '≇', + 'ncongdot' => '⩭̸', + 'nequiv' => '≢', + 'ngE' => '≧̸', + 'nge' => '≱', + 'nges' => '⩾̸', + 'nGg' => '⋙̸', + 'ngsim' => '≵', + 'nGt' => '≫⃒', + 'ngt' => '≯', + 'nGtv' => '≫̸', + 'nlE' => '≦̸', + 'nle' => '≰', + 'nles' => '⩽̸', + 'nLl' => '⋘̸', + 'nlsim' => '≴', + 'nLt' => '≪⃒', + 'nlt' => '≮', + 'nltri' => '⋪', + 'nltrie' => '⋬', + 'nLtv' => '≪̸', + 'nmid' => '∤', + 'npar' => '∦', + 'npr' => '⊀', + 'nprcue' => '⋠', + 'npre' => '⪯̸', + 'nrtri' => '⋫', + 'nrtrie' => '⋭', + 'nsc' => '⊁', + 'nsccue' => '⋡', + 'nsce' => '⪰̸', + 'nsim' => '≁', + 'nsime' => '≄', + 'nsmid' => '∤', + 'nspar' => '∦', + 'nsqsube' => '⋢', + 'nsqsupe' => '⋣', + 'nsub' => '⊄', + 'nsubE' => '⫅̸', + 'nsube' => '⊈', + 'nsup' => '⊅', + 'nsupE' => '⫆̸', + 'nsupe' => '⊉', + 'ntgl' => '≹', + 'ntlg' => '≸', 
+ 'nvap' => '≍⃒', + 'nVDash' => '⊯', + 'nVdash' => '⊮', + 'nvDash' => '⊭', + 'nvdash' => '⊬', + 'nvge' => '≥⃒', + 'nvgt' => '>⃒', + 'nvle' => '≤⃒', + 'nvltrie' => '⊴⃒', + 'nvrtrie' => '⊵⃒', + 'nvsim' => '∼⃒', + 'parsim' => '⫳', + 'prnap' => '⪹', + 'prnE' => '⪵', + 'prnsim' => '⋨', + 'rnmid' => '⫮', + 'scnap' => '⪺', + 'scnE' => '⪶', + 'scnsim' => '⋩', + 'simne' => '≆', + 'solbar' => '⌿', + 'subnE' => '⫋', + 'subne' => '⊊', + 'supnE' => '⫌', + 'supne' => '⊋', + 'vnsub' => '⊂⃒', + 'vnsup' => '⊃⃒', + 'vsubnE' => '⫋︀', + 'vsubne' => '⊊︀', + 'vsupnE' => '⫌︀', + 'vsupne' => '⊋︀', + 'ang' => '∠', + 'ange' => '⦤', + 'angmsd' => '∡', + 'angmsdaa' => '⦨', + 'angmsdab' => '⦩', + 'angmsdac' => '⦪', + 'angmsdad' => '⦫', + 'angmsdae' => '⦬', + 'angmsdaf' => '⦭', + 'angmsdag' => '⦮', + 'angmsdah' => '⦯', + 'angrtvb' => '⊾', + 'angrtvbd' => '⦝', + 'bbrk' => '⎵', + 'bbrktbrk' => '⎶', + 'bemptyv' => '⦰', + 'beth' => 'ℶ', + 'boxbox' => '⧉', + 'bprime' => '‵', + 'bsemi' => '⁏', + 'cemptyv' => '⦲', + 'cirE' => '⧃', + 'cirscir' => '⧂', + 'comp' => '∁', + 'daleth' => 'ℸ', + 'demptyv' => '⦱', + 'ell' => 'ℓ', + 'empty' => '∅', + 'emptyv' => '∅', + 'gimel' => 'ℷ', + 'iiota' => '℩', + 'image' => 'ℑ', + 'imath' => 'ı', + 'jmath' => 'j', + 'laemptyv' => '⦴', + 'lltri' => '◺', + 'lrtri' => '⊿', + 'mho' => '℧', + 'nang' => '∠⃒', + 'nexist' => '∄', + 'oS' => 'Ⓢ', + 'planck' => 'ℏ', + 'plankv' => 'ℏ', + 'raemptyv' => '⦳', + 'range' => '⦥', + 'real' => 'ℜ', + 'tbrk' => '⎴', + 'trpezium' => '�', + 'ultri' => '◸', + 'urtri' => '◹', + 'vzigzag' => '⦚', + 'weierp' => '℘', + 'apE' => '⩰', + 'ape' => '≊', + 'apid' => '≋', + 'asymp' => '≈', + 'Barv' => '⫧', + 'bcong' => '≌', + 'bepsi' => '϶', + 'bowtie' => '⋈', + 'bsim' => '∽', + 'bsime' => '⋍', + 'bsolhsub' => '\⊂', + 'bump' => '≎', + 'bumpE' => '⪮', + 'bumpe' => '≏', + 'cire' => '≗', + 'Colon' => '∷', + 'Colone' => '⩴', + 'colone' => '≔', + 'congdot' => '⩭', + 'csub' => '⫏', + 'csube' => '⫑', + 'csup' => '⫐', + 'csupe' => '⫒', + 'cuepr' => '⋞', + 
'cuesc' => '⋟', + 'Dashv' => '⫤', + 'dashv' => '⊣', + 'easter' => '⩮', + 'ecir' => '≖', + 'ecolon' => '≕', + 'eDDot' => '⩷', + 'eDot' => '≑', + 'efDot' => '≒', + 'eg' => '⪚', + 'egs' => '⪖', + 'egsdot' => '⪘', + 'el' => '⪙', + 'els' => '⪕', + 'elsdot' => '⪗', + 'equest' => '≟', + 'equivDD' => '⩸', + 'erDot' => '≓', + 'esdot' => '≐', + 'Esim' => '⩳', + 'esim' => '≂', + 'fork' => '⋔', + 'forkv' => '⫙', + 'frown' => '⌢', + 'gap' => '⪆', + 'gE' => '≧', + 'gEl' => '⪌', + 'gel' => '⋛', + 'ges' => '⩾', + 'gescc' => '⪩', + 'gesdot' => '⪀', + 'gesdoto' => '⪂', + 'gesdotol' => '⪄', + 'gesl' => '⋛︀', + 'gesles' => '⪔', + 'Gg' => '⋙', + 'gl' => '≷', + 'gla' => '⪥', + 'glE' => '⪒', + 'glj' => '⪤', + 'gsim' => '≳', + 'gsime' => '⪎', + 'gsiml' => '⪐', + 'Gt' => '≫', + 'gtcc' => '⪧', + 'gtcir' => '⩺', + 'gtdot' => '⋗', + 'gtquest' => '⩼', + 'gtrarr' => '⥸', + 'homtht' => '∻', + 'lap' => '⪅', + 'lat' => '⪫', + 'late' => '⪭', + 'lates' => '⪭︀', + 'lE' => '≦', + 'lEg' => '⪋', + 'leg' => '⋚', + 'les' => '⩽', + 'lescc' => '⪨', + 'lesdot' => '⩿', + 'lesdoto' => '⪁', + 'lesdotor' => '⪃', + 'lesg' => '⋚︀', + 'lesges' => '⪓', + 'lg' => '≶', + 'lgE' => '⪑', + 'Ll' => '⋘', + 'lsim' => '≲', + 'lsime' => '⪍', + 'lsimg' => '⪏', + 'Lt' => '≪', + 'ltcc' => '⪦', + 'ltcir' => '⩹', + 'ltdot' => '⋖', + 'ltlarr' => '⥶', + 'ltquest' => '⩻', + 'ltrie' => '⊴', + 'mcomma' => '⨩', + 'mDDot' => '∺', + 'mid' => '∣', + 'mlcp' => '⫛', + 'models' => '⊧', + 'mstpos' => '∾', + 'Pr' => '⪻', + 'pr' => '≺', + 'prap' => '⪷', + 'prcue' => '≼', + 'prE' => '⪳', + 'pre' => '⪯', + 'prsim' => '≾', + 'prurel' => '⊰', + 'ratio' => '∶', + 'rtrie' => '⊵', + 'rtriltri' => '⧎', + 'Sc' => '⪼', + 'sc' => '≻', + 'scap' => '⪸', + 'sccue' => '≽', + 'scE' => '⪴', + 'sce' => '⪰', + 'scsim' => '≿', + 'sdote' => '⩦', + 'sfrown' => '⌢', + 'simg' => '⪞', + 'simgE' => '⪠', + 'siml' => '⪝', + 'simlE' => '⪟', + 'smid' => '∣', + 'smile' => '⌣', + 'smt' => '⪪', + 'smte' => '⪬', + 'smtes' => '⪬︀', + 'spar' => '∥', + 'sqsub' => '⊏', + 'sqsube' => 
'⊑', + 'sqsup' => '⊐', + 'sqsupe' => '⊒', + 'ssmile' => '⌣', + 'Sub' => '⋐', + 'subE' => '⫅', + 'subedot' => '⫃', + 'submult' => '⫁', + 'subplus' => '⪿', + 'subrarr' => '⥹', + 'subsim' => '⫇', + 'subsub' => '⫕', + 'subsup' => '⫓', + 'Sup' => '⋑', + 'supdsub' => '⫘', + 'supE' => '⫆', + 'supedot' => '⫄', + 'suphsol' => '⊃/', + 'suphsub' => '⫗', + 'suplarr' => '⥻', + 'supmult' => '⫂', + 'supplus' => '⫀', + 'supsim' => '⫈', + 'supsub' => '⫔', + 'supsup' => '⫖', + 'thkap' => '≈', + 'thksim' => '∼', + 'topfork' => '⫚', + 'trie' => '≜', + 'twixt' => '≬', + 'Vbar' => '⫫', + 'vBar' => '⫨', + 'vBarv' => '⫩', + 'VDash' => '⊫', + 'Vdash' => '⊩', + 'vDash' => '⊨', + 'vdash' => '⊢', + 'Vdashl' => '⫦', + 'vltri' => '⊲', + 'vprop' => '∝', + 'vrtri' => '⊳', + 'Vvdash' => '⊪', + 'alpha' => 'α', + 'beta' => 'β', + 'chi' => 'χ', + 'Delta' => 'Δ', + 'delta' => 'δ', + 'epsi' => 'ϵ', + 'epsiv' => 'ε', + 'eta' => 'η', + 'Gamma' => 'Γ', + 'gamma' => 'γ', + 'Gammad' => 'Ϝ', + 'gammad' => 'ϝ', + 'iota' => 'ι', + 'kappa' => 'κ', + 'kappav' => 'ϰ', + 'Lambda' => 'Λ', + 'lambda' => 'λ', + 'mu' => 'μ', + 'nu' => 'ν', + 'Omega' => 'Ω', + 'omega' => 'ω', + 'Phi' => 'Φ', + 'phi' => 'ϕ', + 'phiv' => 'φ', + 'Pi' => 'Π', + 'pi' => 'π', + 'piv' => 'ϖ', + 'Psi' => 'Ψ', + 'psi' => 'ψ', + 'rho' => 'ρ', + 'rhov' => 'ϱ', + 'Sigma' => 'Σ', + 'sigma' => 'σ', + 'sigmav' => 'ς', + 'tau' => 'τ', + 'Theta' => 'Θ', + 'theta' => 'θ', + 'thetav' => 'ϑ', + 'Upsi' => 'ϒ', + 'upsi' => 'υ', + 'Xi' => 'Ξ', + 'xi' => 'ξ', + 'zeta' => 'ζ', + 'Afr' => '𝔄', + 'afr' => '𝔞', + 'Bfr' => '𝔅', + 'bfr' => '𝔟', + 'Cfr' => 'ℭ', + 'cfr' => '𝔠', + 'Dfr' => '𝔇', + 'dfr' => '𝔡', + 'Efr' => '𝔈', + 'efr' => '𝔢', + 'Ffr' => '𝔉', + 'ffr' => '𝔣', + 'Gfr' => '𝔊', + 'gfr' => '𝔤', + 'Hfr' => 'ℌ', + 'hfr' => '𝔥', + 'Ifr' => 'ℑ', + 'ifr' => '𝔦', + 'Jfr' => '𝔍', + 'jfr' => '𝔧', + 'Kfr' => '𝔎', + 'kfr' => '𝔨', + 'Lfr' => '𝔏', + 'lfr' => '𝔩', + 'Mfr' => '𝔐', + 'mfr' => '𝔪', + 'Nfr' => '𝔑', + 'nfr' => '𝔫', + 'Ofr' => '𝔒', + 'ofr' => '𝔬', + 'Pfr' => 
'𝔓', + 'pfr' => '𝔭', + 'Qfr' => '𝔔', + 'qfr' => '𝔮', + 'Rfr' => 'ℜ', + 'rfr' => '𝔯', + 'Sfr' => '𝔖', + 'sfr' => '𝔰', + 'Tfr' => '𝔗', + 'tfr' => '𝔱', + 'Ufr' => '𝔘', + 'ufr' => '𝔲', + 'Vfr' => '𝔙', + 'vfr' => '𝔳', + 'Wfr' => '𝔚', + 'wfr' => '𝔴', + 'Xfr' => '𝔛', + 'xfr' => '𝔵', + 'Yfr' => '𝔜', + 'yfr' => '𝔶', + 'Zfr' => 'ℨ', + 'zfr' => '𝔷', + 'Aopf' => '𝔸', + 'Bopf' => '𝔹', + 'Copf' => 'ℂ', + 'Dopf' => '𝔻', + 'Eopf' => '𝔼', + 'Fopf' => '𝔽', + 'Gopf' => '𝔾', + 'Hopf' => 'ℍ', + 'Iopf' => '𝕀', + 'Jopf' => '𝕁', + 'Kopf' => '𝕂', + 'Lopf' => '𝕃', + 'Mopf' => '𝕄', + 'Nopf' => 'ℕ', + 'Oopf' => '𝕆', + 'Popf' => 'ℙ', + 'Qopf' => 'ℚ', + 'Ropf' => 'ℝ', + 'Sopf' => '𝕊', + 'Topf' => '𝕋', + 'Uopf' => '𝕌', + 'Vopf' => '𝕍', + 'Wopf' => '𝕎', + 'Xopf' => '𝕏', + 'Yopf' => '𝕐', + 'Zopf' => 'ℤ', + 'Ascr' => '𝒜', + 'ascr' => '𝒶', + 'Bscr' => 'ℬ', + 'bscr' => '𝒷', + 'Cscr' => '𝒞', + 'cscr' => '𝒸', + 'Dscr' => '𝒟', + 'dscr' => '𝒹', + 'Escr' => 'ℰ', + 'escr' => 'ℯ', + 'Fscr' => 'ℱ', + 'fscr' => '𝒻', + 'Gscr' => '𝒢', + 'gscr' => 'ℊ', + 'Hscr' => 'ℋ', + 'hscr' => '𝒽', + 'Iscr' => 'ℐ', + 'iscr' => '𝒾', + 'Jscr' => '𝒥', + 'jscr' => '𝒿', + 'Kscr' => '𝒦', + 'kscr' => '𝓀', + 'Lscr' => 'ℒ', + 'lscr' => '𝓁', + 'Mscr' => 'ℳ', + 'mscr' => '𝓂', + 'Nscr' => '𝒩', + 'nscr' => '𝓃', + 'Oscr' => '𝒪', + 'oscr' => 'ℴ', + 'Pscr' => '𝒫', + 'pscr' => '𝓅', + 'Qscr' => '𝒬', + 'qscr' => '𝓆', + 'Rscr' => 'ℛ', + 'rscr' => '𝓇', + 'Sscr' => '𝒮', + 'sscr' => '𝓈', + 'Tscr' => '𝒯', + 'tscr' => '𝓉', + 'Uscr' => '𝒰', + 'uscr' => '𝓊', + 'Vscr' => '𝒱', + 'vscr' => '𝓋', + 'Wscr' => '𝒲', + 'wscr' => '𝓌', + 'Xscr' => '𝒳', + 'xscr' => '𝓍', + 'Yscr' => '𝒴', + 'yscr' => '𝓎', + 'Zscr' => '𝒵', + 'zscr' => '𝓏', + 'acd' => '∿', + 'aleph' => 'ℵ', + 'And' => '⩓', + 'and' => '∧', + 'andand' => '⩕', + 'andd' => '⩜', + 'andslope' => '⩘', + 'andv' => '⩚', + 'angrt' => '∟', + 'angsph' => '∢', + 'angst' => 'Å', + 'ap' => '≈', + 'apacir' => '⩯', + 'awconint' => '∳', + 'awint' => '⨑', + 'becaus' => '∵', + 'bernou' => 'ℬ', + 'bne' => '=⃥', + 
'bnequiv' => '≡⃥', + 'bNot' => '⫭', + 'bnot' => '⌐', + 'bottom' => '⊥', + 'cap' => '∩', + 'Cconint' => '∰', + 'cirfnint' => '⨐', + 'compfn' => '∘', + 'cong' => '≅', + 'Conint' => '∯', + 'conint' => '∮', + 'ctdot' => '⋯', + 'cup' => '∪', + 'cwconint' => '∲', + 'cwint' => '∱', + 'cylcty' => '⌭', + 'disin' => '⋲', + 'Dot' => '¨', + 'DotDot' => '⃜', + 'dsol' => '⧶', + 'dtdot' => '⋱', + 'dwangle' => '⦦', + 'elinters' => '�', + 'epar' => '⋕', + 'eparsl' => '⧣', + 'equiv' => '≡', + 'eqvparsl' => '⧥', + 'exist' => '∃', + 'fltns' => '▱', + 'fnof' => 'ƒ', + 'forall' => '∀', + 'fpartint' => '⨍', + 'ge' => '≥', + 'hamilt' => 'ℋ', + 'iff' => '⇔', + 'iinfin' => '⧜', + 'imped' => 'Ƶ', + 'infin' => '∞', + 'infintie' => '⧝', + 'Int' => '∬', + 'int' => '∫', + 'intlarhk' => '⨗', + 'isin' => '∈', + 'isindot' => '⋵', + 'isinE' => '⋹', + 'isins' => '⋴', + 'isinsv' => '⋳', + 'isinv' => '∈', + 'lagran' => 'ℒ', + 'Lang' => '《', + 'lang' => '〈', + 'lArr' => '⇐', + 'lbbrk' => '〔', + 'le' => '≤', + 'loang' => '〘', + 'lobrk' => '〚', + 'lopar' => '⦅', + 'lowast' => '∗', + 'minus' => '−', + 'mnplus' => '∓', + 'nabla' => '∇', + 'ne' => '≠', + 'nedot' => '≐̸', + 'nhpar' => '⫲', + 'ni' => '∋', + 'nis' => '⋼', + 'nisd' => '⋺', + 'niv' => '∋', + 'Not' => '⫬', + 'notin' => '∉', + 'notindot' => '⋵̸', + 'notinE' => '⋹̸', + 'notinva' => '∉', + 'notinvb' => '⋷', + 'notinvc' => '⋶', + 'notni' => '∌', + 'notniva' => '∌', + 'notnivb' => '⋾', + 'notnivc' => '⋽', + 'nparsl' => '⫽⃥', + 'npart' => '∂̸', + 'npolint' => '⨔', + 'nvinfin' => '⧞', + 'olcross' => '⦻', + 'Or' => '⩔', + 'or' => '∨', + 'ord' => '⩝', + 'order' => 'ℴ', + 'oror' => '⩖', + 'orslope' => '⩗', + 'orv' => '⩛', + 'par' => '∥', + 'parsl' => '⫽', + 'part' => '∂', + 'permil' => '‰', + 'perp' => '⊥', + 'pertenk' => '‱', + 'phmmat' => 'ℳ', + 'pointint' => '⨕', + 'Prime' => '″', + 'prime' => '′', + 'profalar' => '⌮', + 'profline' => '⌒', + 'profsurf' => '⌓', + 'prop' => '∝', + 'qint' => '⨌', + 'qprime' => '⁗', + 'quatint' => '⨖', + 'radic' => '√', + 
'Rang' => '》', + 'rang' => '〉', + 'rArr' => '⇒', + 'rbbrk' => '〕', + 'roang' => '〙', + 'robrk' => '〛', + 'ropar' => '⦆', + 'rppolint' => '⨒', + 'scpolint' => '⨓', + 'sim' => '∼', + 'simdot' => '⩪', + 'sime' => '≃', + 'smeparsl' => '⧤', + 'square' => '□', + 'squarf' => '▪', + 'strns' => '¯', + 'sub' => '⊂', + 'sube' => '⊆', + 'sup' => '⊃', + 'supe' => '⊇', + 'tdot' => '⃛', + 'there4' => '∴', + 'tint' => '∭', + 'top' => '⊤', + 'topbot' => '⌶', + 'topcir' => '⫱', + 'tprime' => '‴', + 'utdot' => '⋰', + 'uwangle' => '⦧', + 'vangrt' => '⦜', + 'veeeq' => '≚', + 'Verbar' => '‖', + 'wedgeq' => '≙', + 'xnis' => '⋻', + 'boxDL' => '╗', + 'boxDl' => '╖', + 'boxdL' => '╕', + 'boxdl' => '┐', + 'boxDR' => '╔', + 'boxDr' => '╓', + 'boxdR' => '╒', + 'boxdr' => '┌', + 'boxH' => '═', + 'boxh' => '─', + 'boxHD' => '╦', + 'boxHd' => '╤', + 'boxhD' => '╥', + 'boxhd' => '┬', + 'boxHU' => '╩', + 'boxHu' => '╧', + 'boxhU' => '╨', + 'boxhu' => '┴', + 'boxUL' => '╝', + 'boxUl' => '╜', + 'boxuL' => '╛', + 'boxul' => '┘', + 'boxUR' => '╚', + 'boxUr' => '╙', + 'boxuR' => '╘', + 'boxur' => '└', + 'boxV' => '║', + 'boxv' => '│', + 'boxVH' => '╬', + 'boxVh' => '╫', + 'boxvH' => '╪', + 'boxvh' => '┼', + 'boxVL' => '╣', + 'boxVl' => '╢', + 'boxvL' => '╡', + 'boxvl' => '┤', + 'boxVR' => '╠', + 'boxVr' => '╟', + 'boxvR' => '╞', + 'boxvr' => '├', + 'Acy' => 'А', + 'acy' => 'а', + 'Bcy' => 'Б', + 'bcy' => 'б', + 'CHcy' => 'Ч', + 'chcy' => 'ч', + 'Dcy' => 'Д', + 'dcy' => 'д', + 'Ecy' => 'Э', + 'ecy' => 'э', + 'Fcy' => 'Ф', + 'fcy' => 'ф', + 'Gcy' => 'Г', + 'gcy' => 'г', + 'HARDcy' => 'Ъ', + 'hardcy' => 'ъ', + 'Icy' => 'И', + 'icy' => 'и', + 'IEcy' => 'Е', + 'iecy' => 'е', + 'IOcy' => 'Ё', + 'iocy' => 'ё', + 'Jcy' => 'Й', + 'jcy' => 'й', + 'Kcy' => 'К', + 'kcy' => 'к', + 'KHcy' => 'Х', + 'khcy' => 'х', + 'Lcy' => 'Л', + 'lcy' => 'л', + 'Mcy' => 'М', + 'mcy' => 'м', + 'Ncy' => 'Н', + 'ncy' => 'н', + 'numero' => '№', + 'Ocy' => 'О', + 'ocy' => 'о', + 'Pcy' => 'П', + 'pcy' => 'п', + 'Rcy' => 'Р', + 'rcy' => 
'р', + 'Scy' => 'С', + 'scy' => 'с', + 'SHCHcy' => 'Щ', + 'shchcy' => 'щ', + 'SHcy' => 'Ш', + 'shcy' => 'ш', + 'SOFTcy' => 'Ь', + 'softcy' => 'ь', + 'Tcy' => 'Т', + 'tcy' => 'т', + 'TScy' => 'Ц', + 'tscy' => 'ц', + 'Ucy' => 'У', + 'ucy' => 'у', + 'Vcy' => 'В', + 'vcy' => 'в', + 'YAcy' => 'Я', + 'yacy' => 'я', + 'Ycy' => 'Ы', + 'ycy' => 'ы', + 'YUcy' => 'Ю', + 'yucy' => 'ю', + 'Zcy' => 'З', + 'zcy' => 'з', + 'ZHcy' => 'Ж', + 'zhcy' => 'ж', + 'DJcy' => 'Ђ', + 'djcy' => 'ђ', + 'DScy' => 'Ѕ', + 'dscy' => 'ѕ', + 'DZcy' => 'Џ', + 'dzcy' => 'џ', + 'GJcy' => 'Ѓ', + 'gjcy' => 'ѓ', + 'Iukcy' => 'І', + 'iukcy' => 'і', + 'Jsercy' => 'Ј', + 'jsercy' => 'ј', + 'Jukcy' => 'Є', + 'jukcy' => 'є', + 'KJcy' => 'Ќ', + 'kjcy' => 'ќ', + 'LJcy' => 'Љ', + 'ljcy' => 'љ', + 'NJcy' => 'Њ', + 'njcy' => 'њ', + 'TSHcy' => 'Ћ', + 'tshcy' => 'ћ', + 'Ubrcy' => 'Ў', + 'ubrcy' => 'ў', + 'YIcy' => 'Ї', + 'yicy' => 'ї', + 'acute' => '´', + 'breve' => '˘', + 'caron' => 'ˇ', + 'cedil' => '¸', + 'circ' => 'ˆ', + 'dblac' => '˝', + 'die' => '¨', + 'dot' => '˙', + 'grave' => '`', + 'macr' => '¯', + 'ogon' => '˛', + 'ring' => '˚', + 'tilde' => '˜', + 'uml' => '¨', + 'Aacute' => 'Á', + 'aacute' => 'á', + 'Acirc' => 'Â', + 'acirc' => 'â', + 'AElig' => 'Æ', + 'aelig' => 'æ', + 'Agrave' => 'À', + 'agrave' => 'à', + 'Aring' => 'Å', + 'aring' => 'å', + 'Atilde' => 'Ã', + 'atilde' => 'ã', + 'Auml' => 'Ä', + 'auml' => 'ä', + 'Ccedil' => 'Ç', + 'ccedil' => 'ç', + 'Eacute' => 'É', + 'eacute' => 'é', + 'Ecirc' => 'Ê', + 'ecirc' => 'ê', + 'Egrave' => 'È', + 'egrave' => 'è', + 'ETH' => 'Ð', + 'eth' => 'ð', + 'Euml' => 'Ë', + 'euml' => 'ë', + 'Iacute' => 'Í', + 'iacute' => 'í', + 'Icirc' => 'Î', + 'icirc' => 'î', + 'Igrave' => 'Ì', + 'igrave' => 'ì', + 'Iuml' => 'Ï', + 'iuml' => 'ï', + 'Ntilde' => 'Ñ', + 'ntilde' => 'ñ', + 'Oacute' => 'Ó', + 'oacute' => 'ó', + 'Ocirc' => 'Ô', + 'ocirc' => 'ô', + 'Ograve' => 'Ò', + 'ograve' => 'ò', + 'Oslash' => 'Ø', + 'oslash' => 'ø', + 'Otilde' => 'Õ', + 'otilde' => 'õ', + 'Ouml' => 'Ö', 
+ 'ouml' => 'ö', + 'szlig' => 'ß', + 'THORN' => 'Þ', + 'thorn' => 'þ', + 'Uacute' => 'Ú', + 'uacute' => 'ú', + 'Ucirc' => 'Û', + 'ucirc' => 'û', + 'Ugrave' => 'Ù', + 'ugrave' => 'ù', + 'Uuml' => 'Ü', + 'uuml' => 'ü', + 'Yacute' => 'Ý', + 'yacute' => 'ý', + 'yuml' => 'ÿ', + 'Abreve' => 'Ă', + 'abreve' => 'ă', + 'Amacr' => 'Ā', + 'amacr' => 'ā', + 'Aogon' => 'Ą', + 'aogon' => 'ą', + 'Cacute' => 'Ć', + 'cacute' => 'ć', + 'Ccaron' => 'Č', + 'ccaron' => 'č', + 'Ccirc' => 'Ĉ', + 'ccirc' => 'ĉ', + 'Cdot' => 'Ċ', + 'cdot' => 'ċ', + 'Dcaron' => 'Ď', + 'dcaron' => 'ď', + 'Dstrok' => 'Đ', + 'dstrok' => 'đ', + 'Ecaron' => 'Ě', + 'ecaron' => 'ě', + 'Edot' => 'Ė', + 'edot' => 'ė', + 'Emacr' => 'Ē', + 'emacr' => 'ē', + 'ENG' => 'Ŋ', + 'eng' => 'ŋ', + 'Eogon' => 'Ę', + 'eogon' => 'ę', + 'gacute' => 'ǵ', + 'Gbreve' => 'Ğ', + 'gbreve' => 'ğ', + 'Gcedil' => 'Ģ', + 'Gcirc' => 'Ĝ', + 'gcirc' => 'ĝ', + 'Gdot' => 'Ġ', + 'gdot' => 'ġ', + 'Hcirc' => 'Ĥ', + 'hcirc' => 'ĥ', + 'Hstrok' => 'Ħ', + 'hstrok' => 'ħ', + 'Idot' => 'İ', + 'IJlig' => 'IJ', + 'ijlig' => 'ij', + 'Imacr' => 'Ī', + 'imacr' => 'ī', + 'inodot' => 'ı', + 'Iogon' => 'Į', + 'iogon' => 'į', + 'Itilde' => 'Ĩ', + 'itilde' => 'ĩ', + 'Jcirc' => 'Ĵ', + 'jcirc' => 'ĵ', + 'Kcedil' => 'Ķ', + 'kcedil' => 'ķ', + 'kgreen' => 'ĸ', + 'Lacute' => 'Ĺ', + 'lacute' => 'ĺ', + 'Lcaron' => 'Ľ', + 'lcaron' => 'ľ', + 'Lcedil' => 'Ļ', + 'lcedil' => 'ļ', + 'Lmidot' => 'Ŀ', + 'lmidot' => 'ŀ', + 'Lstrok' => 'Ł', + 'lstrok' => 'ł', + 'Nacute' => 'Ń', + 'nacute' => 'ń', + 'napos' => 'ʼn', + 'Ncaron' => 'Ň', + 'ncaron' => 'ň', + 'Ncedil' => 'Ņ', + 'ncedil' => 'ņ', + 'Odblac' => 'Ő', + 'odblac' => 'ő', + 'OElig' => 'Œ', + 'oelig' => 'œ', + 'Omacr' => 'Ō', + 'omacr' => 'ō', + 'Racute' => 'Ŕ', + 'racute' => 'ŕ', + 'Rcaron' => 'Ř', + 'rcaron' => 'ř', + 'Rcedil' => 'Ŗ', + 'rcedil' => 'ŗ', + 'Sacute' => 'Ś', + 'sacute' => 'ś', + 'Scaron' => 'Š', + 'scaron' => 'š', + 'Scedil' => 'Ş', + 'scedil' => 'ş', + 'Scirc' => 'Ŝ', + 'scirc' => 'ŝ', + 'Tcaron' => 'Ť', + 
'tcaron' => 'ť', + 'Tcedil' => 'Ţ', + 'tcedil' => 'ţ', + 'Tstrok' => 'Ŧ', + 'tstrok' => 'ŧ', + 'Ubreve' => 'Ŭ', + 'ubreve' => 'ŭ', + 'Udblac' => 'Ű', + 'udblac' => 'ű', + 'Umacr' => 'Ū', + 'umacr' => 'ū', + 'Uogon' => 'Ų', + 'uogon' => 'ų', + 'Uring' => 'Ů', + 'uring' => 'ů', + 'Utilde' => 'Ũ', + 'utilde' => 'ũ', + 'Wcirc' => 'Ŵ', + 'wcirc' => 'ŵ', + 'Ycirc' => 'Ŷ', + 'ycirc' => 'ŷ', + 'Yuml' => 'Ÿ', + 'Zacute' => 'Ź', + 'zacute' => 'ź', + 'Zcaron' => 'Ž', + 'zcaron' => 'ž', + 'Zdot' => 'Ż', + 'zdot' => 'ż', + 'apos' => ''', + 'ast' => '*', + 'brvbar' => '¦', + 'bsol' => '\', + 'cent' => '¢', + 'colon' => ':', + 'comma' => ',', + 'commat' => '@', + 'copy' => '©', + 'curren' => '¤', + 'darr' => '↓', + 'deg' => '°', + 'divide' => '÷', + 'dollar' => '$', + 'equals' => '=', + 'excl' => '!', + 'frac12' => '½', + 'frac14' => '¼', + 'frac18' => '⅛', + 'frac34' => '¾', + 'frac38' => '⅜', + 'frac58' => '⅝', + 'frac78' => '⅞', + 'gt' => '>', + 'half' => '½', + 'horbar' => '―', + 'hyphen' => '‐', + 'iexcl' => '¡', + 'iquest' => '¿', + 'laquo' => '«', + 'larr' => '←', + 'lcub' => '{', + 'ldquo' => '“', + 'lowbar' => '_', + 'lpar' => '(', + 'lsqb' => '[', + 'lsquo' => '‘', + 'micro' => 'µ', + 'middot' => '·', + 'nbsp' => ' ', + 'not' => '¬', + 'num' => '#', + 'ohm' => 'Ω', + 'ordf' => 'ª', + 'ordm' => 'º', + 'para' => '¶', + 'percnt' => '%', + 'period' => '.', + 'plus' => '+', + 'plusmn' => '±', + 'pound' => '£', + 'quest' => '?', + 'quot' => '"', + 'raquo' => '»', + 'rarr' => '→', + 'rcub' => '}', + 'rdquo' => '”', + 'reg' => '®', + 'rpar' => ')', + 'rsqb' => ']', + 'rsquo' => '’', + 'sect' => '§', + 'semi' => ';', + 'shy' => '­', + 'sol' => '/', + 'sung' => '♪', + 'sup1' => '¹', + 'sup2' => '²', + 'sup3' => '³', + 'times' => '×', + 'trade' => '™', + 'uarr' => '↑', + 'verbar' => '|', + 'yen' => '¥', + 'blank' => '␣', + 'blk12' => '▒', + 'blk14' => '░', + 'blk34' => '▓', + 'block' => '█', + 'bull' => '•', + 'caret' => '⁁', + 'check' => '✓', + 'cir' => '○', + 'clubs' => '♣', + 
'copysr' => '℗', + 'cross' => '✗', + 'Dagger' => '‡', + 'dagger' => '†', + 'dash' => '‐', + 'diams' => '♦', + 'dlcrop' => '⌍', + 'drcrop' => '⌌', + 'dtri' => '▿', + 'dtrif' => '▾', + 'emsp' => ' ', + 'emsp13' => ' ', + 'emsp14' => ' ', + 'ensp' => ' ', + 'female' => '♀', + 'ffilig' => 'ffi', + 'fflig' => 'ff', + 'ffllig' => 'ffl', + 'filig' => 'fi', + 'flat' => '♭', + 'fllig' => 'fl', + 'frac13' => '⅓', + 'frac15' => '⅕', + 'frac16' => '⅙', + 'frac23' => '⅔', + 'frac25' => '⅖', + 'frac35' => '⅗', + 'frac45' => '⅘', + 'frac56' => '⅚', + 'hairsp' => ' ', + 'hearts' => '♥', + 'hellip' => '…', + 'hybull' => '⁃', + 'incare' => '℅', + 'ldquor' => '„', + 'lhblk' => '▄', + 'loz' => '◊', + 'lozf' => '⧫', + 'lsquor' => '‚', + 'ltri' => '◃', + 'ltrif' => '◂', + 'male' => '♂', + 'malt' => '✠', + 'marker' => '▮', + 'mdash' => '—', + 'mldr' => '…', + 'natur' => '♮', + 'ndash' => '–', + 'nldr' => '‥', + 'numsp' => ' ', + 'phone' => '☎', + 'puncsp' => ' ', + 'rdquor' => '”', + 'rect' => '▭', + 'rsquor' => '’', + 'rtri' => '▹', + 'rtrif' => '▸', + 'rx' => '℞', + 'sext' => '✶', + 'sharp' => '♯', + 'spades' => '♠', + 'squ' => '□', + 'squf' => '▪', + 'star' => '☆', + 'starf' => '★', + 'target' => '⌖', + 'telrec' => '⌕', + 'thinsp' => ' ', + 'uhblk' => '▀', + 'ulcrop' => '⌏', + 'urcrop' => '⌎', + 'utri' => '▵', + 'utrif' => '▴', + 'vellip' => '⋮', + 'af' => '⁡', + 'aopf' => '𝕒', + 'asympeq' => '≍', + 'bopf' => '𝕓', + 'copf' => '𝕔', + 'Cross' => '⨯', + 'DD' => 'ⅅ', + 'dd' => 'ⅆ', + 'dopf' => '𝕕', + 'DownArrowBar' => '⤓', + 'DownBreve' => '̑', + 'DownLeftRightVector' => '⥐', + 'DownLeftTeeVector' => '⥞', + 'DownLeftVectorBar' => '⥖', + 'DownRightTeeVector' => '⥟', + 'DownRightVectorBar' => '⥗', + 'ee' => 'ⅇ', + 'EmptySmallSquare' => '◻', + 'EmptyVerySmallSquare' => '▫', + 'eopf' => '𝕖', + 'Equal' => '⩵', + 'FilledSmallSquare' => '◼', + 'FilledVerySmallSquare' => '▪', + 'fopf' => '𝕗', + 'gopf' => '𝕘', + 'GreaterGreater' => '⪢', + 'Hat' => '^', + 'hopf' => '𝕙', + 'HorizontalLine' => '─', + 
'ic' => '⁣', + 'ii' => 'ⅈ', + 'iopf' => '𝕚', + 'it' => '⁢', + 'jopf' => '𝕛', + 'kopf' => '𝕜', + 'larrb' => '⇤', + 'LeftDownTeeVector' => '⥡', + 'LeftDownVectorBar' => '⥙', + 'LeftRightVector' => '⥎', + 'LeftTeeVector' => '⥚', + 'LeftTriangleBar' => '⧏', + 'LeftUpDownVector' => '⥑', + 'LeftUpTeeVector' => '⥠', + 'LeftUpVectorBar' => '⥘', + 'LeftVectorBar' => '⥒', + 'LessLess' => '⪡', + 'lopf' => '𝕝', + 'mapstodown' => '↧', + 'mapstoleft' => '↤', + 'mapstoup' => '↥', + 'MediumSpace' => ' ', + 'mopf' => '𝕞', + 'nbump' => '≎̸', + 'nbumpe' => '≏̸', + 'nesim' => '≂̸', + 'NewLine' => ' ', + 'NoBreak' => '⁠', + 'nopf' => '𝕟', + 'NotCupCap' => '≭', + 'NotHumpEqual' => '≏̸', + 'NotLeftTriangleBar' => '⧏̸', + 'NotNestedGreaterGreater' => '⪢̸', + 'NotNestedLessLess' => '⪡̸', + 'NotRightTriangleBar' => '⧐̸', + 'NotSquareSubset' => '⊏̸', + 'NotSquareSuperset' => '⊐̸', + 'NotSucceedsTilde' => '≿̸', + 'oopf' => '𝕠', + 'OverBar' => '¯', + 'OverBrace' => '︷', + 'OverBracket' => '⎴', + 'OverParenthesis' => '︵', + 'planckh' => 'ℎ', + 'popf' => '𝕡', + 'Product' => '∏', + 'qopf' => '𝕢', + 'rarrb' => '⇥', + 'RightDownTeeVector' => '⥝', + 'RightDownVectorBar' => '⥕', + 'RightTeeVector' => '⥛', + 'RightTriangleBar' => '⧐', + 'RightUpDownVector' => '⥏', + 'RightUpTeeVector' => '⥜', + 'RightUpVectorBar' => '⥔', + 'RightVectorBar' => '⥓', + 'ropf' => '𝕣', + 'RoundImplies' => '⥰', + 'RuleDelayed' => '⧴', + 'sopf' => '𝕤', + 'Tab' => ' ', + 'ThickSpace' => '   ', + 'topf' => '𝕥', + 'UnderBar' => '̲', + 'UnderBrace' => '︸', + 'UnderBracket' => '⎵', + 'UnderParenthesis' => '︶', + 'uopf' => '𝕦', + 'UpArrowBar' => '⤒', + 'Upsilon' => 'Υ', + 'VerticalLine' => '|', + 'VerticalSeparator' => '❘', + 'vopf' => '𝕧', + 'wopf' => '𝕨', + 'xopf' => '𝕩', + 'yopf' => '𝕪', + 'ZeroWidthSpace' => '​', + 'zopf' => '𝕫', + 'angle' => '∠', + 'ApplyFunction' => '⁡', + 'approx' => '≈', + 'approxeq' => '≊', + 'Assign' => '≔', + 'backcong' => '≌', + 'backepsilon' => '϶', + 'backprime' => '‵', + 'backsim' => '∽', + 
'backsimeq' => '⋍', + 'Backslash' => '∖', + 'barwedge' => '⌅', + 'Because' => '∵', + 'because' => '∵', + 'Bernoullis' => 'ℬ', + 'between' => '≬', + 'bigcap' => '⋂', + 'bigcirc' => '◯', + 'bigcup' => '⋃', + 'bigodot' => '⨀', + 'bigoplus' => '⨁', + 'bigotimes' => '⨂', + 'bigsqcup' => '⨆', + 'bigstar' => '★', + 'bigtriangledown' => '▽', + 'bigtriangleup' => '△', + 'biguplus' => '⨄', + 'bigvee' => '⋁', + 'bigwedge' => '⋀', + 'bkarow' => '⤍', + 'blacklozenge' => '⧫', + 'blacksquare' => '▪', + 'blacktriangle' => '▴', + 'blacktriangledown' => '▾', + 'blacktriangleleft' => '◂', + 'blacktriangleright' => '▸', + 'bot' => '⊥', + 'boxminus' => '⊟', + 'boxplus' => '⊞', + 'boxtimes' => '⊠', + 'Breve' => '˘', + 'bullet' => '•', + 'Bumpeq' => '≎', + 'bumpeq' => '≏', + 'CapitalDifferentialD' => 'ⅅ', + 'Cayleys' => 'ℭ', + 'Cedilla' => '¸', + 'CenterDot' => '·', + 'centerdot' => '·', + 'checkmark' => '✓', + 'circeq' => '≗', + 'circlearrowleft' => '↺', + 'circlearrowright' => '↻', + 'circledast' => '⊛', + 'circledcirc' => '⊚', + 'circleddash' => '⊝', + 'CircleDot' => '⊙', + 'circledR' => '®', + 'circledS' => 'Ⓢ', + 'CircleMinus' => '⊖', + 'CirclePlus' => '⊕', + 'CircleTimes' => '⊗', + 'ClockwiseContourIntegral' => '∲', + 'CloseCurlyDoubleQuote' => '”', + 'CloseCurlyQuote' => '’', + 'clubsuit' => '♣', + 'coloneq' => '≔', + 'complement' => '∁', + 'complexes' => 'ℂ', + 'Congruent' => '≡', + 'ContourIntegral' => '∮', + 'Coproduct' => '∐', + 'CounterClockwiseContourIntegral' => '∳', + 'CupCap' => '≍', + 'curlyeqprec' => '⋞', + 'curlyeqsucc' => '⋟', + 'curlyvee' => '⋎', + 'curlywedge' => '⋏', + 'curvearrowleft' => '↶', + 'curvearrowright' => '↷', + 'dbkarow' => '⤏', + 'ddagger' => '‡', + 'ddotseq' => '⩷', + 'Del' => '∇', + 'DiacriticalAcute' => '´', + 'DiacriticalDot' => '˙', + 'DiacriticalDoubleAcute' => '˝', + 'DiacriticalGrave' => '`', + 'DiacriticalTilde' => '˜', + 'Diamond' => '⋄', + 'diamond' => '⋄', + 'diamondsuit' => '♦', + 'DifferentialD' => 'ⅆ', + 'digamma' => 'ϝ', + 'div' => '÷', 
+ 'divideontimes' => '⋇', + 'doteq' => '≐', + 'doteqdot' => '≑', + 'DotEqual' => '≐', + 'dotminus' => '∸', + 'dotplus' => '∔', + 'dotsquare' => '⊡', + 'doublebarwedge' => '⌆', + 'DoubleContourIntegral' => '∯', + 'DoubleDot' => '¨', + 'DoubleDownArrow' => '⇓', + 'DoubleLeftArrow' => '⇐', + 'DoubleLeftRightArrow' => '⇔', + 'DoubleLeftTee' => '⫤', + 'DoubleLongLeftArrow' => '⟸', + 'DoubleLongLeftRightArrow' => '⟺', + 'DoubleLongRightArrow' => '⟹', + 'DoubleRightArrow' => '⇒', + 'DoubleRightTee' => '⊨', + 'DoubleUpArrow' => '⇑', + 'DoubleUpDownArrow' => '⇕', + 'DoubleVerticalBar' => '∥', + 'DownArrow' => '↓', + 'Downarrow' => '⇓', + 'downarrow' => '↓', + 'DownArrowUpArrow' => '⇵', + 'downdownarrows' => '⇊', + 'downharpoonleft' => '⇃', + 'downharpoonright' => '⇂', + 'DownLeftVector' => '↽', + 'DownRightVector' => '⇁', + 'DownTee' => '⊤', + 'DownTeeArrow' => '↧', + 'drbkarow' => '⤐', + 'Element' => '∈', + 'emptyset' => '∅', + 'eqcirc' => '≖', + 'eqcolon' => '≕', + 'eqsim' => '≂', + 'eqslantgtr' => '⪖', + 'eqslantless' => '⪕', + 'EqualTilde' => '≂', + 'Equilibrium' => '⇌', + 'Exists' => '∃', + 'expectation' => 'ℰ', + 'ExponentialE' => 'ⅇ', + 'exponentiale' => 'ⅇ', + 'fallingdotseq' => '≒', + 'ForAll' => '∀', + 'Fouriertrf' => 'ℱ', + 'geq' => '≥', + 'geqq' => '≧', + 'geqslant' => '⩾', + 'gg' => '≫', + 'ggg' => '⋙', + 'gnapprox' => '⪊', + 'gneq' => '⪈', + 'gneqq' => '≩', + 'GreaterEqual' => '≥', + 'GreaterEqualLess' => '⋛', + 'GreaterFullEqual' => '≧', + 'GreaterLess' => '≷', + 'GreaterSlantEqual' => '⩾', + 'GreaterTilde' => '≳', + 'gtrapprox' => '⪆', + 'gtrdot' => '⋗', + 'gtreqless' => '⋛', + 'gtreqqless' => '⪌', + 'gtrless' => '≷', + 'gtrsim' => '≳', + 'gvertneqq' => '≩︀', + 'Hacek' => 'ˇ', + 'hbar' => 'ℏ', + 'heartsuit' => '♥', + 'HilbertSpace' => 'ℋ', + 'hksearow' => '⤥', + 'hkswarow' => '⤦', + 'hookleftarrow' => '↩', + 'hookrightarrow' => '↪', + 'hslash' => 'ℏ', + 'HumpDownHump' => '≎', + 'HumpEqual' => '≏', + 'iiiint' => '⨌', + 'iiint' => '∭', + 'Im' => 'ℑ', + 
'ImaginaryI' => 'ⅈ', + 'imagline' => 'ℐ', + 'imagpart' => 'ℑ', + 'Implies' => '⇒', + 'in' => '∈', + 'integers' => 'ℤ', + 'Integral' => '∫', + 'intercal' => '⊺', + 'Intersection' => '⋂', + 'intprod' => '⨼', + 'InvisibleComma' => '⁣', + 'InvisibleTimes' => '⁢', + 'langle' => '〈', + 'Laplacetrf' => 'ℒ', + 'lbrace' => '{', + 'lbrack' => '[', + 'LeftAngleBracket' => '〈', + 'LeftArrow' => '←', + 'Leftarrow' => '⇐', + 'leftarrow' => '←', + 'LeftArrowBar' => '⇤', + 'LeftArrowRightArrow' => '⇆', + 'leftarrowtail' => '↢', + 'LeftCeiling' => '⌈', + 'LeftDoubleBracket' => '〚', + 'LeftDownVector' => '⇃', + 'LeftFloor' => '⌊', + 'leftharpoondown' => '↽', + 'leftharpoonup' => '↼', + 'leftleftarrows' => '⇇', + 'LeftRightArrow' => '↔', + 'Leftrightarrow' => '⇔', + 'leftrightarrow' => '↔', + 'leftrightarrows' => '⇆', + 'leftrightharpoons' => '⇋', + 'leftrightsquigarrow' => '↭', + 'LeftTee' => '⊣', + 'LeftTeeArrow' => '↤', + 'leftthreetimes' => '⋋', + 'LeftTriangle' => '⊲', + 'LeftTriangleEqual' => '⊴', + 'LeftUpVector' => '↿', + 'LeftVector' => '↼', + 'leq' => '≤', + 'leqq' => '≦', + 'leqslant' => '⩽', + 'lessapprox' => '⪅', + 'lessdot' => '⋖', + 'lesseqgtr' => '⋚', + 'lesseqqgtr' => '⪋', + 'LessEqualGreater' => '⋚', + 'LessFullEqual' => '≦', + 'LessGreater' => '≶', + 'lessgtr' => '≶', + 'lesssim' => '≲', + 'LessSlantEqual' => '⩽', + 'LessTilde' => '≲', + 'll' => '≪', + 'llcorner' => '⌞', + 'Lleftarrow' => '⇚', + 'lmoustache' => '⎰', + 'lnapprox' => '⪉', + 'lneq' => '⪇', + 'lneqq' => '≨', + 'LongLeftArrow' => '⟵', + 'Longleftarrow' => '⟸', + 'longleftarrow' => '⟵', + 'LongLeftRightArrow' => '⟷', + 'Longleftrightarrow' => '⟺', + 'longleftrightarrow' => '⟷', + 'longmapsto' => '⟼', + 'LongRightArrow' => '⟶', + 'Longrightarrow' => '⟹', + 'longrightarrow' => '⟶', + 'looparrowleft' => '↫', + 'looparrowright' => '↬', + 'LowerLeftArrow' => '↙', + 'LowerRightArrow' => '↘', + 'lozenge' => '◊', + 'lrcorner' => '⌟', + 'Lsh' => '↰', + 'lvertneqq' => '≨︀', + 'maltese' => '✠', + 'mapsto' => '↦', + 
'measuredangle' => '∡', + 'Mellintrf' => 'ℳ', + 'MinusPlus' => '∓', + 'mp' => '∓', + 'multimap' => '⊸', + 'napprox' => '≉', + 'natural' => '♮', + 'naturals' => 'ℕ', + 'nearrow' => '↗', + 'NegativeMediumSpace' => '​', + 'NegativeThickSpace' => '​', + 'NegativeThinSpace' => '​', + 'NegativeVeryThinSpace' => '​', + 'NestedGreaterGreater' => '≫', + 'NestedLessLess' => '≪', + 'nexists' => '∄', + 'ngeq' => '≱', + 'ngeqq' => '≧̸', + 'ngeqslant' => '⩾̸', + 'ngtr' => '≯', + 'nLeftarrow' => '⇍', + 'nleftarrow' => '↚', + 'nLeftrightarrow' => '⇎', + 'nleftrightarrow' => '↮', + 'nleq' => '≰', + 'nleqq' => '≦̸', + 'nleqslant' => '⩽̸', + 'nless' => '≮', + 'NonBreakingSpace' => ' ', + 'NotCongruent' => '≢', + 'NotDoubleVerticalBar' => '∦', + 'NotElement' => '∉', + 'NotEqual' => '≠', + 'NotEqualTilde' => '≂̸', + 'NotExists' => '∄', + 'NotGreater' => '≯', + 'NotGreaterEqual' => '≱', + 'NotGreaterFullEqual' => '≦̸', + 'NotGreaterGreater' => '≫̸', + 'NotGreaterLess' => '≹', + 'NotGreaterSlantEqual' => '⩾̸', + 'NotGreaterTilde' => '≵', + 'NotHumpDownHump' => '≎̸', + 'NotLeftTriangle' => '⋪', + 'NotLeftTriangleEqual' => '⋬', + 'NotLess' => '≮', + 'NotLessEqual' => '≰', + 'NotLessGreater' => '≸', + 'NotLessLess' => '≪̸', + 'NotLessSlantEqual' => '⩽̸', + 'NotLessTilde' => '≴', + 'NotPrecedes' => '⊀', + 'NotPrecedesEqual' => '⪯̸', + 'NotPrecedesSlantEqual' => '⋠', + 'NotReverseElement' => '∌', + 'NotRightTriangle' => '⋫', + 'NotRightTriangleEqual' => '⋭', + 'NotSquareSubsetEqual' => '⋢', + 'NotSquareSupersetEqual' => '⋣', + 'NotSubset' => '⊂⃒', + 'NotSubsetEqual' => '⊈', + 'NotSucceeds' => '⊁', + 'NotSucceedsEqual' => '⪰̸', + 'NotSucceedsSlantEqual' => '⋡', + 'NotSuperset' => '⊃⃒', + 'NotSupersetEqual' => '⊉', + 'NotTilde' => '≁', + 'NotTildeEqual' => '≄', + 'NotTildeFullEqual' => '≇', + 'NotTildeTilde' => '≉', + 'NotVerticalBar' => '∤', + 'nparallel' => '∦', + 'nprec' => '⊀', + 'npreceq' => '⪯̸', + 'nRightarrow' => '⇏', + 'nrightarrow' => '↛', + 'nshortmid' => '∤', + 'nshortparallel' => 
'∦', + 'nsimeq' => '≄', + 'nsubset' => '⊂⃒', + 'nsubseteq' => '⊈', + 'nsubseteqq' => '⫅̸', + 'nsucc' => '⊁', + 'nsucceq' => '⪰̸', + 'nsupset' => '⊃⃒', + 'nsupseteq' => '⊉', + 'nsupseteqq' => '⫆̸', + 'ntriangleleft' => '⋪', + 'ntrianglelefteq' => '⋬', + 'ntriangleright' => '⋫', + 'ntrianglerighteq' => '⋭', + 'nwarrow' => '↖', + 'oint' => '∮', + 'OpenCurlyDoubleQuote' => '“', + 'OpenCurlyQuote' => '‘', + 'orderof' => 'ℴ', + 'parallel' => '∥', + 'PartialD' => '∂', + 'pitchfork' => '⋔', + 'PlusMinus' => '±', + 'pm' => '±', + 'Poincareplane' => 'ℌ', + 'prec' => '≺', + 'precapprox' => '⪷', + 'preccurlyeq' => '≼', + 'Precedes' => '≺', + 'PrecedesEqual' => '⪯', + 'PrecedesSlantEqual' => '≼', + 'PrecedesTilde' => '≾', + 'preceq' => '⪯', + 'precnapprox' => '⪹', + 'precneqq' => '⪵', + 'precnsim' => '⋨', + 'precsim' => '≾', + 'primes' => 'ℙ', + 'Proportion' => '∷', + 'Proportional' => '∝', + 'propto' => '∝', + 'quaternions' => 'ℍ', + 'questeq' => '≟', + 'rangle' => '〉', + 'rationals' => 'ℚ', + 'rbrace' => '}', + 'rbrack' => ']', + 'Re' => 'ℜ', + 'realine' => 'ℛ', + 'realpart' => 'ℜ', + 'reals' => 'ℝ', + 'ReverseElement' => '∋', + 'ReverseEquilibrium' => '⇋', + 'ReverseUpEquilibrium' => '⥯', + 'RightAngleBracket' => '〉', + 'RightArrow' => '→', + 'Rightarrow' => '⇒', + 'rightarrow' => '→', + 'RightArrowBar' => '⇥', + 'RightArrowLeftArrow' => '⇄', + 'rightarrowtail' => '↣', + 'RightCeiling' => '⌉', + 'RightDoubleBracket' => '〛', + 'RightDownVector' => '⇂', + 'RightFloor' => '⌋', + 'rightharpoondown' => '⇁', + 'rightharpoonup' => '⇀', + 'rightleftarrows' => '⇄', + 'rightleftharpoons' => '⇌', + 'rightrightarrows' => '⇉', + 'rightsquigarrow' => '↝', + 'RightTee' => '⊢', + 'RightTeeArrow' => '↦', + 'rightthreetimes' => '⋌', + 'RightTriangle' => '⊳', + 'RightTriangleEqual' => '⊵', + 'RightUpVector' => '↾', + 'RightVector' => '⇀', + 'risingdotseq' => '≓', + 'rmoustache' => '⎱', + 'Rrightarrow' => '⇛', + 'Rsh' => '↱', + 'searrow' => '↘', + 'setminus' => '∖', + 'ShortDownArrow' => '↓', + 
'ShortLeftArrow' => '←', + 'shortmid' => '∣', + 'shortparallel' => '∥', + 'ShortRightArrow' => '→', + 'ShortUpArrow' => '↑', + 'simeq' => '≃', + 'SmallCircle' => '∘', + 'smallsetminus' => '∖', + 'spadesuit' => '♠', + 'Sqrt' => '√', + 'sqsubset' => '⊏', + 'sqsubseteq' => '⊑', + 'sqsupset' => '⊐', + 'sqsupseteq' => '⊒', + 'Square' => '□', + 'SquareIntersection' => '⊓', + 'SquareSubset' => '⊏', + 'SquareSubsetEqual' => '⊑', + 'SquareSuperset' => '⊐', + 'SquareSupersetEqual' => '⊒', + 'SquareUnion' => '⊔', + 'Star' => '⋆', + 'straightepsilon' => 'ϵ', + 'straightphi' => 'ϕ', + 'Subset' => '⋐', + 'subset' => '⊂', + 'subseteq' => '⊆', + 'subseteqq' => '⫅', + 'SubsetEqual' => '⊆', + 'subsetneq' => '⊊', + 'subsetneqq' => '⫋', + 'succ' => '≻', + 'succapprox' => '⪸', + 'succcurlyeq' => '≽', + 'Succeeds' => '≻', + 'SucceedsEqual' => '⪰', + 'SucceedsSlantEqual' => '≽', + 'SucceedsTilde' => '≿', + 'succeq' => '⪰', + 'succnapprox' => '⪺', + 'succneqq' => '⪶', + 'succnsim' => '⋩', + 'succsim' => '≿', + 'SuchThat' => '∋', + 'Sum' => '∑', + 'Superset' => '⊃', + 'SupersetEqual' => '⊇', + 'Supset' => '⋑', + 'supset' => '⊃', + 'supseteq' => '⊇', + 'supseteqq' => '⫆', + 'supsetneq' => '⊋', + 'supsetneqq' => '⫌', + 'swarrow' => '↙', + 'Therefore' => '∴', + 'therefore' => '∴', + 'thickapprox' => '≈', + 'thicksim' => '∼', + 'ThinSpace' => ' ', + 'Tilde' => '∼', + 'TildeEqual' => '≃', + 'TildeFullEqual' => '≅', + 'TildeTilde' => '≈', + 'toea' => '⤨', + 'tosa' => '⤩', + 'triangle' => '▵', + 'triangledown' => '▿', + 'triangleleft' => '◃', + 'trianglelefteq' => '⊴', + 'triangleq' => '≜', + 'triangleright' => '▹', + 'trianglerighteq' => '⊵', + 'TripleDot' => '⃛', + 'twoheadleftarrow' => '↞', + 'twoheadrightarrow' => '↠', + 'ulcorner' => '⌜', + 'Union' => '⋃', + 'UnionPlus' => '⊎', + 'UpArrow' => '↑', + 'Uparrow' => '⇑', + 'uparrow' => '↑', + 'UpArrowDownArrow' => '⇅', + 'UpDownArrow' => '↕', + 'Updownarrow' => '⇕', + 'updownarrow' => '↕', + 'UpEquilibrium' => '⥮', + 'upharpoonleft' => '↿', + 
'upharpoonright' => '↾', + 'UpperLeftArrow' => '↖', + 'UpperRightArrow' => '↗', + 'upsilon' => 'υ', + 'UpTee' => '⊥', + 'UpTeeArrow' => '↥', + 'upuparrows' => '⇈', + 'urcorner' => '⌝', + 'varepsilon' => 'ε', + 'varkappa' => 'ϰ', + 'varnothing' => '∅', + 'varphi' => 'φ', + 'varpi' => 'ϖ', + 'varpropto' => '∝', + 'varrho' => 'ϱ', + 'varsigma' => 'ς', + 'varsubsetneq' => '⊊︀', + 'varsubsetneqq' => '⫋︀', + 'varsupsetneq' => '⊋︀', + 'varsupsetneqq' => '⫌︀', + 'vartheta' => 'ϑ', + 'vartriangleleft' => '⊲', + 'vartriangleright' => '⊳', + 'Vee' => '⋁', + 'vee' => '∨', + 'Vert' => '‖', + 'vert' => '|', + 'VerticalBar' => '∣', + 'VerticalTilde' => '≀', + 'VeryThinSpace' => ' ', + 'Wedge' => '⋀', + 'wedge' => '∧', + 'wp' => '℘', + 'wr' => '≀', + 'zeetrf' => 'ℨ' + } +#:startdoc: + +# Converts XHTML+MathML named entities in string to Numeric Character References +# +# :call-seq: +# string.to_ncr -> string +# + def to_ncr + self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr} + end + +# Converts XHTML+MathML named entities in string to Numeric Character References +# +# :call-seq: +# string.to_ncr! -> str or nil +# +# Substitution is done in-place. +# + def to_ncr! + self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_ncr} + end + +# Converts XHTML+MathML named entities in string to UTF-8 +# +# :call-seq: +# string.to_utf8 -> string +# +#-- + def to_utf8 + self.gsub(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8} + + # You might think this is faster, but it isn't + # pieces = self.split(/&([a-zA-Z0-9]+);/) + # 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8} + # pieces.join + end + +#++ +# Converts XHTML+MathML named entities in string to UTF-8 +# +# :call-seq: +# string.to_ncr! -> str or nil +# +# Substitution is done in-place. +# + def to_utf8! 
+ self.gsub!(/&([a-zA-Z0-9]+);/) {|m| $1.convert_to_utf8} + end + +#:stopdoc: + + def unescapeHTML + self.gsub(/&(.*?);/n) do + match = $1.dup + case match + when /\Aamp\z/ni then '&' + when /\Aquot\z/ni then '"' + when /\Agt\z/ni then '>' + when /\Alt\z/ni then '<' + when /\A#0*(\d+)\z/n then + if Integer($1) < 256 + Integer($1).chr + else + if Integer($1) < 1114111 + [Integer($1)].pack("U") + else + "&##{$1};" + end + end + when /\A#x([0-9a-f]+)\z/ni then + if $1.hex < 256 + $1.hex.chr + else + if $1.hex < 1114111 + [$1.hex].pack("U") + else + "&#x#{$1};" + end + end + else + "&#{match};" + end + end + end + + protected + + def convert_to_ncr #:nodoc: + if self =~ /^(lt|gt|amp|quot|apos)$/ + self.replace "&" + self + ";" + elsif MATHML_ENTITIES.has_key?(self) + self.replace MATHML_ENTITIES[self] + else + self.replace "&" + self + ";" + end + end + + def convert_to_utf8 #:nodoc: + if self =~ /^(lt|gt|amp|quot|apos)$/ + self.replace "&" + self + ";" + elsif MATHML_ENTITIES.has_key?(self) + self.replace MATHML_ENTITIES[self].split(';').collect {|s| s.gsub(/^&#x([A-F0-9]+)$/, '\1').hex }.pack('U*') + else + self.replace "&" + self + ";" + end + end + + +end diff --git a/lib/wiki_content.rb b/lib/wiki_content.rb index 75a846e5..521e5b4f 100644 --- a/lib/wiki_content.rb +++ b/lib/wiki_content.rb @@ -7,8 +7,6 @@ require_dependency 'chunks/literal' require 'chunks/nowiki' require 'sanitize' -include Sanitize - # Wiki content is just a string that can process itself with a chain of # actions. The actions can modify wiki content so that certain parts of # it are protected from being rendered by later actions. 
@@ -116,6 +114,7 @@ end class WikiContent < String include ChunkManager + include Sanitize DEFAULT_OPTS = { :active_chunks => ACTIVE_CHUNKS, diff --git a/test/sanitizer.dat b/test/sanitizer.dat new file mode 100644 index 00000000..ec781cb9 --- /dev/null +++ b/test/sanitizer.dat @@ -0,0 +1,475 @@ +[ + { + "name": "IE_Comments", + "input": "", + "output": "", + "xhtml": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->" + }, + + { + "name": "IE_Comments_2", + "input": "", + "output": "<script>alert('XSS');</script>", + "xhtml": "<![if !IE 5]><script>alert('XSS');</script><![endif]>", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "allow_colons_in_path_component", + "input": "foo", + "output": "foo" + }, + + { + "name": "background_attribute", + "input": "
", + "output": "
", + "xhtml": "
", + "rexml": "
" + }, + + { + "name": "bgsound", + "input": "", + "output": "<bgsound src=\"javascript:alert('XSS');\"/>", + "xhtml": "<bgsound src='javascript:alert('XSS');'/>", + "rexml": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>" + }, + + { + "name": "div_background_image_unicode_encoded", + "input": "
foo
", + "output": "
foo
" + }, + + { + "name": "div_expression", + "input": "
foo
", + "output": "
foo
" + }, + + { + "name": "double_open_angle_brackets", + "input": "", + "xhtml": "<", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "double_open_angle_brackets_2", + "input": "", + "output": "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>", + "xhtml": "<script/></script>", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "non_alpha_non_digit_2", + "input": "foo", + "output": "foo", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "non_alpha_non_digit_3", + "input": "", + "output": "", + "xhtml": "", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "non_alpha_non_digit_II", + "input": "foo", + "output": "foo", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "non_alpha_non_digit_III", + "input": "foo", + "output": "foo", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "platypus", + "input": "never trust your upstream platypus", + "output": "never trust your upstream platypus" + }, + + { + "name": "protocol_resolution_in_script_tag", + "input": "", + "output": "<script src=\"//ha.ckers.org/.j\"></script>", + "xhtml": "<script src/></script>", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "should_allow_anchors", + "input": "", + "output": "<script>baz</script>", + "xhtml": "<script>baz</script>" + }, + + { + "name": "should_allow_image_alt_attribute", + "input": "foo", + "output": "foo", + "rexml": "foo" + }, + + { + "name": "should_allow_image_height_attribute", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_allow_image_src_attribute", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_allow_image_width_attribute", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_handle_blank_text", + "input": "", + "output": "" + }, + + { + "name": "should_handle_malformed_image_tags", + "input": "\">", + "output": "<script>alert(\"XSS\")</script>\">", + "xhtml": "", + "rexml": "Ill-formed XHTML!" 
+ }, + + { + "name": "should_handle_non_html", + "input": "abc", + "output": "abc" + }, + + { + "name": "should_not_fall_for_ridiculous_hack", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_0", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_1", + "input": "", + "output": "", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "should_not_fall_for_xss_image_hack_10", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_11", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_12", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_13", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_14", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_2", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_3", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_4", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_5", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_6", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_7", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_8", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_9", + "input": "", + "output": "", + "rexml": "" + }, + + { + "name": "should_sanitize_half_open_scripts", + "input": "", + "rexml": "Ill-formed XHTML!" 
+ }, + + { + "name": "should_sanitize_invalid_script_tag", + "input": "", + "output": "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>", + "xhtml": "<script/></script>", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "should_sanitize_script_tag_with_multiple_open_brackets", + "input": "<", + "output": "<<script>alert(\"XSS\");//<</script>", + "xhtml": "<<script>alert("XSS");//<</script>", + "rexml": "Ill-formed XHTML!" + }, + + { + "name": "should_sanitize_script_tag_with_multiple_open_brackets_2", + "input": "