From 800880f3821c91978f9b724059958d55f40e27bd Mon Sep 17 00:00:00 2001
From: Jacques Distler
Date: Tue, 20 May 2008 17:02:10 -0500
Subject: [PATCH] Rough In New Sanitizer

Start work (which may not pan out) on a new sanitizer. Right now, it passes
all but 1 of the HTML5lib Sanitizer's unit tests. But it doesn't do much of
anything to ensure well-formedness. This is not an issue for Maruku-processed
content, but it is a concern for <nowiki> blocks.
(One solution would be to use the HTML5lib parser on <nowiki> blocks.)
In any case, this baby is 3 times as fast as the HTML5lib sanitizer.
---
app/controllers/file_controller.rb | 2 +-
app/controllers/wiki_controller.rb | 2 +-
lib/chunks/category.rb | 2 +-
lib/chunks/engines.rb | 2 +-
lib/node.rb | 532 ++++
lib/sanitize.rb | 4 +-
lib/sanitizer.rb | 198 ++
lib/stringsupport.rb | 2271 +++++++++++++++++
lib/wiki_content.rb | 3 +-
test/sanitizer.dat | 475 ++++
test/unit/chunks/nowiki_test.rb | 12 +
test/unit/page_renderer_test.rb | 20 +-
test/unit/sanitize_test.rb | 2 +
test/unit/sanitizer_test.rb | 142 ++
.../ext/math/mathml_engines/itex2mml.rb | 2 +-
15 files changed, 3657 insertions(+), 12 deletions(-)
create mode 100644 lib/node.rb
create mode 100644 lib/sanitizer.rb
create mode 100644 lib/stringsupport.rb
create mode 100644 test/sanitizer.dat
create mode 100644 test/unit/sanitizer_test.rb
diff --git a/app/controllers/file_controller.rb b/app/controllers/file_controller.rb
index 21cde860..5c04b9e7 100644
--- a/app/controllers/file_controller.rb
+++ b/app/controllers/file_controller.rb
@@ -1,7 +1,7 @@
# Controller responsible for serving files and pictures.
require 'zip/zip'
-require 'sanitize'
+require 'stringsupport'
class FileController < ApplicationController
diff --git a/app/controllers/wiki_controller.rb b/app/controllers/wiki_controller.rb
index 80bd4a8e..e42c3f80 100644
--- a/app/controllers/wiki_controller.rb
+++ b/app/controllers/wiki_controller.rb
@@ -2,7 +2,7 @@ require 'fileutils'
require 'maruku'
require 'parsedate'
require 'zip/zip'
-require 'sanitize'
+require 'stringsupport'
require 'resolv'
class WikiController < ApplicationController
diff --git a/lib/chunks/category.rb b/lib/chunks/category.rb
index f008c85a..67987a4d 100644
--- a/lib/chunks/category.rb
+++ b/lib/chunks/category.rb
@@ -1,5 +1,5 @@
require 'chunks/chunk'
-require 'sanitize'
+require 'stringsupport'
# The category chunk looks for "category: news" on a line by
# itself and parses the terms after the ':' as categories.
diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb
index 0f5300c6..84e984b0 100644
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -1,7 +1,7 @@
$: << File.dirname(__FILE__) + "../../lib"
require_dependency 'chunks/chunk'
-require 'sanitize'
+require 'stringsupport'
# The markup engines are Chunks that call the one of RedCloth
diff --git a/lib/node.rb b/lib/node.rb
new file mode 100644
index 00000000..b0d0eac6
--- /dev/null
+++ b/lib/node.rb
@@ -0,0 +1,532 @@
+require 'strscan'
+
+module XHTML #:nodoc:
+
+ class Conditions < Hash #:nodoc:
+ def initialize(hash) # Builds a validated, normalized conditions Hash from +hash+ (or from a bare value).
+ super()
+ hash = { :content => hash } unless Hash === hash # a non-Hash argument becomes the :content condition
+ hash = keys_to_symbols(hash) # top-level keys are compared as symbols in the case below
+ hash.each do |k,v|
+ case k
+ when :tag, :content then
+ # :tag and :content values are kept exactly as given
+ when :attributes then
+ hash[k] = keys_to_strings(v) # attribute names are stored as strings
+ when :parent, :child, :ancestor, :descendant, :sibling, :before,
+ :after
+ hash[k] = Conditions.new(v) # nested conditions are validated recursively
+ when :children
+ hash[k] = v = keys_to_symbols(v)
+ v.each do |k,v2|
+ case k
+ when :count, :greater_than, :less_than
+ # counting options are kept exactly as given
+ when :only
+ v[k] = Conditions.new(v2) # the :only filter is itself a set of conditions
+ else
+ raise "illegal key #{k.inspect} => #{v2.inspect}"
+ end
+ end
+ else
+ raise "illegal key #{k.inspect} => #{v.inspect}"
+ end
+ end
+ update hash # copy the normalized pairs into this Hash (created empty by super() above)
+ end
+
+ private
+
+ def keys_to_strings(hash) # Returns a new Hash with every key converted via to_s; values are untouched.
+ hash.keys.inject({}) do |h,k|
+ h[k.to_s] = hash[k]
+ h
+ end
+ end
+
+ def keys_to_symbols(hash) # Returns a new Hash with every key converted via to_sym; raises on keys lacking to_sym.
+ hash.keys.inject({}) do |h,k|
+ raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym)
+ h[k.to_sym] = hash[k]
+ h
+ end
+ end
+ end
+
+ # The base class of all nodes, textual and otherwise, in an HTML document.
+ class Node #:nodoc:
+ # The array of children of this node. Not all nodes have children.
+ attr_reader :children
+
+ # The parent node of this node. All nodes have a parent, except for the
+ # root node.
+ attr_reader :parent
+
+ # The line number of the input where this node was begun
+ attr_reader :line
+
+ # The byte position in the input where this node was begun
+ attr_reader :position
+
+ # Create a node that records +parent+ as its parent; +line+ and +pos+ default to 0.
+ def initialize(parent, line=0, pos=0)
+ @parent = parent # only stored here; the parent's children array is not modified
+ @children = [] # every node starts with no children
+ @line, @position = line, pos
+ end
+
+ # Return the to_s of every child concatenated in order (an empty string when there are no children).
+ def to_s
+ s = ""
+ @children.each { |child| s << child.to_s }
+ s
+ end
+
+ # Always returns +false+ and ignores +conditions+ (which may be of any type).
+ # Subclasses override this to provide their own matching behavior.
+ def match(conditions)
+ false
+ end
+
+ # Call #find on each child in order, after normalizing +conditions+ with validate_conditions.
+ # Returns the first truthy result of those calls, or +nil+ when no child produces one.
+ def find(conditions)
+ conditions = validate_conditions(conditions)
+ @children.each do |child|
+ node = child.find(conditions)
+ return node if node # a false or nil result means this child had no match
+ end
+ nil
+ end
+
+ # Collect this node (when it matches) followed by every match found recursively
+ # under its children, and return them as an array (empty when nothing matches).
+ def find_all(conditions)
+ conditions = validate_conditions(conditions)
+
+ matches = []
+ matches << self if match(conditions) # the node itself comes before its descendants
+ @children.each do |child|
+ matches.concat child.find_all(conditions)
+ end
+ matches
+ end
+
+ # Returns +false+: a plain Node is not a tag. Subclasses that represent
+ # markup override this to return +true+.
+ def tag?
+ false
+ end
+
+ def validate_conditions(conditions) # Returns +conditions+ unchanged if already a Conditions, else wraps it in one.
+ Conditions === conditions ? conditions : Conditions.new(conditions)
+ end
+
+ def ==(node) # Structural equality: same class, same child count, and pairwise-equal children.
+ return false unless self.class == node.class && children.size == node.children.size
+
+ equivalent = true
+
+ children.size.times do |i|
+ equivalent &&= children[i] == node.children[i] # once false, later children are no longer compared
+ end
+
+ equivalent
+ end
+
+ class </)
+ return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/\/]+/)
+ end
+ end
+ attributes[attr] = value
+ scanner.skip(/\s*/)
+ end
+
+ closing = ( scanner.scan(/\//) ? :self : nil )
+ end
+
+ unless scanner.scan(/\s*>/)
+ if strict
+ raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
+ else
+ # throw away all text until we find what we're looking for
+ scanner.skip_until(/>/) or scanner.terminate
+ end
+ end
+
+ Tag.new(parent, line, pos, name, attributes, closing)
+ end
+ end
+ end
+ end
+
+ # A node that represents text, rather than markup.
+ class Text < Node #:nodoc:
+
+ attr_reader :content # the text held by this node, as passed to the constructor
+
+ # Creates a new text node as a child of the given parent, storing the given
+ # content unchanged.
+ def initialize(parent, line, pos, content)
+ super(parent, line, pos)
+ @content = content
+ end
+
+ # Returns the content of this node (the stored object itself, not a copy).
+ def to_s
+ @content
+ end
+
+ # Returns +self+ if this node meets the given conditions, otherwise the falsy
+ # result of #match. Text nodes support conditions of the following kinds:
+ #
+ # * if +conditions+ is a string, it must be equal to the whole content of
+ # the node (compared with ==, not a substring test)
+ # * if +conditions+ is a regular expression, it must match the content of
+ # the node
+ # * if +conditions+ is a hash, its :content value (a string or a regexp)
+ # is interpreted as described above; any key other than :content,
+ # :parent or :ancestor makes the match fail.
+ def find(conditions)
+ match(conditions) && self
+ end
+
+ # Returns a truthy value if this node meets the given conditions, and +false+
+ # or +nil+ otherwise. See the discussion of #find for the valid conditions.
+ def match(conditions)
+ case conditions
+ when String
+ @content == conditions
+ when Regexp
+ @content =~ conditions
+ when Hash
+ conditions = validate_conditions(conditions)
+
+ # :parent and :ancestor are tolerated here but only :content is actually tested
+ unless (conditions.keys - [:content, :parent, :ancestor]).empty?
+ return false
+ end
+
+ match(conditions[:content]) # a missing :content falls through to the else branch and yields nil
+ else
+ nil
+ end
+ end
+
+ def ==(node) # Equal when Node#== holds and both nodes have equal content.
+ return false unless super
+ content == node.content
+ end
+ end
+
+ # A CDATA node is simply a text node with a specialized way of displaying
+ # itself.
+ class CDATA < Text #:nodoc:
+ def to_s
+ ""
+ end
+ end
+
+ # A Tag is any node that represents markup. It may be an opening tag, a
+ # closing tag, or a self-closing tag. It has a name, and may have a hash of
+ # attributes.
+ class Tag < Node #:nodoc:
+
+ # Either +nil+, :close , or :self
+ attr_reader :closing
+
+ # Either +nil+, or a hash of attributes for this node.
+ attr_reader :attributes
+
+ # The name of this tag.
+ attr_reader :name
+
+ # Create a new tag node as a child of the given parent. The +name+,
+ # +attributes+ and +closing+ status are stored exactly as given; nothing is
+ # parsed in this constructor.
+ def initialize(parent, line, pos, name, attributes, closing)
+ super(parent, line, pos)
+ @name = name
+ @attributes = attributes
+ @closing = closing
+ end
+
+ # A convenience for looking up +attr+ in the attributes of the node. Returns
+ # +nil+ if the node has no attributes hash at all.
+ def [](attr)
+ @attributes ? @attributes[attr] : nil
+ end
+
+ # Returns non-+nil+ if the name of this tag is in the list below (img, br, hr, ...), i.e. a tag treated as unable to contain child nodes.
+ def childless?(xml = false)
+ return false if xml && @closing.nil? # with xml set, a tag whose closing is nil is never childless
+# !@closing.nil? ||
+ @name =~ /^(img|br|hr|link|meta|area|base|basefont|
+ col|frame|input|isindex|param)$/ox
+ end
+
+ # Returns a textual representation of the node
+ def to_s
+ s = ''
+ if @closing == :close
+ s = "#{@name}>" unless self.childless?
+ else
+ s = "<#{@name}"
+ atlist = @attributes.sort
+ atlist.each do |att|
+ s << " #{att[0]}"
+ s << "='#{att[1]}'" if String === att[1]
+ end
+ s << "/" if (@children.empty? && @closing == :self) or self.childless?
+ s << ">"
+ @children.each { |child| s << child.to_s }
+ s << "#{@name}>" if @closing != :self && !@closing.nil? && !@children.empty?
+ end
+ s
+ end
+
+ # Returns this node if it meets the given conditions itself; otherwise falls
+ # back to Node#find, which searches the children and returns +nil+ when nothing
+ # matches. (See the description of the valid conditions in the +match+ method.)
+ def find(conditions)
+ match(conditions) && self || super
+ end
+
+ # Always returns +true+: a Tag node represents markup (overrides Node#tag?).
+ def tag?
+ true
+ end
+
+ # Returns +true+ if the node meets any of the given conditions. The
+ # +conditions+ parameter must be a hash of any of the following keys
+ # (all are optional):
+ #
+ # * :tag : the node name must match the corresponding value
+ # * :attributes : a hash. The node's values must match the
+ # corresponding values in the hash.
+ # * :parent : a hash. The node's parent must match the
+ # corresponding hash.
+ # * :child : a hash. At least one of the node's immediate children
+ # must meet the criteria described by the hash.
+ # * :ancestor : a hash. At least one of the node's ancestors must
+ # meet the criteria described by the hash.
+ # * :descendant : a hash. At least one of the node's descendants
+ # must meet the criteria described by the hash.
+ # * :sibling : a hash. At least one of the node's siblings must
+ # meet the criteria described by the hash.
+ # * :after : a hash. The node must be after any sibling meeting
+ # the criteria described by the hash, and at least one sibling must match.
+ # * :before : a hash. The node must be before any sibling meeting
+ # the criteria described by the hash, and at least one sibling must match.
+ # * :children : a hash, for counting children of a node. Accepts the
+ # keys:
+ # ** :count : either a number or a range which must equal (or
+ # include) the number of children that match.
+ # ** :less_than : the number of matching children must be less than
+ # this number.
+ # ** :greater_than : the number of matching children must be
+ # greater than this number.
+ # ** :only : another hash consisting of the keys to use
+ # to match on the children, and only matching children will be
+ # counted.
+ #
+ # Conditions are matched using the following algorithm:
+ #
+ # * if the condition is a string, it must be a substring of the value.
+ # * if the condition is a regexp, it must match the value.
+ # * if the condition is a number, the value must match number.to_s.
+ # * if the condition is +true+, the value must not be +nil+.
+ # * if the condition is +false+ or +nil+, the value must be +nil+.
+ #
+ # Usage:
+ #
+ # # test if the node is a "span" tag
+ # node.match :tag => "span"
+ #
+ # # test if the node's parent is a "div"
+ # node.match :parent => { :tag => "div" }
+ #
+ # # test if any of the node's ancestors are "table" tags
+ # node.match :ancestor => { :tag => "table" }
+ #
+ # # test if any of the node's immediate children are "em" tags
+ # node.match :child => { :tag => "em" }
+ #
+ # # test if any of the node's descendants are "strong" tags
+ # node.match :descendant => { :tag => "strong" }
+ #
+ # # test if the node has between 2 and 4 span tags as immediate children
+ # node.match :children => { :count => 2..4, :only => { :tag => "span" } }
+ #
+ # # get funky: test to see if the node is a "div", has a "ul" ancestor
+ # # and an "li" parent (with "class" = "enum"), and whether or not it has
+ # # a "span" descendant that contains # text matching /hello world/:
+ # node.match :tag => "div",
+ # :ancestor => { :tag => "ul" },
+ # :parent => { :tag => "li",
+ # :attributes => { :class => "enum" } },
+ # :descendant => { :tag => "span",
+ # :child => /hello world/ }
+ def match(conditions)
+ conditions = validate_conditions(conditions)
+ # check content of child nodes
+ if conditions[:content]
+ if children.empty?
+ return false unless match_condition("", conditions[:content])
+ else
+ return false unless children.find { |child| child.match(conditions[:content]) }
+ end
+ end
+
+ # test the name
+ return false unless match_condition(@name, conditions[:tag]) if conditions[:tag]
+
+ # test attributes
+ (conditions[:attributes] || {}).each do |key, value|
+ return false unless match_condition(self[key], value)
+ end
+
+ # test parent
+ return false unless parent.match(conditions[:parent]) if conditions[:parent]
+
+ # test children
+ return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child]
+
+ # test ancestors
+ if conditions[:ancestor]
+ return false unless catch :found do
+ p = self
+ throw :found, true if p.match(conditions[:ancestor]) while p = p.parent
+ end
+ end
+
+ # test descendants
+ if conditions[:descendant]
+ return false unless children.find do |child|
+ # test the child
+ child.match(conditions[:descendant]) ||
+ # test the child's descendants
+ child.match(:descendant => conditions[:descendant])
+ end
+ end
+
+ # count children
+ if opts = conditions[:children]
+ matches = children.select do |c|
+ (c.kind_of?(HTML::Tag) and (c.closing == :self or ! c.childless?))
+ end
+
+ matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
+ opts.each do |key, value|
+ next if key == :only
+ case key
+ when :count
+ if Integer === value
+ return false if matches.length != value
+ else
+ return false unless value.include?(matches.length)
+ end
+ when :less_than
+ return false unless matches.length < value
+ when :greater_than
+ return false unless matches.length > value
+ else raise "unknown count condition #{key}"
+ end
+ end
+ end
+
+ # test siblings
+ if conditions[:sibling] || conditions[:before] || conditions[:after]
+ siblings = parent ? parent.children : []
+ self_index = siblings.index(self)
+
+ if conditions[:sibling]
+ return false unless siblings.detect do |s|
+ s != self && s.match(conditions[:sibling])
+ end
+ end
+
+ if conditions[:before]
+ return false unless siblings[self_index+1..-1].detect do |s|
+ s != self && s.match(conditions[:before])
+ end
+ end
+
+ if conditions[:after]
+ return false unless siblings[0,self_index].detect do |s|
+ s != self && s.match(conditions[:after])
+ end
+ end
+ end
+
+ true
+ end
+
+ def ==(node) # Equal when Node#== holds and closing status, name and attributes are all equal.
+ return false unless super
+ return false unless closing == node.closing && self.name == node.name
+ attributes == node.attributes
+ end
+
+ private
+ # Test +value+ against a single +condition+; how they are compared depends on the class of +condition+.
+ def match_condition(value, condition)
+ case condition
+ when String
+ value && value == condition # exact equality, not a substring test
+ when Regexp
+ value && value.match(condition)
+ when Numeric
+ value == condition.to_s # the number is compared in its string form
+ when true
+ !value.nil?
+ when false, nil
+ value.nil?
+ else
+ false
+ end
+ end
+ end
+end
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index 3727bb08..f576b1fb 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -158,7 +158,7 @@ class String
#++
#:stopdoc:
- MATHML_ENTITIES = {
+ MATHML_ENTITIES = {
'Alpha' => 'Α',
'Beta' => 'Β',
'Epsilon' => 'Ε',
@@ -2279,7 +2279,7 @@ class String
'wp' => '℘',
'wr' => '≀',
'zeetrf' => 'ℨ'
- }
+ } unless const_defined? "MATHML_ENTITIES"
#:startdoc:
# Converts XHTML+MathML named entities in string to Numeric Character References
diff --git a/lib/sanitizer.rb b/lib/sanitizer.rb
new file mode 100644
index 00000000..adc3f3bb
--- /dev/null
+++ b/lib/sanitizer.rb
@@ -0,0 +1,198 @@
+module Sanitizer
+
+# This module provides sanitization of XHTML+MathML+SVG
+# and of inline style attributes.
+#
+# Based heavily on Sam Ruby's code in the Universal FeedParser.
+
+ require 'html/tokenizer'
+ require 'node'
+ require 'stringsupport'
+
+ acceptable_elements = %w[a abbr acronym address area b big blockquote br
+ button caption center cite code col colgroup dd del dfn dir div dl dt
+ em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
+ legend li map menu ol optgroup option p pre q s samp select small span
+ strike strong sub sup table tbody td textarea tfoot th thead tr tt u
+ ul var] # HTML element names; one of the three lists joined into ALLOWED_ELEMENTS below
+
+ mathml_elements = %w[annotation annotation-xml maction math merror mfrac
+ mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot
+ mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
+ munderover none semantics] # MathML element names; also joined into ALLOWED_ELEMENTS
+
+ svg_elements = %w[a animate animateColor animateMotion animateTransform
+ circle defs desc ellipse font-face font-face-name font-face-src
+ foreignObject g glyph hkern linearGradient line marker metadata
+ missing-glyph mpath path polygon polyline radialGradient rect set
+ stop svg switch text title tspan use] # SVG element names; also joined into ALLOWED_ELEMENTS
+
+ acceptable_attributes = %w[abbr accept accept-charset accesskey action
+ align alt axis border cellpadding cellspacing char charoff charset
+ checked cite class clear cols colspan color compact coords datetime
+ dir disabled enctype for frame headers height href hreflang hspace id
+ ismap label lang longdesc maxlength media method multiple name nohref
+ noshade nowrap prompt readonly rel rev rows rowspan rules scope
+ selected shape size span src start style summary tabindex target title
+ type usemap valign value vspace width xml:lang] # HTML attribute names; joined into ALLOWED_ATTRIBUTES below
+
+ mathml_attributes = %w[actiontype align close columnalign columnalign
+ columnalign columnlines columnspacing columnspan depth display
+ displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
+ frame height linethickness lspace mathbackground mathcolor mathvariant
+ mathvariant maxsize minsize open other rowalign rowalign rowalign
+ rowlines rowspacing rowspan rspace scriptlevel selection separator
+ separators stretchy width width xlink:href xlink:show xlink:type xmlns
+ xmlns:xlink] # MathML attribute names (some entries repeat); joined into ALLOWED_ATTRIBUTES
+
+ svg_attributes = %w[accent-height accumulate additive alphabetic
+ arabic-form ascent attributeName attributeType baseProfile bbox begin
+ by calcMode cap-height class color color-rendering content cx cy d dx
+ dy descent display dur end fill fill-rule font-family font-size
+ font-stretch font-style font-variant font-weight from fx fy g1 g2
+ glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
+ ideographic k keyPoints keySplines keyTimes lang marker-end
+ marker-mid marker-start markerHeight markerUnits markerWidth
+ mathematical max min name offset opacity orient origin
+ overline-position overline-thickness panose-1 path pathLength points
+ preserveAspectRatio r refX refY repeatCount repeatDur
+ requiredExtensions requiredFeatures restart rotate rx ry slope stemh
+ stemv stop-color stop-opacity strikethrough-position
+ strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
+ stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
+ stroke-width systemLanguage target text-anchor to transform type u1
+ u2 underline-position underline-thickness unicode unicode-range
+ units-per-em values version viewBox visibility width widths x
+ x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
+ xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
+ xmlns:xlink y y1 y2 zoomAndPan] # SVG attribute names; joined into ALLOWED_ATTRIBUTES
+
+ attr_val_is_uri = %w[href src cite action longdesc xlink:href xml:base] # attributes whose value gets a URI scheme check in sanitize_xhtml
+
+ SVG_ATTR_VAL_ALLOWS_REF = %w[clip-path color-profile cursor fill
+ filter marker marker-start marker-mid marker-end mask stroke] # attributes in which sanitize_xhtml blanks url(...) values not starting with '#'
+
+ SVG_ALLOW_LOCAL_HREF = %w[altGlyph animate animateColor animateMotion
+ animateTransform cursor feImage filter linearGradient pattern
+ radialGradient textpath tref set use] # elements whose xlink:href is deleted by sanitize_xhtml unless it starts with '#'
+
+ acceptable_css_properties = %w[azimuth background-color
+ border-bottom-color border-collapse border-color border-left-color
+ border-right-color border-top-color clear color cursor direction
+ display elevation float font font-family font-size font-style
+ font-variant font-weight height letter-spacing line-height overflow
+ pause pause-after pause-before pitch pitch-range richness speak
+ speak-header speak-numeral speak-punctuation speech-rate stress
+ text-align text-decoration text-indent unicode-bidi vertical-align
+ voice-family volume white-space width] # CSS properties that sanitize_css keeps as-is (via ALLOWED_CSS_PROPERTIES)
+
+ acceptable_css_keywords = %w[auto aqua black block blue bold both bottom
+ brown center collapse dashed dotted fuchsia gray green !important
+ italic left lime maroon medium none navy normal nowrap olive pointer
+ purple red right solid silver teal top transparent underline white
+ yellow] # keywords sanitize_css accepts in background/border/margin/padding values (via ALLOWED_CSS_KEYWORDS)
+
+ acceptable_svg_properties = %w[fill fill-opacity fill-rule stroke
+ stroke-width stroke-linecap stroke-linejoin stroke-opacity] # SVG style properties kept by sanitize_css (via ALLOWED_SVG_PROPERTIES)
+
+ acceptable_protocols = %w[ed2k ftp http https irc mailto news gopher nntp
+ telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs] # URI schemes allowed in URI-valued attributes (via ALLOWED_PROTOCOLS)
+
+ VOID_ELEMENTS = %w[img br hr link meta area base basefont
+ col frame input isindex param] # same names as the pattern in XHTML::Tag#childless?; not referenced in the visible part of this file
+
+ ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
+ ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
+ ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
+ ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
+ ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
+ ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
+ ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
+
+ # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
+ # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
+ # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
+ # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
+ # ALLOWED_PROTOCOLS are allowed.
+ # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
+ #
+ # sanitize_html('')
+ # => <script> do_nasty_stuff() </script>
+ # sanitize_html('Click here for $100 ')
+ # => Click here for $100
+ def sanitize_xhtml(html)
+ if html.index("<")
+ tokenizer = HTML::Tokenizer.new(html.to_utf8)
+ new_text = ""
+
+ while token = tokenizer.next
+ node = XHTML::Node.parse(nil, 0, 0, token, false)
+ new_text << case node.tag?
+ when true
+ if ALLOWED_ELEMENTS.include?(node.name)
+ if node.attributes
+ node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
+ ATTR_VAL_IS_URI.each do |attr|
+ val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/`|[\000-\040\177\s\200-\240]/,'').downcase
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
+ node.attributes.delete attr
+ end
+ end
+ SVG_ATTR_VAL_ALLOWS_REF.each do |attr|
+ node.attributes[attr] = node.attributes[attr].to_s.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if node.attributes[attr]
+ end
+ if SVG_ALLOW_LOCAL_HREF.include?(node.name) && node.attributes['xlink:href'] && node.attributes['xlink:href'] =~ /^\s*[^#\s].*/m
+ node.attributes.delete 'xlink:href'
+ end
+ if node.attributes['style']
+ node.attributes['style'] = sanitize_css(node.attributes['style'])
+ end
+ node.attributes.each do |attr,val|
+ if String === val
+ node.attributes[attr] = CGI.escapeHTML(val.unescapeHTML)
+ else
+ node.attributes.delete attr
+ end
+ end
+ end
+ node.to_s
+ else
+ node.to_s.gsub(/, "<").gsub(/>/, ">")
+ end
+ else
+ CGI.escapeHTML(node.to_s.unescapeHTML)
+ end
+ end
+
+ html = new_text
+ end
+ html
+ end
+
+ def sanitize_css(style) # Returns +style+ reduced to its allowed "prop: value;" declarations joined by spaces, or '' if it fails either check below.
+ # replace every url(...) occurrence with a single space
+ style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+ # gauntlet: reject the whole style unless it matches both patterns below
+ return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
+ return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
+
+ clean = []
+ style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
+ next if val.empty?
+ prop.downcase!
+ if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
+ clean << "#{prop}: #{val};"
+ elsif %w[background border margin padding].include?(prop.split('-')[0])
+ clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
+ !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
+ keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+ end
+ elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
+ clean << "#{prop}: #{val};"
+ end
+ end
+
+ style = clean.join(' ')
+ end
+end
diff --git a/lib/stringsupport.rb b/lib/stringsupport.rb
new file mode 100644
index 00000000..aa076622
--- /dev/null
+++ b/lib/stringsupport.rb
@@ -0,0 +1,2271 @@
+# Some useful additions to the String class
+
+class String
+
+# Check whether a string is valid utf-8
+#
+# :call-seq:
+# string.is_utf8? -> boolean
+#
+# returns true if the sequence of bytes in string is valid utf-8
+#--
+ def is_utf8?
+ #expand NCRs to utf-8
+ text = self.gsub(/[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') }
+ text.gsub!(/(\d+);/) { |m| [$1.to_i].pack('U*') }
+
+ # You might think this is faster, but it isn't
+ #pieces = self.split(/[xX]([a-fA-F0-9]+);/)
+ #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
+ #pieces = pieces.join.split(/(\d+);/)
+ #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
+ #text = pieces.join
+
+ #ensure the resulting string of bytes is valid utf-8
+ text =~ /\A(
+ [\x09\x0A\x0D\x20-\x7E] # ASCII
+ | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
+ | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
+ | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte
+ | \xEF[\x80-\xBE]{2} #
+ | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff
+ | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
+ | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
+ | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
+ | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
+ )*\Z/x;
+ end
+#++
+
+#:stopdoc:
+ MATHML_ENTITIES = {
+ 'Alpha' => 'Α',
+ 'Beta' => 'Β',
+ 'Epsilon' => 'Ε',
+ 'Zeta' => 'Ζ',
+ 'Eta' => 'Η',
+ 'Iota' => 'Ι',
+ 'Kappa' => 'Κ',
+ 'Mu' => 'Μ',
+ 'Nu' => 'Ν',
+ 'Omicron' => 'Ο',
+ 'Rho' => 'Ρ',
+ 'Tau' => 'Τ',
+ 'Chi' => 'Χ',
+ 'epsilon' => 'ε',
+ 'zeta' => 'ζ',
+ 'omicron' => 'ο',
+ 'sigmaf' => 'ς',
+ 'thetasym' => 'ϑ',
+ 'upsih' => 'ϒ',
+ 'oline' => '‾',
+ 'frasl' => '⁄',
+ 'alefsym' => 'ℵ',
+ 'crarr' => '↵',
+ 'empty' => '∅',
+ 'amp' => '&',
+ 'lt' => '<',
+ 'zwnj' => '',
+ 'zwj' => '',
+ 'lrm' => '',
+ 'rlm' => '',
+ 'sbquo' => '‚',
+ 'bdquo' => '„',
+ 'lsaquo' => '‹',
+ 'rsaquo' => '›',
+ 'euro' => '€',
+ 'angzarr' => '⍼',
+ 'cirmid' => '⫯',
+ 'cudarrl' => '⤸',
+ 'cudarrr' => '⤵',
+ 'cularr' => '↶',
+ 'cularrp' => '⤽',
+ 'curarr' => '↷',
+ 'curarrm' => '⤼',
+ 'Darr' => '↡',
+ 'dArr' => '⇓',
+ 'ddarr' => '⇊',
+ 'DDotrahd' => '⤑',
+ 'dfisht' => '⥿',
+ 'dHar' => '⥥',
+ 'dharl' => '⇃',
+ 'dharr' => '⇂',
+ 'duarr' => '⇵',
+ 'duhar' => '⥯',
+ 'dzigrarr' => '⟿',
+ 'erarr' => '⥱',
+ 'hArr' => '⇔',
+ 'harr' => '↔',
+ 'harrcir' => '⥈',
+ 'harrw' => '↭',
+ 'hoarr' => '⇿',
+ 'imof' => '⊷',
+ 'lAarr' => '⇚',
+ 'Larr' => '↞',
+ 'larrbfs' => '⤟',
+ 'larrfs' => '⤝',
+ 'larrhk' => '↩',
+ 'larrlp' => '↫',
+ 'larrpl' => '⤹',
+ 'larrsim' => '⥳',
+ 'larrtl' => '↢',
+ 'lAtail' => '⤛',
+ 'latail' => '⤙',
+ 'lBarr' => '⤎',
+ 'lbarr' => '⤌',
+ 'ldca' => '⤶',
+ 'ldrdhar' => '⥧',
+ 'ldrushar' => '⥋',
+ 'ldsh' => '↲',
+ 'lfisht' => '⥼',
+ 'lHar' => '⥢',
+ 'lhard' => '↽',
+ 'lharu' => '↼',
+ 'lharul' => '⥪',
+ 'llarr' => '⇇',
+ 'llhard' => '⥫',
+ 'loarr' => '⇽',
+ 'lrarr' => '⇆',
+ 'lrhar' => '⇋',
+ 'lrhard' => '⥭',
+ 'lsh' => '↰',
+ 'lurdshar' => '⥊',
+ 'luruhar' => '⥦',
+ 'Map' => '⤅',
+ 'map' => '↦',
+ 'midcir' => '⫰',
+ 'mumap' => '⊸',
+ 'nearhk' => '⤤',
+ 'neArr' => '⇗',
+ 'nearr' => '↗',
+ 'nesear' => '⤨',
+ 'nhArr' => '⇎',
+ 'nharr' => '↮',
+ 'nlArr' => '⇍',
+ 'nlarr' => '↚',
+ 'nrArr' => '⇏',
+ 'nrarr' => '↛',
+ 'nrarrc' => '⤳̸',
+ 'nrarrw' => '↝̸',
+ 'nvHarr' => '⤄',
+ 'nvlArr' => '⤂',
+ 'nvrArr' => '⤃',
+ 'nwarhk' => '⤣',
+ 'nwArr' => '⇖',
+ 'nwarr' => '↖',
+ 'nwnear' => '⤧',
+ 'olarr' => '↺',
+ 'orarr' => '↻',
+ 'origof' => '⊶',
+ 'rAarr' => '⇛',
+ 'Rarr' => '↠',
+ 'rarrap' => '⥵',
+ 'rarrbfs' => '⤠',
+ 'rarrc' => '⤳',
+ 'rarrfs' => '⤞',
+ 'rarrhk' => '↪',
+ 'rarrlp' => '↬',
+ 'rarrpl' => '⥅',
+ 'rarrsim' => '⥴',
+ 'Rarrtl' => '⤖',
+ 'rarrtl' => '↣',
+ 'rarrw' => '↝',
+ 'rAtail' => '⤜',
+ 'ratail' => '⤚',
+ 'RBarr' => '⤐',
+ 'rBarr' => '⤏',
+ 'rbarr' => '⤍',
+ 'rdca' => '⤷',
+ 'rdldhar' => '⥩',
+ 'rdsh' => '↳',
+ 'rfisht' => '⥽',
+ 'rHar' => '⥤',
+ 'rhard' => '⇁',
+ 'rharu' => '⇀',
+ 'rharul' => '⥬',
+ 'rlarr' => '⇄',
+ 'rlhar' => '⇌',
+ 'roarr' => '⇾',
+ 'rrarr' => '⇉',
+ 'rsh' => '↱',
+ 'ruluhar' => '⥨',
+ 'searhk' => '⤥',
+ 'seArr' => '⇘',
+ 'searr' => '↘',
+ 'seswar' => '⤩',
+ 'simrarr' => '⥲',
+ 'slarr' => '←',
+ 'srarr' => '→',
+ 'swarhk' => '⤦',
+ 'swArr' => '⇙',
+ 'swarr' => '↙',
+ 'swnwar' => '⤪',
+ 'Uarr' => '↟',
+ 'uArr' => '⇑',
+ 'Uarrocir' => '⥉',
+ 'udarr' => '⇅',
+ 'udhar' => '⥮',
+ 'ufisht' => '⥾',
+ 'uHar' => '⥣',
+ 'uharl' => '↿',
+ 'uharr' => '↾',
+ 'uuarr' => '⇈',
+ 'vArr' => '⇕',
+ 'varr' => '↕',
+ 'xhArr' => '⟺',
+ 'xharr' => '⟷',
+ 'xlArr' => '⟸',
+ 'xlarr' => '⟵',
+ 'xmap' => '⟼',
+ 'xrArr' => '⟹',
+ 'xrarr' => '⟶',
+ 'zigrarr' => '⇝',
+ 'ac' => '∾',
+ 'acE' => '∾̳',
+ 'amalg' => '⨿',
+ 'barvee' => '⊽',
+ 'Barwed' => '⌆',
+ 'barwed' => '⌅',
+ 'bsolb' => '⧅',
+ 'Cap' => '⋒',
+ 'capand' => '⩄',
+ 'capbrcup' => '⩉',
+ 'capcap' => '⩋',
+ 'capcup' => '⩇',
+ 'capdot' => '⩀',
+ 'caps' => '∩︀',
+ 'ccaps' => '⩍',
+ 'ccups' => '⩌',
+ 'ccupssm' => '⩐',
+ 'coprod' => '∐',
+ 'Cup' => '⋓',
+ 'cupbrcap' => '⩈',
+ 'cupcap' => '⩆',
+ 'cupcup' => '⩊',
+ 'cupdot' => '⊍',
+ 'cupor' => '⩅',
+ 'cups' => '∪︀',
+ 'cuvee' => '⋎',
+ 'cuwed' => '⋏',
+ 'Dagger' => '‡',
+ 'dagger' => '†',
+ 'diam' => '⋄',
+ 'divonx' => '⋇',
+ 'eplus' => '⩱',
+ 'hercon' => '⊹',
+ 'intcal' => '⊺',
+ 'iprod' => '⨼',
+ 'loplus' => '⨭',
+ 'lotimes' => '⨴',
+ 'lthree' => '⋋',
+ 'ltimes' => '⋉',
+ 'midast' => '*',
+ 'minusb' => '⊟',
+ 'minusd' => '∸',
+ 'minusdu' => '⨪',
+ 'ncap' => '⩃',
+ 'ncup' => '⩂',
+ 'oast' => '⊛',
+ 'ocir' => '⊚',
+ 'odash' => '⊝',
+ 'odiv' => '⨸',
+ 'odot' => '⊙',
+ 'odsold' => '⦼',
+ 'ofcir' => '⦿',
+ 'ogt' => '⧁',
+ 'ohbar' => '⦵',
+ 'olcir' => '⦾',
+ 'olt' => '⧀',
+ 'omid' => '⦶',
+ 'ominus' => '⊖',
+ 'opar' => '⦷',
+ 'operp' => '⦹',
+ 'oplus' => '⊕',
+ 'osol' => '⊘',
+ 'Otimes' => '⨷',
+ 'otimes' => '⊗',
+ 'otimesas' => '⨶',
+ 'ovbar' => '⌽',
+ 'plusacir' => '⨣',
+ 'plusb' => '⊞',
+ 'pluscir' => '⨢',
+ 'plusdo' => '∔',
+ 'plusdu' => '⨥',
+ 'pluse' => '⩲',
+ 'plussim' => '⨦',
+ 'plustwo' => '⨧',
+ 'prod' => '∏',
+ 'race' => '⧚',
+ 'roplus' => '⨮',
+ 'rotimes' => '⨵',
+ 'rthree' => '⋌',
+ 'rtimes' => '⋊',
+ 'sdot' => '⋅',
+ 'sdotb' => '⊡',
+ 'setmn' => '∖',
+ 'simplus' => '⨤',
+ 'smashp' => '⨳',
+ 'solb' => '⧄',
+ 'sqcap' => '⊓',
+ 'sqcaps' => '⊓︀',
+ 'sqcup' => '⊔',
+ 'sqcups' => '⊔︀',
+ 'ssetmn' => '∖',
+ 'sstarf' => '⋆',
+ 'subdot' => '⪽',
+ 'sum' => '∑',
+ 'supdot' => '⪾',
+ 'timesb' => '⊠',
+ 'timesbar' => '⨱',
+ 'timesd' => '⨰',
+ 'tridot' => '◬',
+ 'triminus' => '⨺',
+ 'triplus' => '⨹',
+ 'trisb' => '⧍',
+ 'tritime' => '⨻',
+ 'uplus' => '⊎',
+ 'veebar' => '⊻',
+ 'wedbar' => '⩟',
+ 'wreath' => '≀',
+ 'xcap' => '⋂',
+ 'xcirc' => '◯',
+ 'xcup' => '⋃',
+ 'xdtri' => '▽',
+ 'xodot' => '⨀',
+ 'xoplus' => '⨁',
+ 'xotime' => '⨂',
+ 'xsqcup' => '⨆',
+ 'xuplus' => '⨄',
+ 'xutri' => '△',
+ 'xvee' => '⋁',
+ 'xwedge' => '⋀',
+ 'dlcorn' => '⌞',
+ 'drcorn' => '⌟',
+ 'gtlPar' => '⦕',
+ 'langd' => '⦑',
+ 'lbrke' => '⦋',
+ 'lbrksld' => '⦏',
+ 'lbrkslu' => '⦍',
+ 'lceil' => '⌈',
+ 'lfloor' => '⌊',
+ 'lmoust' => '⎰',
+ 'lparlt' => '⦓',
+ 'ltrPar' => '⦖',
+ 'rangd' => '⦒',
+ 'rbrke' => '⦌',
+ 'rbrksld' => '⦎',
+ 'rbrkslu' => '⦐',
+ 'rceil' => '⌉',
+ 'rfloor' => '⌋',
+ 'rmoust' => '⎱',
+ 'rpargt' => '⦔',
+ 'ulcorn' => '⌜',
+ 'urcorn' => '⌝',
+ 'gnap' => '⪊',
+ 'gnE' => '≩',
+ 'gne' => '⪈',
+ 'gnsim' => '⋧',
+ 'gvnE' => '≩︀',
+ 'lnap' => '⪉',
+ 'lnE' => '≨',
+ 'lne' => '⪇',
+ 'lnsim' => '⋦',
+ 'lvnE' => '≨︀',
+ 'nap' => '≉',
+ 'napE' => '⩰̸',
+ 'napid' => '≋̸',
+ 'ncong' => '≇',
+ 'ncongdot' => '⩭̸',
+ 'nequiv' => '≢',
+ 'ngE' => '≧̸',
+ 'nge' => '≱',
+ 'nges' => '⩾̸',
+ 'nGg' => '⋙̸',
+ 'ngsim' => '≵',
+ 'nGt' => '≫⃒',
+ 'ngt' => '≯',
+ 'nGtv' => '≫̸',
+ 'nlE' => '≦̸',
+ 'nle' => '≰',
+ 'nles' => '⩽̸',
+ 'nLl' => '⋘̸',
+ 'nlsim' => '≴',
+ 'nLt' => '≪⃒',
+ 'nlt' => '≮',
+ 'nltri' => '⋪',
+ 'nltrie' => '⋬',
+ 'nLtv' => '≪̸',
+ 'nmid' => '∤',
+ 'npar' => '∦',
+ 'npr' => '⊀',
+ 'nprcue' => '⋠',
+ 'npre' => '⪯̸',
+ 'nrtri' => '⋫',
+ 'nrtrie' => '⋭',
+ 'nsc' => '⊁',
+ 'nsccue' => '⋡',
+ 'nsce' => '⪰̸',
+ 'nsim' => '≁',
+ 'nsime' => '≄',
+ 'nsmid' => '∤',
+ 'nspar' => '∦',
+ 'nsqsube' => '⋢',
+ 'nsqsupe' => '⋣',
+ 'nsub' => '⊄',
+ 'nsubE' => '⫅̸',
+ 'nsube' => '⊈',
+ 'nsup' => '⊅',
+ 'nsupE' => '⫆̸',
+ 'nsupe' => '⊉',
+ 'ntgl' => '≹',
+ 'ntlg' => '≸',
+ 'nvap' => '≍⃒',
+ 'nVDash' => '⊯',
+ 'nVdash' => '⊮',
+ 'nvDash' => '⊭',
+ 'nvdash' => '⊬',
+ 'nvge' => '≥⃒',
+ 'nvgt' => '>⃒',
+ 'nvle' => '≤⃒',
+ 'nvltrie' => '⊴⃒',
+ 'nvrtrie' => '⊵⃒',
+ 'nvsim' => '∼⃒',
+ 'parsim' => '⫳',
+ 'prnap' => '⪹',
+ 'prnE' => '⪵',
+ 'prnsim' => '⋨',
+ 'rnmid' => '⫮',
+ 'scnap' => '⪺',
+ 'scnE' => '⪶',
+ 'scnsim' => '⋩',
+ 'simne' => '≆',
+ 'solbar' => '⌿',
+ 'subnE' => '⫋',
+ 'subne' => '⊊',
+ 'supnE' => '⫌',
+ 'supne' => '⊋',
+ 'vnsub' => '⊂⃒',
+ 'vnsup' => '⊃⃒',
+ 'vsubnE' => '⫋︀',
+ 'vsubne' => '⊊︀',
+ 'vsupnE' => '⫌︀',
+ 'vsupne' => '⊋︀',
+ 'ang' => '∠',
+ 'ange' => '⦤',
+ 'angmsd' => '∡',
+ 'angmsdaa' => '⦨',
+ 'angmsdab' => '⦩',
+ 'angmsdac' => '⦪',
+ 'angmsdad' => '⦫',
+ 'angmsdae' => '⦬',
+ 'angmsdaf' => '⦭',
+ 'angmsdag' => '⦮',
+ 'angmsdah' => '⦯',
+ 'angrtvb' => '⊾',
+ 'angrtvbd' => '⦝',
+ 'bbrk' => '⎵',
+ 'bbrktbrk' => '⎶',
+ 'bemptyv' => '⦰',
+ 'beth' => 'ℶ',
+ 'boxbox' => '⧉',
+ 'bprime' => '‵',
+ 'bsemi' => '⁏',
+ 'cemptyv' => '⦲',
+ 'cirE' => '⧃',
+ 'cirscir' => '⧂',
+ 'comp' => '∁',
+ 'daleth' => 'ℸ',
+ 'demptyv' => '⦱',
+ 'ell' => 'ℓ',
+ 'empty' => '∅',
+ 'emptyv' => '∅',
+ 'gimel' => 'ℷ',
+ 'iiota' => '℩',
+ 'image' => 'ℑ',
+ 'imath' => 'ı',
+ 'jmath' => 'j',
+ 'laemptyv' => '⦴',
+ 'lltri' => '◺',
+ 'lrtri' => '⊿',
+ 'mho' => '℧',
+ 'nang' => '∠⃒',
+ 'nexist' => '∄',
+ 'oS' => 'Ⓢ',
+ 'planck' => 'ℏ',
+ 'plankv' => 'ℏ',
+ 'raemptyv' => '⦳',
+ 'range' => '⦥',
+ 'real' => 'ℜ',
+ 'tbrk' => '⎴',
+ 'trpezium' => '�',
+ 'ultri' => '◸',
+ 'urtri' => '◹',
+ 'vzigzag' => '⦚',
+ 'weierp' => '℘',
+ 'apE' => '⩰',
+ 'ape' => '≊',
+ 'apid' => '≋',
+ 'asymp' => '≈',
+ 'Barv' => '⫧',
+ 'bcong' => '≌',
+ 'bepsi' => '϶',
+ 'bowtie' => '⋈',
+ 'bsim' => '∽',
+ 'bsime' => '⋍',
+ 'bsolhsub' => '\⊂',
+ 'bump' => '≎',
+ 'bumpE' => '⪮',
+ 'bumpe' => '≏',
+ 'cire' => '≗',
+ 'Colon' => '∷',
+ 'Colone' => '⩴',
+ 'colone' => '≔',
+ 'congdot' => '⩭',
+ 'csub' => '⫏',
+ 'csube' => '⫑',
+ 'csup' => '⫐',
+ 'csupe' => '⫒',
+ 'cuepr' => '⋞',
+ 'cuesc' => '⋟',
+ 'Dashv' => '⫤',
+ 'dashv' => '⊣',
+ 'easter' => '⩮',
+ 'ecir' => '≖',
+ 'ecolon' => '≕',
+ 'eDDot' => '⩷',
+ 'eDot' => '≑',
+ 'efDot' => '≒',
+ 'eg' => '⪚',
+ 'egs' => '⪖',
+ 'egsdot' => '⪘',
+ 'el' => '⪙',
+ 'els' => '⪕',
+ 'elsdot' => '⪗',
+ 'equest' => '≟',
+ 'equivDD' => '⩸',
+ 'erDot' => '≓',
+ 'esdot' => '≐',
+ 'Esim' => '⩳',
+ 'esim' => '≂',
+ 'fork' => '⋔',
+ 'forkv' => '⫙',
+ 'frown' => '⌢',
+ 'gap' => '⪆',
+ 'gE' => '≧',
+ 'gEl' => '⪌',
+ 'gel' => '⋛',
+ 'ges' => '⩾',
+ 'gescc' => '⪩',
+ 'gesdot' => '⪀',
+ 'gesdoto' => '⪂',
+ 'gesdotol' => '⪄',
+ 'gesl' => '⋛︀',
+ 'gesles' => '⪔',
+ 'Gg' => '⋙',
+ 'gl' => '≷',
+ 'gla' => '⪥',
+ 'glE' => '⪒',
+ 'glj' => '⪤',
+ 'gsim' => '≳',
+ 'gsime' => '⪎',
+ 'gsiml' => '⪐',
+ 'Gt' => '≫',
+ 'gtcc' => '⪧',
+ 'gtcir' => '⩺',
+ 'gtdot' => '⋗',
+ 'gtquest' => '⩼',
+ 'gtrarr' => '⥸',
+ 'homtht' => '∻',
+ 'lap' => '⪅',
+ 'lat' => '⪫',
+ 'late' => '⪭',
+ 'lates' => '⪭︀',
+ 'lE' => '≦',
+ 'lEg' => '⪋',
+ 'leg' => '⋚',
+ 'les' => '⩽',
+ 'lescc' => '⪨',
+ 'lesdot' => '⩿',
+ 'lesdoto' => '⪁',
+ 'lesdotor' => '⪃',
+ 'lesg' => '⋚︀',
+ 'lesges' => '⪓',
+ 'lg' => '≶',
+ 'lgE' => '⪑',
+ 'Ll' => '⋘',
+ 'lsim' => '≲',
+ 'lsime' => '⪍',
+ 'lsimg' => '⪏',
+ 'Lt' => '≪',
+ 'ltcc' => '⪦',
+ 'ltcir' => '⩹',
+ 'ltdot' => '⋖',
+ 'ltlarr' => '⥶',
+ 'ltquest' => '⩻',
+ 'ltrie' => '⊴',
+ 'mcomma' => '⨩',
+ 'mDDot' => '∺',
+ 'mid' => '∣',
+ 'mlcp' => '⫛',
+ 'models' => '⊧',
+ 'mstpos' => '∾',
+ 'Pr' => '⪻',
+ 'pr' => '≺',
+ 'prap' => '⪷',
+ 'prcue' => '≼',
+ 'prE' => '⪳',
+ 'pre' => '⪯',
+ 'prsim' => '≾',
+ 'prurel' => '⊰',
+ 'ratio' => '∶',
+ 'rtrie' => '⊵',
+ 'rtriltri' => '⧎',
+ 'Sc' => '⪼',
+ 'sc' => '≻',
+ 'scap' => '⪸',
+ 'sccue' => '≽',
+ 'scE' => '⪴',
+ 'sce' => '⪰',
+ 'scsim' => '≿',
+ 'sdote' => '⩦',
+ 'sfrown' => '⌢',
+ 'simg' => '⪞',
+ 'simgE' => '⪠',
+ 'siml' => '⪝',
+ 'simlE' => '⪟',
+ 'smid' => '∣',
+ 'smile' => '⌣',
+ 'smt' => '⪪',
+ 'smte' => '⪬',
+ 'smtes' => '⪬︀',
+ 'spar' => '∥',
+ 'sqsub' => '⊏',
+ 'sqsube' => '⊑',
+ 'sqsup' => '⊐',
+ 'sqsupe' => '⊒',
+ 'ssmile' => '⌣',
+ 'Sub' => '⋐',
+ 'subE' => '⫅',
+ 'subedot' => '⫃',
+ 'submult' => '⫁',
+ 'subplus' => '⪿',
+ 'subrarr' => '⥹',
+ 'subsim' => '⫇',
+ 'subsub' => '⫕',
+ 'subsup' => '⫓',
+ 'Sup' => '⋑',
+ 'supdsub' => '⫘',
+ 'supE' => '⫆',
+ 'supedot' => '⫄',
+ 'suphsol' => '⊃/',
+ 'suphsub' => '⫗',
+ 'suplarr' => '⥻',
+ 'supmult' => '⫂',
+ 'supplus' => '⫀',
+ 'supsim' => '⫈',
+ 'supsub' => '⫔',
+ 'supsup' => '⫖',
+ 'thkap' => '≈',
+ 'thksim' => '∼',
+ 'topfork' => '⫚',
+ 'trie' => '≜',
+ 'twixt' => '≬',
+ 'Vbar' => '⫫',
+ 'vBar' => '⫨',
+ 'vBarv' => '⫩',
+ 'VDash' => '⊫',
+ 'Vdash' => '⊩',
+ 'vDash' => '⊨',
+ 'vdash' => '⊢',
+ 'Vdashl' => '⫦',
+ 'vltri' => '⊲',
+ 'vprop' => '∝',
+ 'vrtri' => '⊳',
+ 'Vvdash' => '⊪',
+ 'alpha' => 'α',
+ 'beta' => 'β',
+ 'chi' => 'χ',
+ 'Delta' => 'Δ',
+ 'delta' => 'δ',
+ 'epsi' => 'ϵ',
+ 'epsiv' => 'ε',
+ 'eta' => 'η',
+ 'Gamma' => 'Γ',
+ 'gamma' => 'γ',
+ 'Gammad' => 'Ϝ',
+ 'gammad' => 'ϝ',
+ 'iota' => 'ι',
+ 'kappa' => 'κ',
+ 'kappav' => 'ϰ',
+ 'Lambda' => 'Λ',
+ 'lambda' => 'λ',
+ 'mu' => 'μ',
+ 'nu' => 'ν',
+ 'Omega' => 'Ω',
+ 'omega' => 'ω',
+ 'Phi' => 'Φ',
+ 'phi' => 'ϕ',
+ 'phiv' => 'φ',
+ 'Pi' => 'Π',
+ 'pi' => 'π',
+ 'piv' => 'ϖ',
+ 'Psi' => 'Ψ',
+ 'psi' => 'ψ',
+ 'rho' => 'ρ',
+ 'rhov' => 'ϱ',
+ 'Sigma' => 'Σ',
+ 'sigma' => 'σ',
+ 'sigmav' => 'ς',
+ 'tau' => 'τ',
+ 'Theta' => 'Θ',
+ 'theta' => 'θ',
+ 'thetav' => 'ϑ',
+ 'Upsi' => 'ϒ',
+ 'upsi' => 'υ',
+ 'Xi' => 'Ξ',
+ 'xi' => 'ξ',
+ 'zeta' => 'ζ',
+ 'Afr' => '𝔄',
+ 'afr' => '𝔞',
+ 'Bfr' => '𝔅',
+ 'bfr' => '𝔟',
+ 'Cfr' => 'ℭ',
+ 'cfr' => '𝔠',
+ 'Dfr' => '𝔇',
+ 'dfr' => '𝔡',
+ 'Efr' => '𝔈',
+ 'efr' => '𝔢',
+ 'Ffr' => '𝔉',
+ 'ffr' => '𝔣',
+ 'Gfr' => '𝔊',
+ 'gfr' => '𝔤',
+ 'Hfr' => 'ℌ',
+ 'hfr' => '𝔥',
+ 'Ifr' => 'ℑ',
+ 'ifr' => '𝔦',
+ 'Jfr' => '𝔍',
+ 'jfr' => '𝔧',
+ 'Kfr' => '𝔎',
+ 'kfr' => '𝔨',
+ 'Lfr' => '𝔏',
+ 'lfr' => '𝔩',
+ 'Mfr' => '𝔐',
+ 'mfr' => '𝔪',
+ 'Nfr' => '𝔑',
+ 'nfr' => '𝔫',
+ 'Ofr' => '𝔒',
+ 'ofr' => '𝔬',
+ 'Pfr' => '𝔓',
+ 'pfr' => '𝔭',
+ 'Qfr' => '𝔔',
+ 'qfr' => '𝔮',
+ 'Rfr' => 'ℜ',
+ 'rfr' => '𝔯',
+ 'Sfr' => '𝔖',
+ 'sfr' => '𝔰',
+ 'Tfr' => '𝔗',
+ 'tfr' => '𝔱',
+ 'Ufr' => '𝔘',
+ 'ufr' => '𝔲',
+ 'Vfr' => '𝔙',
+ 'vfr' => '𝔳',
+ 'Wfr' => '𝔚',
+ 'wfr' => '𝔴',
+ 'Xfr' => '𝔛',
+ 'xfr' => '𝔵',
+ 'Yfr' => '𝔜',
+ 'yfr' => '𝔶',
+ 'Zfr' => 'ℨ',
+ 'zfr' => '𝔷',
+ 'Aopf' => '𝔸',
+ 'Bopf' => '𝔹',
+ 'Copf' => 'ℂ',
+ 'Dopf' => '𝔻',
+ 'Eopf' => '𝔼',
+ 'Fopf' => '𝔽',
+ 'Gopf' => '𝔾',
+ 'Hopf' => 'ℍ',
+ 'Iopf' => '𝕀',
+ 'Jopf' => '𝕁',
+ 'Kopf' => '𝕂',
+ 'Lopf' => '𝕃',
+ 'Mopf' => '𝕄',
+ 'Nopf' => 'ℕ',
+ 'Oopf' => '𝕆',
+ 'Popf' => 'ℙ',
+ 'Qopf' => 'ℚ',
+ 'Ropf' => 'ℝ',
+ 'Sopf' => '𝕊',
+ 'Topf' => '𝕋',
+ 'Uopf' => '𝕌',
+ 'Vopf' => '𝕍',
+ 'Wopf' => '𝕎',
+ 'Xopf' => '𝕏',
+ 'Yopf' => '𝕐',
+ 'Zopf' => 'ℤ',
+ 'Ascr' => '𝒜',
+ 'ascr' => '𝒶',
+ 'Bscr' => 'ℬ',
+ 'bscr' => '𝒷',
+ 'Cscr' => '𝒞',
+ 'cscr' => '𝒸',
+ 'Dscr' => '𝒟',
+ 'dscr' => '𝒹',
+ 'Escr' => 'ℰ',
+ 'escr' => 'ℯ',
+ 'Fscr' => 'ℱ',
+ 'fscr' => '𝒻',
+ 'Gscr' => '𝒢',
+ 'gscr' => 'ℊ',
+ 'Hscr' => 'ℋ',
+ 'hscr' => '𝒽',
+ 'Iscr' => 'ℐ',
+ 'iscr' => '𝒾',
+ 'Jscr' => '𝒥',
+ 'jscr' => '𝒿',
+ 'Kscr' => '𝒦',
+ 'kscr' => '𝓀',
+ 'Lscr' => 'ℒ',
+ 'lscr' => '𝓁',
+ 'Mscr' => 'ℳ',
+ 'mscr' => '𝓂',
+ 'Nscr' => '𝒩',
+ 'nscr' => '𝓃',
+ 'Oscr' => '𝒪',
+ 'oscr' => 'ℴ',
+ 'Pscr' => '𝒫',
+ 'pscr' => '𝓅',
+ 'Qscr' => '𝒬',
+ 'qscr' => '𝓆',
+ 'Rscr' => 'ℛ',
+ 'rscr' => '𝓇',
+ 'Sscr' => '𝒮',
+ 'sscr' => '𝓈',
+ 'Tscr' => '𝒯',
+ 'tscr' => '𝓉',
+ 'Uscr' => '𝒰',
+ 'uscr' => '𝓊',
+ 'Vscr' => '𝒱',
+ 'vscr' => '𝓋',
+ 'Wscr' => '𝒲',
+ 'wscr' => '𝓌',
+ 'Xscr' => '𝒳',
+ 'xscr' => '𝓍',
+ 'Yscr' => '𝒴',
+ 'yscr' => '𝓎',
+ 'Zscr' => '𝒵',
+ 'zscr' => '𝓏',
+ 'acd' => '∿',
+ 'aleph' => 'ℵ',
+ 'And' => '⩓',
+ 'and' => '∧',
+ 'andand' => '⩕',
+ 'andd' => '⩜',
+ 'andslope' => '⩘',
+ 'andv' => '⩚',
+ 'angrt' => '∟',
+ 'angsph' => '∢',
+ 'angst' => 'Å',
+ 'ap' => '≈',
+ 'apacir' => '⩯',
+ 'awconint' => '∳',
+ 'awint' => '⨑',
+ 'becaus' => '∵',
+ 'bernou' => 'ℬ',
+ 'bne' => '=⃥',
+ 'bnequiv' => '≡⃥',
+ 'bNot' => '⫭',
+ 'bnot' => '⌐',
+ 'bottom' => '⊥',
+ 'cap' => '∩',
+ 'Cconint' => '∰',
+ 'cirfnint' => '⨐',
+ 'compfn' => '∘',
+ 'cong' => '≅',
+ 'Conint' => '∯',
+ 'conint' => '∮',
+ 'ctdot' => '⋯',
+ 'cup' => '∪',
+ 'cwconint' => '∲',
+ 'cwint' => '∱',
+ 'cylcty' => '⌭',
+ 'disin' => '⋲',
+ 'Dot' => '¨',
+ 'DotDot' => '⃜',
+ 'dsol' => '⧶',
+ 'dtdot' => '⋱',
+ 'dwangle' => '⦦',
+ 'elinters' => '�',
+ 'epar' => '⋕',
+ 'eparsl' => '⧣',
+ 'equiv' => '≡',
+ 'eqvparsl' => '⧥',
+ 'exist' => '∃',
+ 'fltns' => '▱',
+ 'fnof' => 'ƒ',
+ 'forall' => '∀',
+ 'fpartint' => '⨍',
+ 'ge' => '≥',
+ 'hamilt' => 'ℋ',
+ 'iff' => '⇔',
+ 'iinfin' => '⧜',
+ 'imped' => 'Ƶ',
+ 'infin' => '∞',
+ 'infintie' => '⧝',
+ 'Int' => '∬',
+ 'int' => '∫',
+ 'intlarhk' => '⨗',
+ 'isin' => '∈',
+ 'isindot' => '⋵',
+ 'isinE' => '⋹',
+ 'isins' => '⋴',
+ 'isinsv' => '⋳',
+ 'isinv' => '∈',
+ 'lagran' => 'ℒ',
+ 'Lang' => '《',
+ 'lang' => '〈',
+ 'lArr' => '⇐',
+ 'lbbrk' => '〔',
+ 'le' => '≤',
+ 'loang' => '〘',
+ 'lobrk' => '〚',
+ 'lopar' => '⦅',
+ 'lowast' => '∗',
+ 'minus' => '−',
+ 'mnplus' => '∓',
+ 'nabla' => '∇',
+ 'ne' => '≠',
+ 'nedot' => '≐̸',
+ 'nhpar' => '⫲',
+ 'ni' => '∋',
+ 'nis' => '⋼',
+ 'nisd' => '⋺',
+ 'niv' => '∋',
+ 'Not' => '⫬',
+ 'notin' => '∉',
+ 'notindot' => '⋵̸',
+ 'notinE' => '⋹̸',
+ 'notinva' => '∉',
+ 'notinvb' => '⋷',
+ 'notinvc' => '⋶',
+ 'notni' => '∌',
+ 'notniva' => '∌',
+ 'notnivb' => '⋾',
+ 'notnivc' => '⋽',
+ 'nparsl' => '⫽⃥',
+ 'npart' => '∂̸',
+ 'npolint' => '⨔',
+ 'nvinfin' => '⧞',
+ 'olcross' => '⦻',
+ 'Or' => '⩔',
+ 'or' => '∨',
+ 'ord' => '⩝',
+ 'order' => 'ℴ',
+ 'oror' => '⩖',
+ 'orslope' => '⩗',
+ 'orv' => '⩛',
+ 'par' => '∥',
+ 'parsl' => '⫽',
+ 'part' => '∂',
+ 'permil' => '‰',
+ 'perp' => '⊥',
+ 'pertenk' => '‱',
+ 'phmmat' => 'ℳ',
+ 'pointint' => '⨕',
+ 'Prime' => '″',
+ 'prime' => '′',
+ 'profalar' => '⌮',
+ 'profline' => '⌒',
+ 'profsurf' => '⌓',
+ 'prop' => '∝',
+ 'qint' => '⨌',
+ 'qprime' => '⁗',
+ 'quatint' => '⨖',
+ 'radic' => '√',
+ 'Rang' => '》',
+ 'rang' => '〉',
+ 'rArr' => '⇒',
+ 'rbbrk' => '〕',
+ 'roang' => '〙',
+ 'robrk' => '〛',
+ 'ropar' => '⦆',
+ 'rppolint' => '⨒',
+ 'scpolint' => '⨓',
+ 'sim' => '∼',
+ 'simdot' => '⩪',
+ 'sime' => '≃',
+ 'smeparsl' => '⧤',
+ 'square' => '□',
+ 'squarf' => '▪',
+ 'strns' => '¯',
+ 'sub' => '⊂',
+ 'sube' => '⊆',
+ 'sup' => '⊃',
+ 'supe' => '⊇',
+ 'tdot' => '⃛',
+ 'there4' => '∴',
+ 'tint' => '∭',
+ 'top' => '⊤',
+ 'topbot' => '⌶',
+ 'topcir' => '⫱',
+ 'tprime' => '‴',
+ 'utdot' => '⋰',
+ 'uwangle' => '⦧',
+ 'vangrt' => '⦜',
+ 'veeeq' => '≚',
+ 'Verbar' => '‖',
+ 'wedgeq' => '≙',
+ 'xnis' => '⋻',
+ 'boxDL' => '╗',
+ 'boxDl' => '╖',
+ 'boxdL' => '╕',
+ 'boxdl' => '┐',
+ 'boxDR' => '╔',
+ 'boxDr' => '╓',
+ 'boxdR' => '╒',
+ 'boxdr' => '┌',
+ 'boxH' => '═',
+ 'boxh' => '─',
+ 'boxHD' => '╦',
+ 'boxHd' => '╤',
+ 'boxhD' => '╥',
+ 'boxhd' => '┬',
+ 'boxHU' => '╩',
+ 'boxHu' => '╧',
+ 'boxhU' => '╨',
+ 'boxhu' => '┴',
+ 'boxUL' => '╝',
+ 'boxUl' => '╜',
+ 'boxuL' => '╛',
+ 'boxul' => '┘',
+ 'boxUR' => '╚',
+ 'boxUr' => '╙',
+ 'boxuR' => '╘',
+ 'boxur' => '└',
+ 'boxV' => '║',
+ 'boxv' => '│',
+ 'boxVH' => '╬',
+ 'boxVh' => '╫',
+ 'boxvH' => '╪',
+ 'boxvh' => '┼',
+ 'boxVL' => '╣',
+ 'boxVl' => '╢',
+ 'boxvL' => '╡',
+ 'boxvl' => '┤',
+ 'boxVR' => '╠',
+ 'boxVr' => '╟',
+ 'boxvR' => '╞',
+ 'boxvr' => '├',
+ 'Acy' => 'А',
+ 'acy' => 'а',
+ 'Bcy' => 'Б',
+ 'bcy' => 'б',
+ 'CHcy' => 'Ч',
+ 'chcy' => 'ч',
+ 'Dcy' => 'Д',
+ 'dcy' => 'д',
+ 'Ecy' => 'Э',
+ 'ecy' => 'э',
+ 'Fcy' => 'Ф',
+ 'fcy' => 'ф',
+ 'Gcy' => 'Г',
+ 'gcy' => 'г',
+ 'HARDcy' => 'Ъ',
+ 'hardcy' => 'ъ',
+ 'Icy' => 'И',
+ 'icy' => 'и',
+ 'IEcy' => 'Е',
+ 'iecy' => 'е',
+ 'IOcy' => 'Ё',
+ 'iocy' => 'ё',
+ 'Jcy' => 'Й',
+ 'jcy' => 'й',
+ 'Kcy' => 'К',
+ 'kcy' => 'к',
+ 'KHcy' => 'Х',
+ 'khcy' => 'х',
+ 'Lcy' => 'Л',
+ 'lcy' => 'л',
+ 'Mcy' => 'М',
+ 'mcy' => 'м',
+ 'Ncy' => 'Н',
+ 'ncy' => 'н',
+ 'numero' => '№',
+ 'Ocy' => 'О',
+ 'ocy' => 'о',
+ 'Pcy' => 'П',
+ 'pcy' => 'п',
+ 'Rcy' => 'Р',
+ 'rcy' => 'р',
+ 'Scy' => 'С',
+ 'scy' => 'с',
+ 'SHCHcy' => 'Щ',
+ 'shchcy' => 'щ',
+ 'SHcy' => 'Ш',
+ 'shcy' => 'ш',
+ 'SOFTcy' => 'Ь',
+ 'softcy' => 'ь',
+ 'Tcy' => 'Т',
+ 'tcy' => 'т',
+ 'TScy' => 'Ц',
+ 'tscy' => 'ц',
+ 'Ucy' => 'У',
+ 'ucy' => 'у',
+ 'Vcy' => 'В',
+ 'vcy' => 'в',
+ 'YAcy' => 'Я',
+ 'yacy' => 'я',
+ 'Ycy' => 'Ы',
+ 'ycy' => 'ы',
+ 'YUcy' => 'Ю',
+ 'yucy' => 'ю',
+ 'Zcy' => 'З',
+ 'zcy' => 'з',
+ 'ZHcy' => 'Ж',
+ 'zhcy' => 'ж',
+ 'DJcy' => 'Ђ',
+ 'djcy' => 'ђ',
+ 'DScy' => 'Ѕ',
+ 'dscy' => 'ѕ',
+ 'DZcy' => 'Џ',
+ 'dzcy' => 'џ',
+ 'GJcy' => 'Ѓ',
+ 'gjcy' => 'ѓ',
+ 'Iukcy' => 'І',
+ 'iukcy' => 'і',
+ 'Jsercy' => 'Ј',
+ 'jsercy' => 'ј',
+ 'Jukcy' => 'Є',
+ 'jukcy' => 'є',
+ 'KJcy' => 'Ќ',
+ 'kjcy' => 'ќ',
+ 'LJcy' => 'Љ',
+ 'ljcy' => 'љ',
+ 'NJcy' => 'Њ',
+ 'njcy' => 'њ',
+ 'TSHcy' => 'Ћ',
+ 'tshcy' => 'ћ',
+ 'Ubrcy' => 'Ў',
+ 'ubrcy' => 'ў',
+ 'YIcy' => 'Ї',
+ 'yicy' => 'ї',
+ 'acute' => '´',
+ 'breve' => '˘',
+ 'caron' => 'ˇ',
+ 'cedil' => '¸',
+ 'circ' => 'ˆ',
+ 'dblac' => '˝',
+ 'die' => '¨',
+ 'dot' => '˙',
+ 'grave' => '`',
+ 'macr' => '¯',
+ 'ogon' => '˛',
+ 'ring' => '˚',
+ 'tilde' => '˜',
+ 'uml' => '¨',
+ 'Aacute' => 'Á',
+ 'aacute' => 'á',
+ 'Acirc' => 'Â',
+ 'acirc' => 'â',
+ 'AElig' => 'Æ',
+ 'aelig' => 'æ',
+ 'Agrave' => 'À',
+ 'agrave' => 'à',
+ 'Aring' => 'Å',
+ 'aring' => 'å',
+ 'Atilde' => 'Ã',
+ 'atilde' => 'ã',
+ 'Auml' => 'Ä',
+ 'auml' => 'ä',
+ 'Ccedil' => 'Ç',
+ 'ccedil' => 'ç',
+ 'Eacute' => 'É',
+ 'eacute' => 'é',
+ 'Ecirc' => 'Ê',
+ 'ecirc' => 'ê',
+ 'Egrave' => 'È',
+ 'egrave' => 'è',
+ 'ETH' => 'Ð',
+ 'eth' => 'ð',
+ 'Euml' => 'Ë',
+ 'euml' => 'ë',
+ 'Iacute' => 'Í',
+ 'iacute' => 'í',
+ 'Icirc' => 'Î',
+ 'icirc' => 'î',
+ 'Igrave' => 'Ì',
+ 'igrave' => 'ì',
+ 'Iuml' => 'Ï',
+ 'iuml' => 'ï',
+ 'Ntilde' => 'Ñ',
+ 'ntilde' => 'ñ',
+ 'Oacute' => 'Ó',
+ 'oacute' => 'ó',
+ 'Ocirc' => 'Ô',
+ 'ocirc' => 'ô',
+ 'Ograve' => 'Ò',
+ 'ograve' => 'ò',
+ 'Oslash' => 'Ø',
+ 'oslash' => 'ø',
+ 'Otilde' => 'Õ',
+ 'otilde' => 'õ',
+ 'Ouml' => 'Ö',
+ 'ouml' => 'ö',
+ 'szlig' => 'ß',
+ 'THORN' => 'Þ',
+ 'thorn' => 'þ',
+ 'Uacute' => 'Ú',
+ 'uacute' => 'ú',
+ 'Ucirc' => 'Û',
+ 'ucirc' => 'û',
+ 'Ugrave' => 'Ù',
+ 'ugrave' => 'ù',
+ 'Uuml' => 'Ü',
+ 'uuml' => 'ü',
+ 'Yacute' => 'Ý',
+ 'yacute' => 'ý',
+ 'yuml' => 'ÿ',
+ 'Abreve' => 'Ă',
+ 'abreve' => 'ă',
+ 'Amacr' => 'Ā',
+ 'amacr' => 'ā',
+ 'Aogon' => 'Ą',
+ 'aogon' => 'ą',
+ 'Cacute' => 'Ć',
+ 'cacute' => 'ć',
+ 'Ccaron' => 'Č',
+ 'ccaron' => 'č',
+ 'Ccirc' => 'Ĉ',
+ 'ccirc' => 'ĉ',
+ 'Cdot' => 'Ċ',
+ 'cdot' => 'ċ',
+ 'Dcaron' => 'Ď',
+ 'dcaron' => 'ď',
+ 'Dstrok' => 'Đ',
+ 'dstrok' => 'đ',
+ 'Ecaron' => 'Ě',
+ 'ecaron' => 'ě',
+ 'Edot' => 'Ė',
+ 'edot' => 'ė',
+ 'Emacr' => 'Ē',
+ 'emacr' => 'ē',
+ 'ENG' => 'Ŋ',
+ 'eng' => 'ŋ',
+ 'Eogon' => 'Ę',
+ 'eogon' => 'ę',
+ 'gacute' => 'ǵ',
+ 'Gbreve' => 'Ğ',
+ 'gbreve' => 'ğ',
+ 'Gcedil' => 'Ģ',
+ 'Gcirc' => 'Ĝ',
+ 'gcirc' => 'ĝ',
+ 'Gdot' => 'Ġ',
+ 'gdot' => 'ġ',
+ 'Hcirc' => 'Ĥ',
+ 'hcirc' => 'ĥ',
+ 'Hstrok' => 'Ħ',
+ 'hstrok' => 'ħ',
+ 'Idot' => 'İ',
+ 'IJlig' => 'IJ',
+ 'ijlig' => 'ij',
+ 'Imacr' => 'Ī',
+ 'imacr' => 'ī',
+ 'inodot' => 'ı',
+ 'Iogon' => 'Į',
+ 'iogon' => 'į',
+ 'Itilde' => 'Ĩ',
+ 'itilde' => 'ĩ',
+ 'Jcirc' => 'Ĵ',
+ 'jcirc' => 'ĵ',
+ 'Kcedil' => 'Ķ',
+ 'kcedil' => 'ķ',
+ 'kgreen' => 'ĸ',
+ 'Lacute' => 'Ĺ',
+ 'lacute' => 'ĺ',
+ 'Lcaron' => 'Ľ',
+ 'lcaron' => 'ľ',
+ 'Lcedil' => 'Ļ',
+ 'lcedil' => 'ļ',
+ 'Lmidot' => 'Ŀ',
+ 'lmidot' => 'ŀ',
+ 'Lstrok' => 'Ł',
+ 'lstrok' => 'ł',
+ 'Nacute' => 'Ń',
+ 'nacute' => 'ń',
+ 'napos' => 'ʼn',
+ 'Ncaron' => 'Ň',
+ 'ncaron' => 'ň',
+ 'Ncedil' => 'Ņ',
+ 'ncedil' => 'ņ',
+ 'Odblac' => 'Ő',
+ 'odblac' => 'ő',
+ 'OElig' => 'Œ',
+ 'oelig' => 'œ',
+ 'Omacr' => 'Ō',
+ 'omacr' => 'ō',
+ 'Racute' => 'Ŕ',
+ 'racute' => 'ŕ',
+ 'Rcaron' => 'Ř',
+ 'rcaron' => 'ř',
+ 'Rcedil' => 'Ŗ',
+ 'rcedil' => 'ŗ',
+ 'Sacute' => 'Ś',
+ 'sacute' => 'ś',
+ 'Scaron' => 'Š',
+ 'scaron' => 'š',
+ 'Scedil' => 'Ş',
+ 'scedil' => 'ş',
+ 'Scirc' => 'Ŝ',
+ 'scirc' => 'ŝ',
+ 'Tcaron' => 'Ť',
+ 'tcaron' => 'ť',
+ 'Tcedil' => 'Ţ',
+ 'tcedil' => 'ţ',
+ 'Tstrok' => 'Ŧ',
+ 'tstrok' => 'ŧ',
+ 'Ubreve' => 'Ŭ',
+ 'ubreve' => 'ŭ',
+ 'Udblac' => 'Ű',
+ 'udblac' => 'ű',
+ 'Umacr' => 'Ū',
+ 'umacr' => 'ū',
+ 'Uogon' => 'Ų',
+ 'uogon' => 'ų',
+ 'Uring' => 'Ů',
+ 'uring' => 'ů',
+ 'Utilde' => 'Ũ',
+ 'utilde' => 'ũ',
+ 'Wcirc' => 'Ŵ',
+ 'wcirc' => 'ŵ',
+ 'Ycirc' => 'Ŷ',
+ 'ycirc' => 'ŷ',
+ 'Yuml' => 'Ÿ',
+ 'Zacute' => 'Ź',
+ 'zacute' => 'ź',
+ 'Zcaron' => 'Ž',
+ 'zcaron' => 'ž',
+ 'Zdot' => 'Ż',
+ 'zdot' => 'ż',
+ 'apos' => ''',
+ 'ast' => '*',
+ 'brvbar' => '¦',
+ 'bsol' => '\',
+ 'cent' => '¢',
+ 'colon' => ':',
+ 'comma' => ',',
+ 'commat' => '@',
+ 'copy' => '©',
+ 'curren' => '¤',
+ 'darr' => '↓',
+ 'deg' => '°',
+ 'divide' => '÷',
+ 'dollar' => '$',
+ 'equals' => '=',
+ 'excl' => '!',
+ 'frac12' => '½',
+ 'frac14' => '¼',
+ 'frac18' => '⅛',
+ 'frac34' => '¾',
+ 'frac38' => '⅜',
+ 'frac58' => '⅝',
+ 'frac78' => '⅞',
+ 'gt' => '>',
+ 'half' => '½',
+ 'horbar' => '―',
+ 'hyphen' => '‐',
+ 'iexcl' => '¡',
+ 'iquest' => '¿',
+ 'laquo' => '«',
+ 'larr' => '←',
+ 'lcub' => '{',
+ 'ldquo' => '“',
+ 'lowbar' => '_',
+ 'lpar' => '(',
+ 'lsqb' => '[',
+ 'lsquo' => '‘',
+ 'micro' => 'µ',
+ 'middot' => '·',
+ 'nbsp' => ' ',
+ 'not' => '¬',
+ 'num' => '#',
+ 'ohm' => 'Ω',
+ 'ordf' => 'ª',
+ 'ordm' => 'º',
+ 'para' => '¶',
+ 'percnt' => '%',
+ 'period' => '.',
+ 'plus' => '+',
+ 'plusmn' => '±',
+ 'pound' => '£',
+ 'quest' => '?',
+ 'quot' => '"',
+ 'raquo' => '»',
+ 'rarr' => '→',
+ 'rcub' => '}',
+ 'rdquo' => '”',
+ 'reg' => '®',
+ 'rpar' => ')',
+ 'rsqb' => ']',
+ 'rsquo' => '’',
+ 'sect' => '§',
+ 'semi' => ';',
+ 'shy' => '',
+ 'sol' => '/',
+ 'sung' => '♪',
+ 'sup1' => '¹',
+ 'sup2' => '²',
+ 'sup3' => '³',
+ 'times' => '×',
+ 'trade' => '™',
+ 'uarr' => '↑',
+ 'verbar' => '|',
+ 'yen' => '¥',
+ 'blank' => '␣',
+ 'blk12' => '▒',
+ 'blk14' => '░',
+ 'blk34' => '▓',
+ 'block' => '█',
+ 'bull' => '•',
+ 'caret' => '⁁',
+ 'check' => '✓',
+ 'cir' => '○',
+ 'clubs' => '♣',
+ 'copysr' => '℗',
+ 'cross' => '✗',
+ 'Dagger' => '‡',
+ 'dagger' => '†',
+ 'dash' => '‐',
+ 'diams' => '♦',
+ 'dlcrop' => '⌍',
+ 'drcrop' => '⌌',
+ 'dtri' => '▿',
+ 'dtrif' => '▾',
+ 'emsp' => ' ',
+ 'emsp13' => ' ',
+ 'emsp14' => ' ',
+ 'ensp' => ' ',
+ 'female' => '♀',
+ 'ffilig' => 'ffi',
+ 'fflig' => 'ff',
+ 'ffllig' => 'ffl',
+ 'filig' => 'fi',
+ 'flat' => '♭',
+ 'fllig' => 'fl',
+ 'frac13' => '⅓',
+ 'frac15' => '⅕',
+ 'frac16' => '⅙',
+ 'frac23' => '⅔',
+ 'frac25' => '⅖',
+ 'frac35' => '⅗',
+ 'frac45' => '⅘',
+ 'frac56' => '⅚',
+ 'hairsp' => ' ',
+ 'hearts' => '♥',
+ 'hellip' => '…',
+ 'hybull' => '⁃',
+ 'incare' => '℅',
+ 'ldquor' => '„',
+ 'lhblk' => '▄',
+ 'loz' => '◊',
+ 'lozf' => '⧫',
+ 'lsquor' => '‚',
+ 'ltri' => '◃',
+ 'ltrif' => '◂',
+ 'male' => '♂',
+ 'malt' => '✠',
+ 'marker' => '▮',
+ 'mdash' => '—',
+ 'mldr' => '…',
+ 'natur' => '♮',
+ 'ndash' => '–',
+ 'nldr' => '‥',
+ 'numsp' => ' ',
+ 'phone' => '☎',
+ 'puncsp' => ' ',
+ 'rdquor' => '”',
+ 'rect' => '▭',
+ 'rsquor' => '’',
+ 'rtri' => '▹',
+ 'rtrif' => '▸',
+ 'rx' => '℞',
+ 'sext' => '✶',
+ 'sharp' => '♯',
+ 'spades' => '♠',
+ 'squ' => '□',
+ 'squf' => '▪',
+ 'star' => '☆',
+ 'starf' => '★',
+ 'target' => '⌖',
+ 'telrec' => '⌕',
+ 'thinsp' => ' ',
+ 'uhblk' => '▀',
+ 'ulcrop' => '⌏',
+ 'urcrop' => '⌎',
+ 'utri' => '▵',
+ 'utrif' => '▴',
+ 'vellip' => '⋮',
+ 'af' => '',
+ 'aopf' => '𝕒',
+ 'asympeq' => '≍',
+ 'bopf' => '𝕓',
+ 'copf' => '𝕔',
+ 'Cross' => '⨯',
+ 'DD' => 'ⅅ',
+ 'dd' => 'ⅆ',
+ 'dopf' => '𝕕',
+ 'DownArrowBar' => '⤓',
+ 'DownBreve' => '̑',
+ 'DownLeftRightVector' => '⥐',
+ 'DownLeftTeeVector' => '⥞',
+ 'DownLeftVectorBar' => '⥖',
+ 'DownRightTeeVector' => '⥟',
+ 'DownRightVectorBar' => '⥗',
+ 'ee' => 'ⅇ',
+ 'EmptySmallSquare' => '◻',
+ 'EmptyVerySmallSquare' => '▫',
+ 'eopf' => '𝕖',
+ 'Equal' => '⩵',
+ 'FilledSmallSquare' => '◼',
+ 'FilledVerySmallSquare' => '▪',
+ 'fopf' => '𝕗',
+ 'gopf' => '𝕘',
+ 'GreaterGreater' => '⪢',
+ 'Hat' => '^',
+ 'hopf' => '𝕙',
+ 'HorizontalLine' => '─',
+ 'ic' => '',
+ 'ii' => 'ⅈ',
+ 'iopf' => '𝕚',
+ 'it' => '',
+ 'jopf' => '𝕛',
+ 'kopf' => '𝕜',
+ 'larrb' => '⇤',
+ 'LeftDownTeeVector' => '⥡',
+ 'LeftDownVectorBar' => '⥙',
+ 'LeftRightVector' => '⥎',
+ 'LeftTeeVector' => '⥚',
+ 'LeftTriangleBar' => '⧏',
+ 'LeftUpDownVector' => '⥑',
+ 'LeftUpTeeVector' => '⥠',
+ 'LeftUpVectorBar' => '⥘',
+ 'LeftVectorBar' => '⥒',
+ 'LessLess' => '⪡',
+ 'lopf' => '𝕝',
+ 'mapstodown' => '↧',
+ 'mapstoleft' => '↤',
+ 'mapstoup' => '↥',
+ 'MediumSpace' => ' ',
+ 'mopf' => '𝕞',
+ 'nbump' => '≎̸',
+ 'nbumpe' => '≏̸',
+ 'nesim' => '≂̸',
+ 'NewLine' => '
',
+ 'NoBreak' => '',
+ 'nopf' => '𝕟',
+ 'NotCupCap' => '≭',
+ 'NotHumpEqual' => '≏̸',
+ 'NotLeftTriangleBar' => '⧏̸',
+ 'NotNestedGreaterGreater' => '⪢̸',
+ 'NotNestedLessLess' => '⪡̸',
+ 'NotRightTriangleBar' => '⧐̸',
+ 'NotSquareSubset' => '⊏̸',
+ 'NotSquareSuperset' => '⊐̸',
+ 'NotSucceedsTilde' => '≿̸',
+ 'oopf' => '𝕠',
+ 'OverBar' => '¯',
+ 'OverBrace' => '︷',
+ 'OverBracket' => '⎴',
+ 'OverParenthesis' => '︵',
+ 'planckh' => 'ℎ',
+ 'popf' => '𝕡',
+ 'Product' => '∏',
+ 'qopf' => '𝕢',
+ 'rarrb' => '⇥',
+ 'RightDownTeeVector' => '⥝',
+ 'RightDownVectorBar' => '⥕',
+ 'RightTeeVector' => '⥛',
+ 'RightTriangleBar' => '⧐',
+ 'RightUpDownVector' => '⥏',
+ 'RightUpTeeVector' => '⥜',
+ 'RightUpVectorBar' => '⥔',
+ 'RightVectorBar' => '⥓',
+ 'ropf' => '𝕣',
+ 'RoundImplies' => '⥰',
+ 'RuleDelayed' => '⧴',
+ 'sopf' => '𝕤',
+ 'Tab' => ' ',
+ 'ThickSpace' => ' ',
+ 'topf' => '𝕥',
+ 'UnderBar' => '̲',
+ 'UnderBrace' => '︸',
+ 'UnderBracket' => '⎵',
+ 'UnderParenthesis' => '︶',
+ 'uopf' => '𝕦',
+ 'UpArrowBar' => '⤒',
+ 'Upsilon' => 'Υ',
+ 'VerticalLine' => '|',
+ 'VerticalSeparator' => '❘',
+ 'vopf' => '𝕧',
+ 'wopf' => '𝕨',
+ 'xopf' => '𝕩',
+ 'yopf' => '𝕪',
+ 'ZeroWidthSpace' => '',
+ 'zopf' => '𝕫',
+ 'angle' => '∠',
+ 'ApplyFunction' => '',
+ 'approx' => '≈',
+ 'approxeq' => '≊',
+ 'Assign' => '≔',
+ 'backcong' => '≌',
+ 'backepsilon' => '϶',
+ 'backprime' => '‵',
+ 'backsim' => '∽',
+ 'backsimeq' => '⋍',
+ 'Backslash' => '∖',
+ 'barwedge' => '⌅',
+ 'Because' => '∵',
+ 'because' => '∵',
+ 'Bernoullis' => 'ℬ',
+ 'between' => '≬',
+ 'bigcap' => '⋂',
+ 'bigcirc' => '◯',
+ 'bigcup' => '⋃',
+ 'bigodot' => '⨀',
+ 'bigoplus' => '⨁',
+ 'bigotimes' => '⨂',
+ 'bigsqcup' => '⨆',
+ 'bigstar' => '★',
+ 'bigtriangledown' => '▽',
+ 'bigtriangleup' => '△',
+ 'biguplus' => '⨄',
+ 'bigvee' => '⋁',
+ 'bigwedge' => '⋀',
+ 'bkarow' => '⤍',
+ 'blacklozenge' => '⧫',
+ 'blacksquare' => '▪',
+ 'blacktriangle' => '▴',
+ 'blacktriangledown' => '▾',
+ 'blacktriangleleft' => '◂',
+ 'blacktriangleright' => '▸',
+ 'bot' => '⊥',
+ 'boxminus' => '⊟',
+ 'boxplus' => '⊞',
+ 'boxtimes' => '⊠',
+ 'Breve' => '˘',
+ 'bullet' => '•',
+ 'Bumpeq' => '≎',
+ 'bumpeq' => '≏',
+ 'CapitalDifferentialD' => 'ⅅ',
+ 'Cayleys' => 'ℭ',
+ 'Cedilla' => '¸',
+ 'CenterDot' => '·',
+ 'centerdot' => '·',
+ 'checkmark' => '✓',
+ 'circeq' => '≗',
+ 'circlearrowleft' => '↺',
+ 'circlearrowright' => '↻',
+ 'circledast' => '⊛',
+ 'circledcirc' => '⊚',
+ 'circleddash' => '⊝',
+ 'CircleDot' => '⊙',
+ 'circledR' => '®',
+ 'circledS' => 'Ⓢ',
+ 'CircleMinus' => '⊖',
+ 'CirclePlus' => '⊕',
+ 'CircleTimes' => '⊗',
+ 'ClockwiseContourIntegral' => '∲',
+ 'CloseCurlyDoubleQuote' => '”',
+ 'CloseCurlyQuote' => '’',
+ 'clubsuit' => '♣',
+ 'coloneq' => '≔',
+ 'complement' => '∁',
+ 'complexes' => 'ℂ',
+ 'Congruent' => '≡',
+ 'ContourIntegral' => '∮',
+ 'Coproduct' => '∐',
+ 'CounterClockwiseContourIntegral' => '∳',
+ 'CupCap' => '≍',
+ 'curlyeqprec' => '⋞',
+ 'curlyeqsucc' => '⋟',
+ 'curlyvee' => '⋎',
+ 'curlywedge' => '⋏',
+ 'curvearrowleft' => '↶',
+ 'curvearrowright' => '↷',
+ 'dbkarow' => '⤏',
+ 'ddagger' => '‡',
+ 'ddotseq' => '⩷',
+ 'Del' => '∇',
+ 'DiacriticalAcute' => '´',
+ 'DiacriticalDot' => '˙',
+ 'DiacriticalDoubleAcute' => '˝',
+ 'DiacriticalGrave' => '`',
+ 'DiacriticalTilde' => '˜',
+ 'Diamond' => '⋄',
+ 'diamond' => '⋄',
+ 'diamondsuit' => '♦',
+ 'DifferentialD' => 'ⅆ',
+ 'digamma' => 'ϝ',
+ 'div' => '÷',
+ 'divideontimes' => '⋇',
+ 'doteq' => '≐',
+ 'doteqdot' => '≑',
+ 'DotEqual' => '≐',
+ 'dotminus' => '∸',
+ 'dotplus' => '∔',
+ 'dotsquare' => '⊡',
+ 'doublebarwedge' => '⌆',
+ 'DoubleContourIntegral' => '∯',
+ 'DoubleDot' => '¨',
+ 'DoubleDownArrow' => '⇓',
+ 'DoubleLeftArrow' => '⇐',
+ 'DoubleLeftRightArrow' => '⇔',
+ 'DoubleLeftTee' => '⫤',
+ 'DoubleLongLeftArrow' => '⟸',
+ 'DoubleLongLeftRightArrow' => '⟺',
+ 'DoubleLongRightArrow' => '⟹',
+ 'DoubleRightArrow' => '⇒',
+ 'DoubleRightTee' => '⊨',
+ 'DoubleUpArrow' => '⇑',
+ 'DoubleUpDownArrow' => '⇕',
+ 'DoubleVerticalBar' => '∥',
+ 'DownArrow' => '↓',
+ 'Downarrow' => '⇓',
+ 'downarrow' => '↓',
+ 'DownArrowUpArrow' => '⇵',
+ 'downdownarrows' => '⇊',
+ 'downharpoonleft' => '⇃',
+ 'downharpoonright' => '⇂',
+ 'DownLeftVector' => '↽',
+ 'DownRightVector' => '⇁',
+ 'DownTee' => '⊤',
+ 'DownTeeArrow' => '↧',
+ 'drbkarow' => '⤐',
+ 'Element' => '∈',
+ 'emptyset' => '∅',
+ 'eqcirc' => '≖',
+ 'eqcolon' => '≕',
+ 'eqsim' => '≂',
+ 'eqslantgtr' => '⪖',
+ 'eqslantless' => '⪕',
+ 'EqualTilde' => '≂',
+ 'Equilibrium' => '⇌',
+ 'Exists' => '∃',
+ 'expectation' => 'ℰ',
+ 'ExponentialE' => 'ⅇ',
+ 'exponentiale' => 'ⅇ',
+ 'fallingdotseq' => '≒',
+ 'ForAll' => '∀',
+ 'Fouriertrf' => 'ℱ',
+ 'geq' => '≥',
+ 'geqq' => '≧',
+ 'geqslant' => '⩾',
+ 'gg' => '≫',
+ 'ggg' => '⋙',
+ 'gnapprox' => '⪊',
+ 'gneq' => '⪈',
+ 'gneqq' => '≩',
+ 'GreaterEqual' => '≥',
+ 'GreaterEqualLess' => '⋛',
+ 'GreaterFullEqual' => '≧',
+ 'GreaterLess' => '≷',
+ 'GreaterSlantEqual' => '⩾',
+ 'GreaterTilde' => '≳',
+ 'gtrapprox' => '⪆',
+ 'gtrdot' => '⋗',
+ 'gtreqless' => '⋛',
+ 'gtreqqless' => '⪌',
+ 'gtrless' => '≷',
+ 'gtrsim' => '≳',
+ 'gvertneqq' => '≩︀',
+ 'Hacek' => 'ˇ',
+ 'hbar' => 'ℏ',
+ 'heartsuit' => '♥',
+ 'HilbertSpace' => 'ℋ',
+ 'hksearow' => '⤥',
+ 'hkswarow' => '⤦',
+ 'hookleftarrow' => '↩',
+ 'hookrightarrow' => '↪',
+ 'hslash' => 'ℏ',
+ 'HumpDownHump' => '≎',
+ 'HumpEqual' => '≏',
+ 'iiiint' => '⨌',
+ 'iiint' => '∭',
+ 'Im' => 'ℑ',
+ 'ImaginaryI' => 'ⅈ',
+ 'imagline' => 'ℐ',
+ 'imagpart' => 'ℑ',
+ 'Implies' => '⇒',
+ 'in' => '∈',
+ 'integers' => 'ℤ',
+ 'Integral' => '∫',
+ 'intercal' => '⊺',
+ 'Intersection' => '⋂',
+ 'intprod' => '⨼',
+ 'InvisibleComma' => '',
+ 'InvisibleTimes' => '',
+ 'langle' => '〈',
+ 'Laplacetrf' => 'ℒ',
+ 'lbrace' => '{',
+ 'lbrack' => '[',
+ 'LeftAngleBracket' => '〈',
+ 'LeftArrow' => '←',
+ 'Leftarrow' => '⇐',
+ 'leftarrow' => '←',
+ 'LeftArrowBar' => '⇤',
+ 'LeftArrowRightArrow' => '⇆',
+ 'leftarrowtail' => '↢',
+ 'LeftCeiling' => '⌈',
+ 'LeftDoubleBracket' => '〚',
+ 'LeftDownVector' => '⇃',
+ 'LeftFloor' => '⌊',
+ 'leftharpoondown' => '↽',
+ 'leftharpoonup' => '↼',
+ 'leftleftarrows' => '⇇',
+ 'LeftRightArrow' => '↔',
+ 'Leftrightarrow' => '⇔',
+ 'leftrightarrow' => '↔',
+ 'leftrightarrows' => '⇆',
+ 'leftrightharpoons' => '⇋',
+ 'leftrightsquigarrow' => '↭',
+ 'LeftTee' => '⊣',
+ 'LeftTeeArrow' => '↤',
+ 'leftthreetimes' => '⋋',
+ 'LeftTriangle' => '⊲',
+ 'LeftTriangleEqual' => '⊴',
+ 'LeftUpVector' => '↿',
+ 'LeftVector' => '↼',
+ 'leq' => '≤',
+ 'leqq' => '≦',
+ 'leqslant' => '⩽',
+ 'lessapprox' => '⪅',
+ 'lessdot' => '⋖',
+ 'lesseqgtr' => '⋚',
+ 'lesseqqgtr' => '⪋',
+ 'LessEqualGreater' => '⋚',
+ 'LessFullEqual' => '≦',
+ 'LessGreater' => '≶',
+ 'lessgtr' => '≶',
+ 'lesssim' => '≲',
+ 'LessSlantEqual' => '⩽',
+ 'LessTilde' => '≲',
+ 'll' => '≪',
+ 'llcorner' => '⌞',
+ 'Lleftarrow' => '⇚',
+ 'lmoustache' => '⎰',
+ 'lnapprox' => '⪉',
+ 'lneq' => '⪇',
+ 'lneqq' => '≨',
+ 'LongLeftArrow' => '⟵',
+ 'Longleftarrow' => '⟸',
+ 'longleftarrow' => '⟵',
+ 'LongLeftRightArrow' => '⟷',
+ 'Longleftrightarrow' => '⟺',
+ 'longleftrightarrow' => '⟷',
+ 'longmapsto' => '⟼',
+ 'LongRightArrow' => '⟶',
+ 'Longrightarrow' => '⟹',
+ 'longrightarrow' => '⟶',
+ 'looparrowleft' => '↫',
+ 'looparrowright' => '↬',
+ 'LowerLeftArrow' => '↙',
+ 'LowerRightArrow' => '↘',
+ 'lozenge' => '◊',
+ 'lrcorner' => '⌟',
+ 'Lsh' => '↰',
+ 'lvertneqq' => '≨︀',
+ 'maltese' => '✠',
+ 'mapsto' => '↦',
+ 'measuredangle' => '∡',
+ 'Mellintrf' => 'ℳ',
+ 'MinusPlus' => '∓',
+ 'mp' => '∓',
+ 'multimap' => '⊸',
+ 'napprox' => '≉',
+ 'natural' => '♮',
+ 'naturals' => 'ℕ',
+ 'nearrow' => '↗',
+ 'NegativeMediumSpace' => '',
+ 'NegativeThickSpace' => '',
+ 'NegativeThinSpace' => '',
+ 'NegativeVeryThinSpace' => '',
+ 'NestedGreaterGreater' => '≫',
+ 'NestedLessLess' => '≪',
+ 'nexists' => '∄',
+ 'ngeq' => '≱',
+ 'ngeqq' => '≧̸',
+ 'ngeqslant' => '⩾̸',
+ 'ngtr' => '≯',
+ 'nLeftarrow' => '⇍',
+ 'nleftarrow' => '↚',
+ 'nLeftrightarrow' => '⇎',
+ 'nleftrightarrow' => '↮',
+ 'nleq' => '≰',
+ 'nleqq' => '≦̸',
+ 'nleqslant' => '⩽̸',
+ 'nless' => '≮',
+ 'NonBreakingSpace' => ' ',
+ 'NotCongruent' => '≢',
+ 'NotDoubleVerticalBar' => '∦',
+ 'NotElement' => '∉',
+ 'NotEqual' => '≠',
+ 'NotEqualTilde' => '≂̸',
+ 'NotExists' => '∄',
+ 'NotGreater' => '≯',
+ 'NotGreaterEqual' => '≱',
+ 'NotGreaterFullEqual' => '≦̸',
+ 'NotGreaterGreater' => '≫̸',
+ 'NotGreaterLess' => '≹',
+ 'NotGreaterSlantEqual' => '⩾̸',
+ 'NotGreaterTilde' => '≵',
+ 'NotHumpDownHump' => '≎̸',
+ 'NotLeftTriangle' => '⋪',
+ 'NotLeftTriangleEqual' => '⋬',
+ 'NotLess' => '≮',
+ 'NotLessEqual' => '≰',
+ 'NotLessGreater' => '≸',
+ 'NotLessLess' => '≪̸',
+ 'NotLessSlantEqual' => '⩽̸',
+ 'NotLessTilde' => '≴',
+ 'NotPrecedes' => '⊀',
+ 'NotPrecedesEqual' => '⪯̸',
+ 'NotPrecedesSlantEqual' => '⋠',
+ 'NotReverseElement' => '∌',
+ 'NotRightTriangle' => '⋫',
+ 'NotRightTriangleEqual' => '⋭',
+ 'NotSquareSubsetEqual' => '⋢',
+ 'NotSquareSupersetEqual' => '⋣',
+ 'NotSubset' => '⊂⃒',
+ 'NotSubsetEqual' => '⊈',
+ 'NotSucceeds' => '⊁',
+ 'NotSucceedsEqual' => '⪰̸',
+ 'NotSucceedsSlantEqual' => '⋡',
+ 'NotSuperset' => '⊃⃒',
+ 'NotSupersetEqual' => '⊉',
+ 'NotTilde' => '≁',
+ 'NotTildeEqual' => '≄',
+ 'NotTildeFullEqual' => '≇',
+ 'NotTildeTilde' => '≉',
+ 'NotVerticalBar' => '∤',
+ 'nparallel' => '∦',
+ 'nprec' => '⊀',
+ 'npreceq' => '⪯̸',
+ 'nRightarrow' => '⇏',
+ 'nrightarrow' => '↛',
+ 'nshortmid' => '∤',
+ 'nshortparallel' => '∦',
+ 'nsimeq' => '≄',
+ 'nsubset' => '⊂⃒',
+ 'nsubseteq' => '⊈',
+ 'nsubseteqq' => '⫅̸',
+ 'nsucc' => '⊁',
+ 'nsucceq' => '⪰̸',
+ 'nsupset' => '⊃⃒',
+ 'nsupseteq' => '⊉',
+ 'nsupseteqq' => '⫆̸',
+ 'ntriangleleft' => '⋪',
+ 'ntrianglelefteq' => '⋬',
+ 'ntriangleright' => '⋫',
+ 'ntrianglerighteq' => '⋭',
+ 'nwarrow' => '↖',
+ 'oint' => '∮',
+ 'OpenCurlyDoubleQuote' => '“',
+ 'OpenCurlyQuote' => '‘',
+ 'orderof' => 'ℴ',
+ 'parallel' => '∥',
+ 'PartialD' => '∂',
+ 'pitchfork' => '⋔',
+ 'PlusMinus' => '±',
+ 'pm' => '±',
+ 'Poincareplane' => 'ℌ',
+ 'prec' => '≺',
+ 'precapprox' => '⪷',
+ 'preccurlyeq' => '≼',
+ 'Precedes' => '≺',
+ 'PrecedesEqual' => '⪯',
+ 'PrecedesSlantEqual' => '≼',
+ 'PrecedesTilde' => '≾',
+ 'preceq' => '⪯',
+ 'precnapprox' => '⪹',
+ 'precneqq' => '⪵',
+ 'precnsim' => '⋨',
+ 'precsim' => '≾',
+ 'primes' => 'ℙ',
+ 'Proportion' => '∷',
+ 'Proportional' => '∝',
+ 'propto' => '∝',
+ 'quaternions' => 'ℍ',
+ 'questeq' => '≟',
+ 'rangle' => '〉',
+ 'rationals' => 'ℚ',
+ 'rbrace' => '}',
+ 'rbrack' => ']',
+ 'Re' => 'ℜ',
+ 'realine' => 'ℛ',
+ 'realpart' => 'ℜ',
+ 'reals' => 'ℝ',
+ 'ReverseElement' => '∋',
+ 'ReverseEquilibrium' => '⇋',
+ 'ReverseUpEquilibrium' => '⥯',
+ 'RightAngleBracket' => '〉',
+ 'RightArrow' => '→',
+ 'Rightarrow' => '⇒',
+ 'rightarrow' => '→',
+ 'RightArrowBar' => '⇥',
+ 'RightArrowLeftArrow' => '⇄',
+ 'rightarrowtail' => '↣',
+ 'RightCeiling' => '⌉',
+ 'RightDoubleBracket' => '〛',
+ 'RightDownVector' => '⇂',
+ 'RightFloor' => '⌋',
+ 'rightharpoondown' => '⇁',
+ 'rightharpoonup' => '⇀',
+ 'rightleftarrows' => '⇄',
+ 'rightleftharpoons' => '⇌',
+ 'rightrightarrows' => '⇉',
+ 'rightsquigarrow' => '↝',
+ 'RightTee' => '⊢',
+ 'RightTeeArrow' => '↦',
+ 'rightthreetimes' => '⋌',
+ 'RightTriangle' => '⊳',
+ 'RightTriangleEqual' => '⊵',
+ 'RightUpVector' => '↾',
+ 'RightVector' => '⇀',
+ 'risingdotseq' => '≓',
+ 'rmoustache' => '⎱',
+ 'Rrightarrow' => '⇛',
+ 'Rsh' => '↱',
+ 'searrow' => '↘',
+ 'setminus' => '∖',
+ 'ShortDownArrow' => '↓',
+ 'ShortLeftArrow' => '←',
+ 'shortmid' => '∣',
+ 'shortparallel' => '∥',
+ 'ShortRightArrow' => '→',
+ 'ShortUpArrow' => '↑',
+ 'simeq' => '≃',
+ 'SmallCircle' => '∘',
+ 'smallsetminus' => '∖',
+ 'spadesuit' => '♠',
+ 'Sqrt' => '√',
+ 'sqsubset' => '⊏',
+ 'sqsubseteq' => '⊑',
+ 'sqsupset' => '⊐',
+ 'sqsupseteq' => '⊒',
+ 'Square' => '□',
+ 'SquareIntersection' => '⊓',
+ 'SquareSubset' => '⊏',
+ 'SquareSubsetEqual' => '⊑',
+ 'SquareSuperset' => '⊐',
+ 'SquareSupersetEqual' => '⊒',
+ 'SquareUnion' => '⊔',
+ 'Star' => '⋆',
+ 'straightepsilon' => 'ϵ',
+ 'straightphi' => 'ϕ',
+ 'Subset' => '⋐',
+ 'subset' => '⊂',
+ 'subseteq' => '⊆',
+ 'subseteqq' => '⫅',
+ 'SubsetEqual' => '⊆',
+ 'subsetneq' => '⊊',
+ 'subsetneqq' => '⫋',
+ 'succ' => '≻',
+ 'succapprox' => '⪸',
+ 'succcurlyeq' => '≽',
+ 'Succeeds' => '≻',
+ 'SucceedsEqual' => '⪰',
+ 'SucceedsSlantEqual' => '≽',
+ 'SucceedsTilde' => '≿',
+ 'succeq' => '⪰',
+ 'succnapprox' => '⪺',
+ 'succneqq' => '⪶',
+ 'succnsim' => '⋩',
+ 'succsim' => '≿',
+ 'SuchThat' => '∋',
+ 'Sum' => '∑',
+ 'Superset' => '⊃',
+ 'SupersetEqual' => '⊇',
+ 'Supset' => '⋑',
+ 'supset' => '⊃',
+ 'supseteq' => '⊇',
+ 'supseteqq' => '⫆',
+ 'supsetneq' => '⊋',
+ 'supsetneqq' => '⫌',
+ 'swarrow' => '↙',
+ 'Therefore' => '∴',
+ 'therefore' => '∴',
+ 'thickapprox' => '≈',
+ 'thicksim' => '∼',
+ 'ThinSpace' => ' ',
+ 'Tilde' => '∼',
+ 'TildeEqual' => '≃',
+ 'TildeFullEqual' => '≅',
+ 'TildeTilde' => '≈',
+ 'toea' => '⤨',
+ 'tosa' => '⤩',
+ 'triangle' => '▵',
+ 'triangledown' => '▿',
+ 'triangleleft' => '◃',
+ 'trianglelefteq' => '⊴',
+ 'triangleq' => '≜',
+ 'triangleright' => '▹',
+ 'trianglerighteq' => '⊵',
+ 'TripleDot' => '⃛',
+ 'twoheadleftarrow' => '↞',
+ 'twoheadrightarrow' => '↠',
+ 'ulcorner' => '⌜',
+ 'Union' => '⋃',
+ 'UnionPlus' => '⊎',
+ 'UpArrow' => '↑',
+ 'Uparrow' => '⇑',
+ 'uparrow' => '↑',
+ 'UpArrowDownArrow' => '⇅',
+ 'UpDownArrow' => '↕',
+ 'Updownarrow' => '⇕',
+ 'updownarrow' => '↕',
+ 'UpEquilibrium' => '⥮',
+ 'upharpoonleft' => '↿',
+ 'upharpoonright' => '↾',
+ 'UpperLeftArrow' => '↖',
+ 'UpperRightArrow' => '↗',
+ 'upsilon' => 'υ',
+ 'UpTee' => '⊥',
+ 'UpTeeArrow' => '↥',
+ 'upuparrows' => '⇈',
+ 'urcorner' => '⌝',
+ 'varepsilon' => 'ε',
+ 'varkappa' => 'ϰ',
+ 'varnothing' => '∅',
+ 'varphi' => 'φ',
+ 'varpi' => 'ϖ',
+ 'varpropto' => '∝',
+ 'varrho' => 'ϱ',
+ 'varsigma' => 'ς',
+ 'varsubsetneq' => '⊊︀',
+ 'varsubsetneqq' => '⫋︀',
+ 'varsupsetneq' => '⊋︀',
+ 'varsupsetneqq' => '⫌︀',
+ 'vartheta' => 'ϑ',
+ 'vartriangleleft' => '⊲',
+ 'vartriangleright' => '⊳',
+ 'Vee' => '⋁',
+ 'vee' => '∨',
+ 'Vert' => '‖',
+ 'vert' => '|',
+ 'VerticalBar' => '∣',
+ 'VerticalTilde' => '≀',
+ 'VeryThinSpace' => ' ',
+ 'Wedge' => '⋀',
+ 'wedge' => '∧',
+ 'wp' => '℘',
+ 'wr' => '≀',
+ 'zeetrf' => 'ℨ'
+ }
+#:startdoc:
+
+# Converts XHTML+MathML named entities in string to Numeric Character References
+#
+# :call-seq:
+# string.to_ncr -> string
+#
+ def to_ncr
+   # Every "&name;" reference is swapped for whatever convert_to_ncr yields
+   # for the bare name; a new string is returned and the receiver is untouched.
+   entity_reference = /&([a-zA-Z0-9]+);/
+   gsub(entity_reference) do
+     Regexp.last_match(1).convert_to_ncr
+   end
+ end
+
+# Converts XHTML+MathML named entities in string to Numeric Character References
+#
+# :call-seq:
+# string.to_ncr! -> str or nil
+#
+# Substitution is done in-place.
+#
+ def to_ncr!
+   # In-place variant: gsub! hands back nil when no "&name;" reference was
+   # found, which is the "str or nil" result promised in the call-seq.
+   entity_reference = /&([a-zA-Z0-9]+);/
+   gsub!(entity_reference) do
+     Regexp.last_match(1).convert_to_ncr
+   end
+ end
+
+# Converts XHTML+MathML named entities in string to UTF-8
+#
+# :call-seq:
+# string.to_utf8 -> string
+#
+#--
+ def to_utf8
+   # Every "&name;" reference is swapped for whatever convert_to_utf8 yields
+   # for the bare name; a new string is returned.
+   #
+   # An alternative that splits on the entity pattern, converts the
+   # odd-indexed pieces and joins them again was tried; it is not faster.
+   entity_reference = /&([a-zA-Z0-9]+);/
+   gsub(entity_reference) do
+     Regexp.last_match(1).convert_to_utf8
+   end
+ end
+
+#++
+# Converts XHTML+MathML named entities in string to UTF-8
+#
+# :call-seq:
+# string.to_utf8! -> str or nil
+#
+# Substitution is done in-place.
+#
+ def to_utf8!
+   # In-place variant: gsub! hands back nil when no "&name;" reference was
+   # found, otherwise the modified receiver.
+   entity_reference = /&([a-zA-Z0-9]+);/
+   gsub!(entity_reference) do
+     Regexp.last_match(1).convert_to_utf8
+   end
+ end
+
+#:stopdoc:
+
+ # Returns a copy of the string with HTML escapes decoded:
+ #
+ # * the named entities amp, quot, gt and lt (matched case-insensitively);
+ # * decimal (&#NNN;) and hexadecimal (&#xHHH;) character references.
+ #   Values below 256 become a single character via Integer#chr, larger
+ #   values below 1114111 are packed as UTF-8.
+ #
+ # Anything else found between "&" and ";" -- an unknown name or an
+ # out-of-range reference -- is emitted again exactly as it was written.
+ def unescapeHTML
+   self.gsub(/&(.*?);/n) do
+     match = $1.dup
+     case match
+     when /\Aamp\z/ni then '&'
+     when /\Aquot\z/ni then '"'
+     when /\Agt\z/ni then '>'
+     when /\Alt\z/ni then '<'
+     when /\A#0*(\d+)\z/n
+       # Leading zeros are consumed by the pattern, so Integer() never sees
+       # something it would read as an octal literal.
+       codepoint = Integer($1)
+       if codepoint < 256
+         codepoint.chr
+       elsif codepoint < 1114111
+         [codepoint].pack("U")
+       else
+         # Out of range: keep the reference text untouched.
+         "&#{match};"
+       end
+     when /\A#x([0-9a-f]+)\z/ni
+       codepoint = $1.hex
+       if codepoint < 256
+         codepoint.chr
+       elsif codepoint < 1114111
+         [codepoint].pack("U")
+       else
+         # Out of range: keep the reference text untouched.
+         "&#{match};"
+       end
+     else
+       # Not something we decode: put the ampersand and name back as found.
+       "&#{match};"
+     end
+   end
+ end
+
+ protected
+
+ # Treats the receiver as a bare entity name (no "&" or ";") and replaces
+ # its contents in place, returning the receiver. The five predefined XML
+ # names (lt, gt, amp, quot, apos) and names missing from MATHML_ENTITIES
+ # are turned back into "&name;"; any other name becomes its
+ # MATHML_ENTITIES value.
+ def convert_to_ncr #:nodoc:
+   predefined = (self =~ /^(lt|gt|amp|quot|apos)$/)
+   substitute =
+     if !predefined && MATHML_ENTITIES.has_key?(self)
+       MATHML_ENTITIES[self]
+     else
+       "&" + self + ";"
+     end
+   self.replace substitute
+ end
+
+ # Treats the receiver as a bare entity name (no "&" or ";") and replaces
+ # its contents in place with UTF-8 text, returning the receiver. The five
+ # predefined XML names (lt, gt, amp, quot, apos) and names missing from
+ # MATHML_ENTITIES are turned back into "&name;" so they stay escaped.
+ def convert_to_utf8 #:nodoc:
+   if self !~ /^(lt|gt|amp|quot|apos)$/ && MATHML_ENTITIES.has_key?(self)
+     # A table value may be a run of "&#xHHHH;" references or already be
+     # literal text. Decode every reference and leave any other text alone;
+     # calling String#hex on the raw value would silently produce U+0000
+     # for anything that is not a hexadecimal number.
+     decoded = MATHML_ENTITIES[self].gsub(/&#x([0-9A-Fa-f]+);/) { [$1.hex].pack('U') }
+     self.replace decoded
+   else
+     self.replace "&" + self + ";"
+   end
+ end
+
+
+end
diff --git a/lib/wiki_content.rb b/lib/wiki_content.rb
index 75a846e5..521e5b4f 100644
--- a/lib/wiki_content.rb
+++ b/lib/wiki_content.rb
@@ -7,8 +7,6 @@ require_dependency 'chunks/literal'
require 'chunks/nowiki'
require 'sanitize'
-include Sanitize
-
# Wiki content is just a string that can process itself with a chain of
# actions. The actions can modify wiki content so that certain parts of
# it are protected from being rendered by later actions.
@@ -116,6 +114,7 @@ end
class WikiContent < String
include ChunkManager
+ include Sanitize
DEFAULT_OPTS = {
:active_chunks => ACTIVE_CHUNKS,
diff --git a/test/sanitizer.dat b/test/sanitizer.dat
new file mode 100644
index 00000000..ec781cb9
--- /dev/null
+++ b/test/sanitizer.dat
@@ -0,0 +1,475 @@
+[
+ {
+ "name": "IE_Comments",
+ "input": "",
+ "output": "",
+ "xhtml": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->"
+ },
+
+ {
+ "name": "IE_Comments_2",
+ "input": "",
+ "output": "<script>alert('XSS');</script>",
+ "xhtml": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "allow_colons_in_path_component",
+ "input": "foo ",
+ "output": "foo "
+ },
+
+ {
+ "name": "background_attribute",
+ "input": "
",
+ "output": "
",
+ "xhtml": "
",
+ "rexml": "
"
+ },
+
+ {
+ "name": "bgsound",
+ "input": " ",
+ "output": "<bgsound src=\"javascript:alert('XSS');\"/>",
+ "xhtml": "<bgsound src='javascript:alert('XSS');'/>",
+ "rexml": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>"
+ },
+
+ {
+ "name": "div_background_image_unicode_encoded",
+ "input": "foo
",
+ "output": "foo
"
+ },
+
+ {
+ "name": "div_expression",
+ "input": "foo
",
+ "output": "foo
"
+ },
+
+ {
+ "name": "double_open_angle_brackets",
+ "input": " ",
+ "xhtml": " <",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "double_open_angle_brackets_2",
+ "input": "",
+ "output": " ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "img_dynsrc_lowsrc",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "img_vbscript",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "input_image",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "link_stylesheets",
+ "input": " ",
+ "output": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"/>",
+ "xhtml": "<link href='javascript:alert('XSS');' rel='stylesheet'/>",
+ "rexml": "<link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/>"
+ },
+
+ {
+ "name": "link_stylesheets_2",
+ "input": " ",
+ "output": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"/>",
+ "xhtml": "<link href='http://ha.ckers.org/xss.css' rel='stylesheet'/>",
+ "rexml": "<link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/>"
+ },
+
+ {
+ "name": "list_style_image",
+ "input": "foo ",
+ "output": "foo "
+ },
+
+ {
+ "name": "no_closing_script_tags",
+ "input": "",
+ "output": "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>",
+ "xhtml": "<script/></script>",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "non_alpha_non_digit_2",
+ "input": "foo ",
+ "output": "foo ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "non_alpha_non_digit_3",
+ "input": " ",
+ "output": " ",
+ "xhtml": " ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "non_alpha_non_digit_II",
+ "input": "foo ",
+ "output": "foo ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "non_alpha_non_digit_III",
+ "input": "foo ",
+ "output": "foo ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "platypus",
+ "input": "never trust your upstream platypus ",
+ "output": "never trust your upstream platypus "
+ },
+
+ {
+ "name": "protocol_resolution_in_script_tag",
+ "input": "",
+ "output": "<script src=\"//ha.ckers.org/.j\"></script>",
+ "xhtml": "<script src/></script>",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_allow_anchors",
+ "input": " ",
+ "output": "<script>baz</script> ",
+ "xhtml": "<script>baz</script> "
+ },
+
+ {
+ "name": "should_allow_image_alt_attribute",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_allow_image_height_attribute",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_allow_image_src_attribute",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_allow_image_width_attribute",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_handle_blank_text",
+ "input": "",
+ "output": ""
+ },
+
+ {
+ "name": "should_handle_malformed_image_tags",
+ "input": " \">",
+ "output": " <script>alert(\"XSS\")</script>\">",
+ "xhtml": " ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_handle_non_html",
+ "input": "abc",
+ "output": "abc"
+ },
+
+ {
+ "name": "should_not_fall_for_ridiculous_hack",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_0",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_1",
+ "input": " ",
+ "output": " ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_10",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_11",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_12",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_13",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_14",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_2",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_3",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_4",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_5",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_6",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_7",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_8",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_not_fall_for_xss_image_hack_9",
+ "input": " ",
+ "output": " ",
+ "rexml": " "
+ },
+
+ {
+ "name": "should_sanitize_half_open_scripts",
+ "input": " ",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_sanitize_invalid_script_tag",
+ "input": "",
+ "output": "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>",
+ "xhtml": "<script/></script>",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_sanitize_script_tag_with_multiple_open_brackets",
+ "input": "<",
+ "output": "<<script>alert(\"XSS\");//<</script>",
+ "xhtml": "<<script>alert("XSS");//<</script>",
+ "rexml": "Ill-formed XHTML!"
+ },
+
+ {
+ "name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
+ "input": " . Do not touch!',
+ :plain_text => '[[test]]&shebang *foo*'
+ )
+ end
+
end
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index 7ef77eb0..5a51540f 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -49,7 +49,7 @@ class PageRendererTest < Test::Unit::TestCase
" in kinda " +
"That Way in " +
"His Way? " +
- %{though My Way OverThere \xE2\x80\x93 see } +
+ %{though My Way OverThere \342\200\223 see } +
"Smart Engine in that " +
"Smart Engine GUI" +
"?
",
@@ -86,7 +86,7 @@ class PageRendererTest < Test::Unit::TestCase
%{(x ) } +
%{ } +
%{\\sin(x) \\begin{svg} \\end{svg}
},
- "$$\\sin(x) \\begin{svg} \\end{svg}$$")
+ "$$\\sin(x) \\begin{svg} \\end{svg}$$")
code_block = [
'This is a code block:',
@@ -275,6 +275,13 @@ class PageRendererTest < Test::Unit::TestCase
'or http://www.thislink.com .')
end
+ def test_malformed_nowiki # NOTE(review): the markup inside both string literals appears to be missing from this hunk -- confirm against the repository copy
+ assert_markup_parsed_as(
+ '
',
+ ' ')
+ end
+
+
def test_multiline_nowiki_tag
assert_markup_parsed_as(
"Do not mark \n up [[this text]] \nand http://this.url.com but markup " +
@@ -283,6 +290,13 @@ class PageRendererTest < Test::Unit::TestCase
"and http://this.url.com but markup [[this]]")
end
+ def test_markdown_nowiki_tag # passes text containing *emphasis* markers and a URL to assert_markup_parsed_as; NOTE(review): the surrounding markup in the literals looks lost -- confirm against the repository copy
+ assert_markup_parsed_as(
+ '
Do not mark up *this text* or http://www.thislink.com.
',
+ 'Do not mark up *this text* ' +
+ 'or http://www.thislink.com .')
+ end
+
def test_sanitize_nowiki_tag
assert_markup_parsed_as(
'[[test]]&shebang <script>alert("xss!");</script> *foo*
',
@@ -311,7 +325,7 @@ class PageRendererTest < Test::Unit::TestCase
" in kinda " +
"That Way in " +
"His Way though " +
- %{My Way OverThere \xE2\x80\x93 see } +
+ %{My Way OverThere \342\200\223 see } +
"Smart Engine in that " +
"Smart Engine GUI ",
test_renderer(@revision).display_content_for_export
diff --git a/test/unit/sanitize_test.rb b/test/unit/sanitize_test.rb
index ced2276f..37260516 100644
--- a/test/unit/sanitize_test.rb
+++ b/test/unit/sanitize_test.rb
@@ -5,6 +5,8 @@ require 'sanitize'
class SanitizeTest < Test::Unit::TestCase
+ include Sanitize
+
def setup
end
diff --git a/test/unit/sanitizer_test.rb b/test/unit/sanitizer_test.rb
new file mode 100644
index 00000000..b8c8c83f
--- /dev/null
+++ b/test/unit/sanitizer_test.rb
@@ -0,0 +1,142 @@
+#!/usr/bin/env ruby
+
+require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
+require 'sanitizer'
+require 'json'
+require 'stringsupport'
+
+# Unit tests for the Sanitizer mixin. Most test methods are generated at
+# class-definition time: one per allowed element, attribute, protocol and
+# SVG local-href element, plus one per record in test/sanitizer.dat.
+class SanitizerTest < Test::Unit::TestCase
+
+ include Sanitizer
+
+ def setup
+
+ end
+
+ # Converts named entities in +stream+ to UTF-8 (String#to_utf8) and passes
+ # the result to sanitize_xhtml.
+ def do_sanitize_xhtml stream
+ sanitize_xhtml(stream.to_utf8)
+ end
+
+ # Asserts that sanitizing +input+ gives +xhtmloutput+. The +htmloutput+ and
+ # +rexmloutput+ arguments are accepted but not compared against anything.
+ def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+ assert_equal xhtmloutput, do_sanitize_xhtml(input)
+ end
+
+ # One test_should_allow_<tag>_tag method per allowed element.
+ Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+ define_method "test_should_allow_#{tag_name}_tag" do
+ input = "<#{tag_name} title='1'>foo bar baz#{tag_name}>"
+ htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz#{tag_name.downcase}>"
+ xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz#{tag_name}>"
+ rexmloutput = xhtmloutput
+
+ if VOID_ELEMENTS.include?(tag_name)
+ htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
+ xhtmloutput = htmloutput
+ htmloutput += ' ' if tag_name == 'br'
+ rexmloutput = "<#{tag_name} title='1' />"
+ end
+ check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+ end
+ end
+
+ # One test_should_forbid_<TAG>_tag method per allowed element, using the
+ # upper-cased element name.
+ Sanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+ define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+ input = "<#{tag_name.upcase} title='1'>foo bar baz#{tag_name.upcase}>"
+ output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+ xhtmloutput = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+ check_sanitization(input, output, xhtmloutput, output)
+ end
+ end
+
+ # One test_should_allow_<attr>_attribute method per allowed attribute;
+ # 'style' is skipped here.
+ Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ next if attribute_name == 'style'
+ define_method "test_should_allow_#{attribute_name}_attribute" do
+ input = "foo bar baz
"
+ output = "foo <bad>bar</bad> baz
"
+ htmloutput = "foo <bad>bar</bad> baz
"
+ check_sanitization(input, htmloutput, output, output)
+ end
+ end
+
+ # One test_should_forbid_<ATTR>_attribute method per allowed attribute,
+ # named after the upper-cased attribute.
+ Sanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+ input = "foo bar baz
"
+ output = "foo <bad>bar</bad> baz
"
+ check_sanitization(input, output, output, output)
+ end
+ end
+
+ # One test_should_allow_<protocol>_uris method per allowed protocol.
+ Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+ define_method "test_should_allow_#{protocol}_uris" do
+ input = %(foo )
+ output = "foo "
+ check_sanitization(input, output, output, output)
+ end
+ end
+
+ # One test_should_allow_uppercase_<protocol>_uris method per allowed protocol.
+ Sanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+ define_method "test_should_allow_uppercase_#{protocol}_uris" do
+ input = %(foo )
+ output = "foo "
+ check_sanitization(input, output, output, output)
+ end
+ end
+
+ # For every SVG_ALLOW_LOCAL_HREF element that is also an allowed element:
+ # a local xlink:href ("#foo", with or without a leading newline) must
+ # survive, and an http:// one must be removed.
+ Sanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+ next unless Sanitizer::ALLOWED_ELEMENTS.include?(tag_name)
+ define_method "test_#{tag_name}_should_allow_local_href" do
+ input = %(<#{tag_name} xlink:href="#foo"/>)
+ output = "<#{tag_name.downcase} xlink:href='#foo'/>"
+ xhtmloutput = "<#{tag_name} xlink:href='#foo'/>"
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
+ end
+
+ define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
+ input = %(<#{tag_name} xlink:href="\n#foo"/>)
+ output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
+ xhtmloutput = "<#{tag_name} xlink:href='\n#foo'/>"
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
+ end
+
+ define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
+ input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
+ output = "<#{tag_name.downcase}/>"
+ xhtmloutput = "<#{tag_name}/>"
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
+ end
+
+ define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
+ input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
+ output = "<#{tag_name.downcase}/>"
+ xhtmloutput = "<#{tag_name}/>"
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
+ end
+ end
+
+ # Expected values are written as octal-escaped four-byte UTF-8 sequences.
+ def test_should_handle_astral_plane_characters
+ input = "𝒵 𝔸
"
+ output = "\360\235\222\265 \360\235\224\270
"
+ check_sanitization(input, output, output, output)
+
+ input = "\360\235\224\270 a
"
+ output = "\360\235\224\270 a
"
+ check_sanitization(input, output, output, output)
+ end
+
+# This affects only NS4. Is it worth fixing?
+# def test_javascript_includes
+# input = %(foo
)
+# output = "foo
"
+# check_sanitization(input, output, output, output)
+# end
+
+ # Data-driven cases: one test method per record in test/sanitizer.dat.
+ # 'xhtml' and 'rexml' fall back to 'output' when a record omits them.
+ # File.read is used so the fixture's file handle is closed straight away
+ # instead of being left open until garbage collection.
+ JSON::parse(File.read(File.expand_path(File.join(File.dirname(__FILE__), '/../sanitizer.dat')))).each do |test|
+ define_method "test_#{test['name']}" do
+ check_sanitization(
+ test['input'],
+ test['output'],
+ test['xhtml'] || test['output'],
+ test['rexml'] || test['output']
+ )
+ end
+ end
+end
diff --git a/vendor/plugins/maruku/lib/maruku/ext/math/mathml_engines/itex2mml.rb b/vendor/plugins/maruku/lib/maruku/ext/math/mathml_engines/itex2mml.rb
index bf79fb81..b884863c 100644
--- a/vendor/plugins/maruku/lib/maruku/ext/math/mathml_engines/itex2mml.rb
+++ b/vendor/plugins/maruku/lib/maruku/ext/math/mathml_engines/itex2mml.rb
@@ -4,7 +4,7 @@ module MaRuKu; module Out; module HTML
def convert_to_mathml_itex2mml(kind, tex)
begin
if not $itex2mml_parser
- require 'sanitize'
+ require 'stringsupport'
require 'itextomml'
$itex2mml_parser = Itex2MML::Parser.new
end