Sanitizer API documentation now online
See: http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/
This commit is contained in:
parent
f818238dd3
commit
a68d1aa8f3
|
@ -1,15 +1,29 @@
|
||||||
module Sanitize
|
# == Introduction
|
||||||
|
|
||||||
# This module provides sanitization of XHTML+MathML+SVG
|
|
||||||
# and of inline style attributes.
|
|
||||||
#
|
#
|
||||||
# Uses the HTML5lib parser, so that the parsing behaviour should
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
|
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
|
||||||
|
#
|
||||||
|
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
|
||||||
# resemble that of browsers.
|
# resemble that of browsers.
|
||||||
#
|
#
|
||||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||||
# sanitize_html() is a case-insensitive sanitizer, suitable for HTML
|
# sanitize_html() is a case-insensitive sanitizer, suitable for HTML
|
||||||
# sanitize_rexml() sanitized a REXML tree, returning a string
|
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
||||||
|
#
|
||||||
|
# == Files
|
||||||
|
#
|
||||||
|
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
|
||||||
|
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
|
||||||
|
#
|
||||||
|
# == Author
|
||||||
|
#
|
||||||
|
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
|
||||||
|
#
|
||||||
|
# == License
|
||||||
|
#
|
||||||
|
# Ruby License
|
||||||
|
|
||||||
|
module Sanitize
|
||||||
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5lib/html5parser'
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5lib/liberalxmlparser'
|
||||||
|
@ -27,6 +41,7 @@ module Sanitize
|
||||||
#
|
#
|
||||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||||
|
#
|
||||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||||
# (REXML trees are always utf-8 encoded.)
|
# (REXML trees are always utf-8 encoded.)
|
||||||
def sanitize_xhtml(html, options = {})
|
def sanitize_xhtml(html, options = {})
|
||||||
|
@ -50,11 +65,12 @@ module Sanitize
|
||||||
# Sanitize a string, parsed using HTML parsing rules.
|
# Sanitize a string, parsed using HTML parsing rules.
|
||||||
#
|
#
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
# sanitize_html(string) -> string
|
# sanitize_html( string ) -> string
|
||||||
# sanitize_html(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
|
# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
|
||||||
#
|
#
|
||||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||||
|
#
|
||||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||||
# (REXML trees are always utf-8 encoded.)
|
# (REXML trees are always utf-8 encoded.)
|
||||||
def sanitize_html(html, options = {})
|
def sanitize_html(html, options = {})
|
||||||
|
@ -116,6 +132,7 @@ class String
|
||||||
)*$/x;
|
)*$/x;
|
||||||
end
|
end
|
||||||
|
|
||||||
|
#:stopdoc:
|
||||||
MATHML_ENTITIES = {
|
MATHML_ENTITIES = {
|
||||||
'Alpha' => 'Α',
|
'Alpha' => 'Α',
|
||||||
'Beta' => 'Β',
|
'Beta' => 'Β',
|
||||||
|
@ -2238,6 +2255,7 @@ class String
|
||||||
'wr' => '≀',
|
'wr' => '≀',
|
||||||
'zeetrf' => 'ℨ'
|
'zeetrf' => 'ℨ'
|
||||||
}
|
}
|
||||||
|
#:startdoc:
|
||||||
|
|
||||||
# Converts XHTML+MathML named entities to Numeric Character References
|
# Converts XHTML+MathML named entities to Numeric Character References
|
||||||
#
|
#
|
||||||
|
@ -2260,7 +2278,7 @@ class String
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
||||||
def convert_to_ncr
|
def convert_to_ncr #:nodoc:
|
||||||
self =~ /^&([a-zA-Z0-9]+);$/
|
self =~ /^&([a-zA-Z0-9]+);$/
|
||||||
name = $1
|
name = $1
|
||||||
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
|
||||||
|
@ -2269,13 +2287,13 @@ class String
|
||||||
end
|
end
|
||||||
|
|
||||||
require 'rexml/element'
|
require 'rexml/element'
|
||||||
module REXML
|
module REXML #:nodoc:
|
||||||
class Element
|
class Element
|
||||||
|
|
||||||
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
|
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
|
||||||
#
|
#
|
||||||
# :call-seq:
|
# :call-seq:
|
||||||
# elt.to_ncr -> REXML::Element
|
# tree.to_ncr -> REXML::Element
|
||||||
#
|
#
|
||||||
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||||
# access the resulting REXML document.
|
# access the resulting REXML document.
|
||||||
|
|
Loading…
Reference in a new issue