Sanitizer API documentation now online

See:
   http://golem.ph.utexas.edu/~distler/code/rdoc/sanitize/
This commit is contained in:
Jacques Distler 2007-06-08 23:51:30 -05:00
parent f818238dd3
commit a68d1aa8f3

View file

@ -1,15 +1,29 @@
module Sanitize # == Introduction
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
# #
# Uses the HTML5lib parser, so that the parsing behaviour should # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers. # resemble that of browsers.
# #
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML # sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML # sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitized a REXML tree, returning a string # sanitize_rexml() sanitizes a REXML tree, returning a string
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# == License
#
# Ruby License
module Sanitize
require 'html5lib/html5parser' require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser' require 'html5lib/liberalxmlparser'
@ -27,6 +41,7 @@ module Sanitize
# #
# Unless otherwise specified, the string is assumed to be utf-8 encoded. # Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree. # By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.) # (REXML trees are always utf-8 encoded.)
def sanitize_xhtml(html, options = {}) def sanitize_xhtml(html, options = {})
@ -50,11 +65,12 @@ module Sanitize
# Sanitize a string, parsed using HTML parsing rules. # Sanitize a string, parsed using HTML parsing rules.
# #
# :call-seq: # :call-seq:
# sanitize_html(string) -> string # sanitize_html( string ) -> string
# sanitize_html(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document # sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
# #
# Unless otherwise specified, the string is assumed to be utf-8 encoded. # Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree. # By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.) # (REXML trees are always utf-8 encoded.)
def sanitize_html(html, options = {}) def sanitize_html(html, options = {})
@ -116,6 +132,7 @@ class String
)*$/x; )*$/x;
end end
#:stopdoc:
MATHML_ENTITIES = { MATHML_ENTITIES = {
'Alpha' => 'Α', 'Alpha' => 'Α',
'Beta' => 'Β', 'Beta' => 'Β',
@ -2238,6 +2255,7 @@ class String
'wr' => '≀', 'wr' => '≀',
'zeetrf' => 'ℨ' 'zeetrf' => 'ℨ'
} }
#:startdoc:
# Converts XHTML+MathML named entities to Numeric Character References # Converts XHTML+MathML named entities to Numeric Character References
# #
@ -2260,7 +2278,7 @@ class String
protected protected
def convert_to_ncr def convert_to_ncr #:nodoc:
self =~ /^&([a-zA-Z0-9]+);$/ self =~ /^&([a-zA-Z0-9]+);$/
name = $1 name = $1
return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";" return MATHML_ENTITIES.has_key?(name) ? MATHML_ENTITIES[name] : "&" + name + ";"
@ -2269,13 +2287,13 @@ class String
end end
require 'rexml/element' require 'rexml/element'
module REXML module REXML #:nodoc:
class Element class Element
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
# #
# :call-seq: # :call-seq:
# elt.to_ncr -> REXML::Element # tree.to_ncr -> REXML::Element
# #
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
# access the resulting REXML document. # access the resulting REXML document.