2007-02-22 08:06:53 +01:00
|
|
|
module Sanitize
|
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
# This module provides sanitization of XHTML+MathML+SVG
|
2007-02-22 08:06:53 +01:00
|
|
|
# and of inline style attributes.
|
|
|
|
#
|
2007-05-30 17:45:52 +02:00
|
|
|
# Uses the HTML5lib parser, so that the parsing behaviour should
|
|
|
|
# resemble that of browsers.
|
|
|
|
#
|
|
|
|
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
|
|
|
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
2007-06-05 23:34:49 +02:00
|
|
|
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
2007-02-22 08:06:53 +01:00
|
|
|
|
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
require 'html5lib/html5parser'
|
|
|
|
require 'html5lib/liberalxmlparser'
|
2007-06-05 23:34:49 +02:00
|
|
|
|
|
|
|
require 'html5lib/treewalkers'
|
|
|
|
require 'html5lib/serializer'
|
|
|
|
require 'string_utils'
|
|
|
|
require 'html5lib/sanitizer'
|
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
# Mix HTML5lib into this module. The methods below refer to XHTMLParser,
# HTMLParser, HTMLSanitizer, TreeWalkers and HTMLSerializer without a
# namespace prefix; presumably those constants are resolved through this
# include -- confirm against the html5lib library.
include HTML5lib
|
2007-02-22 08:06:53 +01:00
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
# Sanitizes a fragment of XHTML (the case-sensitive variant).
#
# html - the markup to sanitize. It must respond to +to_ncr+, which is
#        applied before parsing (presumably provided by 'string_utils'
#        and converting to numeric character references -- confirm).
#
# The input is parsed as a fragment by XHTMLParser, with HTMLSanitizer
# installed as the tokenizer, and the parsed fragment is returned in its
# +to_s+ form.
def sanitize_xhtml(html)
  parsed_fragment = XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer)
  parsed_fragment.to_s
end
|
2007-02-22 08:06:53 +01:00
|
|
|
|
2007-05-30 17:45:52 +02:00
|
|
|
# Sanitizes a fragment of HTML (the case-insensitive variant).
#
# html - the markup to sanitize; it is passed to the parser unchanged.
#
# The input is parsed as a fragment by HTMLParser, with HTMLSanitizer
# installed as the tokenizer, and the parsed fragment is returned in its
# +to_s+ form.
def sanitize_html(html)
  parsed_fragment = HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer)
  parsed_fragment.to_s
end
|
2007-02-22 08:06:53 +01:00
|
|
|
|
2007-06-05 23:34:49 +02:00
|
|
|
# Sanitizes a REXML tree and serializes it, returning the serializer's output.
#
# tree - the REXML tree to sanitize. It must respond to +to_ncr+, which is
#        applied before walking (presumably provided by 'string_utils'
#        -- confirm).
#
# The tree is traversed with the 'rexml' tree walker obtained from
# TreeWalkers, and the resulting token stream is passed to
# HTMLSerializer.serialize with the +:sanitize+ option switched on.
def sanitize_rexml(tree)
  walker_class = TreeWalkers.getTreeWalker('rexml')
  token_stream = walker_class.new(tree.to_ncr)

  # Serializer settings: UTF-8 output, quoted attribute values, boolean
  # attributes written out in full, trailing solidus (preceded by a space),
  # no optional-tag omission, no injected meta charset, sanitizing enabled.
  serializer_options = {
    :encoding                      => 'utf-8',
    :quote_attr_values             => true,
    :minimize_boolean_attributes   => false,
    :use_trailing_solidus          => true,
    :space_before_trailing_solidus => true,
    :omit_optional_tags            => false,
    :inject_meta_charset           => false,
    :sanitize                      => true
  }

  HTMLSerializer.serialize(token_stream, serializer_options)
end
|
2007-05-30 17:45:52 +02:00
|
|
|
end
|