From 8badd0766a68aa9eec69e8784559a0f94024b130 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Fri, 8 Jun 2007 01:23:09 -0500 Subject: [PATCH] Enhancements to sanitize.rb Options, options, ... options. --- lib/sanitize.rb | 59 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 34d52e8c..a0221455 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -13,22 +13,71 @@ module Sanitize require 'html5lib/html5parser' require 'html5lib/liberalxmlparser' - require 'html5lib/treewalkers' + require 'html5lib/treebuilders' require 'html5lib/serializer' require 'string_utils' require 'html5lib/sanitizer' include HTML5lib - def sanitize_xhtml(html) - XHTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer, :encoding=>'utf-8' }).to_s +# Sanitize a string, parsed using XHTML parsing rules. +# +# :call-seq: +# sanitize_xhtml(string) -> string +# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# By default, the output is a string. But, optionally, you can return a REXML tree. + def sanitize_xhtml(html, options = {}) + @encoding = 'utf-8' + @treebuilder = TreeBuilders::REXML::TreeBuilder + @to_tree = false + options.each do |name, value| + next unless %w(encoding treebuilder to_tree).include? name.to_s + if name.to_s == 'treebuilder' + @treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value) + else + instance_variable_set("@#{name}", value) + end + end + parsed = XHTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + return parsed if @to_tree + return parsed.to_s end - def sanitize_html(html) - HTMLParser.parseFragment(html, {:tokenizer => HTMLSanitizer, :encoding=>'utf-8' }).to_s +# Sanitize a string, parsed using HTML parsing rules. +# +# :call-seq: +# sanitize_html(string) -> string +# sanitize_html(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document +# +# Unless otherwise specified, the string is assumed to be utf-8 encoded. +# By default, the output is a string. But, optionally, you can return a REXML tree. + def sanitize_html(html, options = {}) + @encoding = 'utf-8' + @treebuilder = TreeBuilders::REXML::TreeBuilder + @to_tree = false + options.each do |name, value| + next unless %w(encoding treebuilder to_tree).include? name.to_s + if name.to_s == 'treebuilder' + @treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value) + else + instance_variable_set("@#{name}", value) + end + end + parsed = HTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer, + :encoding => @encoding, :tree => @treebuilder }) + return parsed if @to_tree + return parsed.to_s end +# Sanitize a REXML tree. The output is a string. +# +# :call-seq: +# sanitize_rexml(tree) -> string +# def sanitize_rexml(tree) tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr) HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',