From d14db51d9e7aa762102841e320cdb2c6156419c5 Mon Sep 17 00:00:00 2001
From: Jacques Distler
Date: Fri, 9 Oct 2009 23:18:17 -0500
Subject: [PATCH] More Sanitizer Refactoring
Make the Sanitizer more efficient.
Also, update some unit tests.
---
lib/sanitizer.rb | 40 +++++++++++++++++----------------
test/unit/page_renderer_test.rb | 6 ++---
2 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/lib/sanitizer.rb b/lib/sanitizer.rb
index bcfb143a..63eeb625 100644
--- a/lib/sanitizer.rb
+++ b/lib/sanitizer.rb
@@ -123,31 +123,33 @@ module Sanitizer
#
# xhtml_sanitize('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
- # xhtml_sanitize_xhtml('<a href="javascript: sucker();">Click here for $100</a>')
+ # xhtml_sanitize('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def xhtml_sanitize(html)
- if html.index("<")
- tokenizer = HTML::Tokenizer.new(html.to_utf8)
- new_text = ""
+ return html unless sanitizeable?(html)
+ tokenizer = HTML::Tokenizer.new(html.to_utf8)
+ results = []
- while token = tokenizer.next
- node = XHTML::Node.parse(nil, 0, 0, token, false)
- new_text << case node.tag?
- when true
- if ALLOWED_ELEMENTS.include?(node.name)
- process_attributes_for(node)
- node.to_s
- else
- node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
- end
+ while token = tokenizer.next
+ node = XHTML::Node.parse(nil, 0, 0, token, false)
+ results << case node.tag?
+ when true
+ if ALLOWED_ELEMENTS.include?(node.name)
+ process_attributes_for(node)
+ node.to_s
else
- node.to_s.unescapeHTML.escapeHTML
- end
+ node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
+ end
+ else
+ node.to_s.unescapeHTML.escapeHTML
end
-
- html = new_text
end
- html
+
+ results.join
+ end
+
+ def sanitizeable?(text)
+ !(text.nil? || text.empty? || !text.index("<"))
end
protected
diff --git a/test/unit/page_renderer_test.rb b/test/unit/page_renderer_test.rb
index d50f9322..8f2ba29a 100644
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@@ -373,10 +373,10 @@ END_THM
assert_markup_parsed_as(
"should we go " +
"That Way or
\n" +
- "
",
+ "",
"should we go ThatWay or \n\\[ThisWay\\]\n")
assert_markup_parsed_as(
@@ -393,7 +393,7 @@ END_THM
"That Way or
\n" +
"ThisWay$" +
- "100 ThatWay" +
+ "100ThatWay" +
"ThisWay \\$100 " +
"ThatWay
",
"should we go ThatWay or \n$$ThisWay \\$100 ThatWay $$\n")