Ruby 1.9 Compatibility
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
This commit is contained in:
parent
79c8572053
commit
a6429f8c22
142 changed files with 519 additions and 843 deletions
|
@ -30,6 +30,7 @@ module Engines
|
|||
|
||||
class Textile < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
|
||||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
|
@ -39,6 +40,7 @@ module Engines
|
|||
|
||||
class Markdown < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly (without math)
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -56,6 +58,7 @@ module Engines
|
|||
|
||||
class MarkdownMML < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -77,6 +80,7 @@ module Engines
|
|||
|
||||
class MarkdownPNG < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -108,6 +112,7 @@ module Engines
|
|||
|
||||
class Mixed < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
|
||||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
|
@ -117,6 +122,7 @@ module Engines
|
|||
|
||||
class RDoc < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
html = RDocSupport::RDocFormatter.new(@content).to_html
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
require 'chunks/chunk'
|
||||
require 'sanitize'
|
||||
require 'sanitizer'
|
||||
|
||||
# This chunks allows certain parts of a wiki page to be hidden from the
|
||||
# rest of the rendering pipeline. It should be run at the beginning
|
||||
|
@ -17,7 +17,7 @@ require 'sanitize'
|
|||
|
||||
class NoWiki < Chunk::Abstract
|
||||
|
||||
include Sanitize
|
||||
include Sanitizer
|
||||
|
||||
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
|
||||
def self.pattern() NOWIKI_PATTERN end
|
||||
|
@ -26,7 +26,7 @@ class NoWiki < Chunk::Abstract
|
|||
|
||||
def initialize(match_data, content)
|
||||
super
|
||||
@plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1])
|
||||
@plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
|
||||
end
|
||||
|
||||
end
|
||||
|
|
262
lib/sanitize.rb
262
lib/sanitize.rb
|
@ -1,262 +0,0 @@
|
|||
# == Introduction
|
||||
#
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
|
||||
#
|
||||
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
|
||||
# resemble that of browsers.
|
||||
#
|
||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
||||
# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
|
||||
# by running the output of sanitize_xhtml() through REXML
|
||||
#
|
||||
# == Files
|
||||
#
|
||||
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
|
||||
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
|
||||
#
|
||||
# == Author
|
||||
#
|
||||
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
|
||||
#
|
||||
# == License
|
||||
#
|
||||
# Ruby License
|
||||
|
||||
module Sanitize
|
||||
|
||||
require 'html5/html5parser'
|
||||
require 'html5/liberalxmlparser'
|
||||
require 'html5/treewalkers'
|
||||
require 'html5/treebuilders'
|
||||
require 'html5/serializer'
|
||||
require 'html5/sanitizer'
|
||||
require 'stringsupport.rb'
|
||||
|
||||
include HTML5
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules.
|
||||
#
|
||||
# :call-seq:
|
||||
# sanitize_xhtml(string) -> string
|
||||
# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_xhtml(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
@to_tree = false
|
||||
options.each do |name, value|
|
||||
next unless %w(encoding treebuilder to_tree).include? name.to_s
|
||||
if name.to_s == 'treebuilder'
|
||||
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
|
||||
else
|
||||
instance_variable_set("@#{name}", value)
|
||||
end
|
||||
end
|
||||
if @encoding == 'utf-8'
|
||||
parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
else
|
||||
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||
:lowercase_element_name => false, :lowercase_attr_name => false,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
end
|
||||
return parsed if @to_tree
|
||||
return parsed.to_s
|
||||
end
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
|
||||
# ensure well-formedness.
|
||||
#
|
||||
# :call-seq:
|
||||
# safe_sanitize_xhtml(string) -> string
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def safe_sanitize_xhtml(html, options = {})
|
||||
options[:to_tree] = false
|
||||
sanitized = sanitize_xhtml(html, options)
|
||||
doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
|
||||
sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
|
||||
rescue REXML::ParseException
|
||||
sanitized = sanitized.escapeHTML
|
||||
end
|
||||
|
||||
# Sanitize a string, parsed using HTML parsing rules.
|
||||
#
|
||||
# :call-seq:
|
||||
# sanitize_html( string ) -> string
|
||||
# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_html(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
@to_tree = false
|
||||
options.each do |name, value|
|
||||
next unless %w(encoding treebuilder to_tree).include? name.to_s
|
||||
if name.to_s == 'treebuilder'
|
||||
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
|
||||
else
|
||||
instance_variable_set("@#{name}", value)
|
||||
end
|
||||
end
|
||||
if @encoding == 'utf-8'
|
||||
parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
else
|
||||
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
|
||||
:encoding => @encoding, :tree => @treebuilder })
|
||||
end
|
||||
return parsed if @to_tree
|
||||
return parsed.to_s
|
||||
end
|
||||
|
||||
# Sanitize a REXML tree. The output is a string.
|
||||
#
|
||||
# :call-seq:
|
||||
# sanitize_rexml(tree) -> string
|
||||
#
|
||||
def sanitize_rexml(tree)
|
||||
tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
|
||||
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
|
||||
:space_before_trailing_solidus => true,
|
||||
:inject_meta_charset => false,
|
||||
:sanitize => true})
|
||||
end
|
||||
end
|
||||
|
||||
require 'rexml/element'
|
||||
module REXML #:nodoc:
|
||||
class Element
|
||||
|
||||
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
|
||||
#
|
||||
# :call-seq:
|
||||
# tree.to_ncr -> REXML::Element
|
||||
#
|
||||
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||
# access the resulting REXML document.
|
||||
#
|
||||
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
|
||||
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
|
||||
# use String.to_ncr instead.
|
||||
#
|
||||
def to_ncr
|
||||
self.each_element { |el|
|
||||
el.texts.each_index {|i|
|
||||
el.texts[i].value = el.texts[i].to_s.to_ncr
|
||||
}
|
||||
el.attributes.each { |name,val|
|
||||
el.attributes[name] = val.to_ncr
|
||||
}
|
||||
el.to_ncr if el.has_elements?
|
||||
}
|
||||
return self
|
||||
end
|
||||
|
||||
# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
|
||||
#
|
||||
# :call-seq:
|
||||
# tree.to_utf8 -> REXML::Element
|
||||
#
|
||||
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
|
||||
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
|
||||
# use String.to_utf8 instead.
|
||||
#
|
||||
def to_utf8
|
||||
self.each_element { |el|
|
||||
el.texts.each_index {|i|
|
||||
el.texts[i].value = el.texts[i].to_s.to_utf8
|
||||
}
|
||||
el.attributes.each { |name,val|
|
||||
el.attributes[name] = val.to_utf8
|
||||
}
|
||||
el.to_utf8 if el.has_elements?
|
||||
}
|
||||
return self
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
module HTML5 #:nodoc: all
|
||||
module TreeWalkers
|
||||
|
||||
private
|
||||
|
||||
class << self
|
||||
def [](name)
|
||||
case name.to_s.downcase
|
||||
when 'rexml'
|
||||
require 'html5/treewalkers/rexml'
|
||||
REXML::TreeWalker
|
||||
when 'rexml2'
|
||||
REXML2::TreeWalker
|
||||
else
|
||||
raise "Unknown TreeWalker #{name}"
|
||||
end
|
||||
end
|
||||
|
||||
alias :get_tree_walker :[]
|
||||
end
|
||||
|
||||
module REXML2
|
||||
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||
|
||||
private
|
||||
|
||||
def node_details(node)
|
||||
case node
|
||||
when ::REXML::Document
|
||||
[:DOCUMENT]
|
||||
when ::REXML::Element
|
||||
if !node.name
|
||||
[:DOCUMENT_FRAGMENT]
|
||||
else
|
||||
[:ELEMENT, node.name,
|
||||
node.attributes.map {|name,value| [name,value.to_utf8]},
|
||||
node.has_elements? || node.has_text?]
|
||||
end
|
||||
when ::REXML::Text
|
||||
[:TEXT, node.value.to_utf8]
|
||||
when ::REXML::Comment
|
||||
[:COMMENT, node.string]
|
||||
when ::REXML::DocType
|
||||
[:DOCTYPE, node.name, node.public, node.system]
|
||||
when ::REXML::XMLDecl
|
||||
[nil]
|
||||
else
|
||||
[:UNKNOWN, node.class.inspect]
|
||||
end
|
||||
end
|
||||
|
||||
def first_child(node)
|
||||
node.children.first
|
||||
end
|
||||
|
||||
def next_sibling(node)
|
||||
node.next_sibling
|
||||
end
|
||||
|
||||
def parent(node)
|
||||
node.parent
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -169,7 +169,7 @@ module Sanitizer
|
|||
node.attributes.delete attr; next
|
||||
end
|
||||
if ATTR_VAL_IS_URI.include?(attr)
|
||||
val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
||||
val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
||||
node.attributes.delete attr; next
|
||||
end
|
||||
|
@ -206,4 +206,23 @@ module Sanitizer
|
|||
|
||||
clean.join(' ')
|
||||
end
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
|
||||
# ensure well-formedness.
|
||||
#
|
||||
# :call-seq:
|
||||
# safe_sanitize_xhtml(string) -> string
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def safe_xhtml_sanitize(html, options = {})
|
||||
sanitized = xhtml_sanitize(html.purify)
|
||||
doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
|
||||
sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
|
||||
rescue REXML::ParseException
|
||||
sanitized = sanitized.escapeHTML
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -2,6 +2,26 @@
|
|||
|
||||
class String
|
||||
|
||||
# A method to allow byte-oriented operations in both Ruby 1.8 and Ruby 1.9
|
||||
#
|
||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT"
|
||||
#--
|
||||
def as_bytes
|
||||
force_encoding("ASCII-8BIT") if self.respond_to?(:force_encoding)
|
||||
self
|
||||
end
|
||||
|
||||
#++
|
||||
# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
|
||||
#
|
||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8"
|
||||
#--
|
||||
def as_utf8
|
||||
force_encoding("UTF-8") if self.respond_to?(:force_encoding)
|
||||
self
|
||||
end
|
||||
|
||||
#++
|
||||
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
|
||||
#
|
||||
# :call-seq:
|
||||
|
@ -11,12 +31,16 @@ class String
|
|||
#--
|
||||
def purify
|
||||
text = check_ncrs
|
||||
text.split(//u).grep(UTF8_REGEX).join
|
||||
if text.respond_to?(:encoding)
|
||||
text.split(//).collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8
|
||||
else
|
||||
text.split(//u).grep(UTF8_REGEX).join
|
||||
end
|
||||
end
|
||||
|
||||
def check_ncrs
|
||||
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') =~ UTF8_REGEX ? m : '' }
|
||||
text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*') =~ UTF8_REGEX ? m : '' }
|
||||
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
|
||||
text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
|
||||
end
|
||||
|
||||
UTF8_REGEX = /\A(
|
||||
|
@ -42,7 +66,7 @@ class String
|
|||
#--
|
||||
def is_utf8?
|
||||
#expand NCRs to utf-8
|
||||
text = self.check_ncrs
|
||||
text = self.check_ncrs.as_bytes
|
||||
|
||||
# You might think this is faster, but it isn't
|
||||
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue