Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer.
Fixed the string-handling to work in both
Ruby 1.8.x and 1.9.2. There are still,
inexplicably, two functional tests that
fail. But the rest seems to work quite well.
This commit is contained in:
Jacques Distler 2009-11-30 16:28:18 -06:00
parent 79c8572053
commit a6429f8c22
142 changed files with 519 additions and 843 deletions

View file

@ -30,6 +30,7 @@ module Engines
class Textile < AbstractEngine
def mask
@content.as_utf8
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -39,6 +40,7 @@ module Engines
class Markdown < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly (without math)
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -56,6 +58,7 @@ module Engines
class MarkdownMML < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -77,6 +80,7 @@ module Engines
class MarkdownPNG < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -108,6 +112,7 @@ module Engines
class Mixed < AbstractEngine
def mask
@content.as_utf8
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -117,6 +122,7 @@ module Engines
class RDoc < AbstractEngine
def mask
@content.as_utf8
html = RDocSupport::RDocFormatter.new(@content).to_html
end
end

View file

@ -1,5 +1,5 @@
require 'chunks/chunk'
require 'sanitize'
require 'sanitizer'
# This chunk allows certain parts of a wiki page to be hidden from the
# rest of the rendering pipeline. It should be run at the beginning
@ -17,7 +17,7 @@ require 'sanitize'
class NoWiki < Chunk::Abstract
include Sanitize
include Sanitizer
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
def self.pattern() NOWIKI_PATTERN end
@ -26,7 +26,7 @@ class NoWiki < Chunk::Abstract
def initialize(match_data, content)
super
@plain_text = @unmask_text = safe_sanitize_xhtml(match_data[1])
@plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
end
end

View file

@ -1,262 +0,0 @@
# == Introduction
#
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers.
#
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitizes a REXML tree, returning a string
# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
# by running the output of sanitize_xhtml() through REXML
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# == License
#
# Ruby License
module Sanitize
require 'html5/html5parser'
require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/treebuilders'
require 'html5/serializer'
require 'html5/sanitizer'
require 'stringsupport.rb'
include HTML5
# Sanitize a string, parsed using XHTML parsing rules.
#
# :call-seq:
# sanitize_xhtml(string) -> string
# sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def sanitize_xhtml(html, options = {})
@encoding = 'utf-8'
@treebuilder = TreeBuilders::REXML::TreeBuilder
@to_tree = false
options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else
instance_variable_set("@#{name}", value)
end
end
if @encoding == 'utf-8'
parsed = XHTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
:lowercase_element_name => false, :lowercase_attr_name => false,
:encoding => @encoding, :tree => @treebuilder })
else
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:lowercase_element_name => false, :lowercase_attr_name => false,
:encoding => @encoding, :tree => @treebuilder })
end
return parsed if @to_tree
return parsed.to_s
end
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
# ensure well-formedness.
#
# :call-seq:
# safe_sanitize_xhtml(string) -> string
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def safe_sanitize_xhtml(html, options = {})
options[:to_tree] = false
sanitized = sanitize_xhtml(html, options)
doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
sanitized = doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
sanitized = sanitized.escapeHTML
end
# Sanitize a string, parsed using HTML parsing rules.
#
# :call-seq:
# sanitize_html( string ) -> string
# sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def sanitize_html(html, options = {})
@encoding = 'utf-8'
@treebuilder = TreeBuilders::REXML::TreeBuilder
@to_tree = false
options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else
instance_variable_set("@#{name}", value)
end
end
if @encoding == 'utf-8'
parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
else
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
end
return parsed if @to_tree
return parsed.to_s
end
# Sanitize a REXML tree. The output is a string.
#
# :call-seq:
# sanitize_rexml(tree) -> string
#
def sanitize_rexml(tree)
tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:space_before_trailing_solidus => true,
:inject_meta_charset => false,
:sanitize => true})
end
end
require 'rexml/element'
module REXML #:nodoc:
class Element
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
#
# :call-seq:
# tree.to_ncr -> REXML::Element
#
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
# access the resulting REXML document.
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_ncr instead.
#
def to_ncr
self.each_element { |el|
el.texts.each_index {|i|
el.texts[i].value = el.texts[i].to_s.to_ncr
}
el.attributes.each { |name,val|
el.attributes[name] = val.to_ncr
}
el.to_ncr if el.has_elements?
}
return self
end
# Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
#
# :call-seq:
# tree.to_utf8 -> REXML::Element
#
# Note that this method needs to traverse the entire tree, converting text nodes and attributes
# for each element. This can be SLOW. It will often be faster to serialize to a string and then
# use String.to_utf8 instead.
#
def to_utf8
self.each_element { |el|
el.texts.each_index {|i|
el.texts[i].value = el.texts[i].to_s.to_utf8
}
el.attributes.each { |name,val|
el.attributes[name] = val.to_utf8
}
el.to_utf8 if el.has_elements?
}
return self
end
end
end
module HTML5 #:nodoc: all
module TreeWalkers
private
class << self
def [](name)
case name.to_s.downcase
when 'rexml'
require 'html5/treewalkers/rexml'
REXML::TreeWalker
when 'rexml2'
REXML2::TreeWalker
else
raise "Unknown TreeWalker #{name}"
end
end
alias :get_tree_walker :[]
end
module REXML2
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
private
def node_details(node)
case node
when ::REXML::Document
[:DOCUMENT]
when ::REXML::Element
if !node.name
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value.to_utf8]},
node.has_elements? || node.has_text?]
end
when ::REXML::Text
[:TEXT, node.value.to_utf8]
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name, node.public, node.system]
when ::REXML::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_sibling
end
def parent(node)
node.parent
end
end
end
end
end

View file

@ -169,7 +169,7 @@ module Sanitizer
node.attributes.delete attr; next
end
if ATTR_VAL_IS_URI.include?(attr)
val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
node.attributes.delete attr; next
end
@ -206,4 +206,23 @@ module Sanitizer
clean.join(' ')
end
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
# ensure well-formedness.
#
# :call-seq:
#    safe_xhtml_sanitize(string) -> string
#
# The input is first run through +purify+ and then through +xhtml_sanitize+.
# The sanitized markup is wrapped in an XHTML <div> and parsed with REXML:
#
# * if it parses, the re-serialized document is returned with the wrapper
#   <div> stripped off again;
# * if REXML raises a ParseException (i.e. the sanitized markup is not
#   well-formed), the sanitized text is returned HTML-escaped instead.
#
# +options+ is kept in the signature but is not read by this method.
def safe_xhtml_sanitize(html, options = {})
  sanitized = xhtml_sanitize(html.purify)
  doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
  # The pattern is anchored at both ends, so a single substitution suffices.
  doc.to_s.sub(%r{\A<div xmlns='http://www\.w3\.org/1999/xhtml'>(.*)</div>\Z}m, '\1')
rescue REXML::ParseException
  # Only REXML::Document.new / #to_s can raise here, so +sanitized+ is set.
  sanitized.escapeHTML
end
end

View file

@ -2,6 +2,26 @@
class String
# Prepare the receiver for byte-oriented operations in a way that works
# under both Ruby 1.8 and Ruby 1.9.
#
# When the receiver has no +force_encoding+ method (Ruby 1.8) nothing is
# changed. Otherwise the receiver is re-tagged, in place, as "ASCII-8BIT".
# The receiver itself is returned in either case.
#--
def as_bytes
  return self unless respond_to?(:force_encoding)

  force_encoding("ASCII-8BIT")
end
#++
# Prepare the receiver for string-oriented (character-wise) operations in a
# way that works under both Ruby 1.8 and Ruby 1.9.
#
# When the receiver has no +force_encoding+ method (Ruby 1.8) nothing is
# changed. Otherwise the receiver is re-tagged, in place, as "UTF-8".
# The receiver itself is returned in either case.
#--
def as_utf8
  respond_to?(:force_encoding) ? force_encoding("UTF-8") : self
end
#++
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
#
# :call-seq:
@ -11,12 +31,16 @@ class String
#--
def purify
text = check_ncrs
text.split(//u).grep(UTF8_REGEX).join
if text.respond_to?(:encoding)
text.split(//).collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8
else
text.split(//u).grep(UTF8_REGEX).join
end
end
def check_ncrs
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*') =~ UTF8_REGEX ? m : '' }
text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*') =~ UTF8_REGEX ? m : '' }
text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
end
UTF8_REGEX = /\A(
@ -42,7 +66,7 @@ class String
#--
def is_utf8?
#expand NCRs to utf-8
text = self.check_ncrs
text = self.check_ncrs.as_bytes
# You might think this is faster, but it isn't
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)