Merge branch 'bzr/golem' of /Users/distler/Sites/code/instiki

This commit is contained in:
Jacques Distler 2009-11-30 16:35:46 -06:00
commit f23d892bf9
142 changed files with 519 additions and 843 deletions

View file

@ -30,6 +30,7 @@ module Engines
class Textile < AbstractEngine
def mask
@content.as_utf8
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -39,6 +40,7 @@ module Engines
class Markdown < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly (without math)
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -56,6 +58,7 @@ module Engines
class MarkdownMML < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -77,6 +80,7 @@ module Engines
class MarkdownPNG < AbstractEngine
def mask
@content.as_utf8
# If the request is for S5, call Maruku accordingly
if @content.options[:mode] == :s5
my_content = Maruku.new(@content.delete("\r").to_utf8,
@ -108,6 +112,7 @@ module Engines
class Mixed < AbstractEngine
def mask
@content.as_utf8
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -117,6 +122,7 @@ module Engines
class RDoc < AbstractEngine
  # Renders the chunk content as HTML through RDocSupport::RDocFormatter.
  #
  # Returns whatever RDocFormatter#to_html returns.
  def mask
    # Tag the content as UTF-8 before handing it to the formatter.
    @content.as_utf8
    RDocSupport::RDocFormatter.new(@content).to_html
  end
end

View file

@ -1,5 +1,5 @@
require 'chunks/chunk'
require 'sanitize'
require 'sanitizer'
# This chunk allows certain parts of a wiki page to be hidden from the
# rest of the rendering pipeline. It should be run at the beginning
@ -17,7 +17,7 @@ require 'sanitize'
class NoWiki < Chunk::Abstract
include Sanitize
include Sanitizer
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
def self.pattern() NOWIKI_PATTERN end
@ -26,7 +26,7 @@ class NoWiki < Chunk::Abstract
# Builds a NoWiki chunk from a <nowiki>...</nowiki> match.
#
# match_data[1] is the text captured between the tags. It is sanitized once
# and the same sanitized string is used as both the plain text and the
# unmask text of the chunk.
def initialize(match_data, content)
  super
  # Sanitize exactly once; a second, earlier sanitizing pass had its result
  # immediately overwritten by this assignment.
  @plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
end
end

View file

@ -1,262 +0,0 @@
# == Introduction
#
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
#
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
# resemble that of browsers.
#
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
# sanitize_rexml() sanitizes a REXML tree, returning a string
# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
# by running the output of sanitize_xhtml() through REXML
#
# == Files
#
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
#
# == Author
#
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
#
# == License
#
# Ruby License
module Sanitize

  require 'html5/html5parser'
  require 'html5/liberalxmlparser'
  require 'html5/treewalkers'
  require 'html5/treebuilders'
  require 'html5/serializer'
  require 'html5/sanitizer'
  require 'stringsupport.rb'

  include HTML5

  # Sanitize a string, parsed using XHTML parsing rules.
  #
  # :call-seq:
  #    sanitize_xhtml(string)                    -> string
  #    sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # Recognised options are :encoding, :treebuilder and :to_tree (symbol or
  # string keys); any other key is ignored.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_xhtml(html, options = {})
    # XHTML is case-sensitive, so the parser must not fold names to lowercase.
    sanitize_fragment(XHTMLParser, html, options,
                      :lowercase_element_name => false,
                      :lowercase_attr_name => false)
  end

  # Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
  # ensure well-formedness.
  #
  # :call-seq:
  #    safe_sanitize_xhtml(string)                    -> string
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  #
  # If REXML rejects the sanitized markup, the sanitized text is returned
  # HTML-escaped instead.
  def safe_sanitize_xhtml(html, options = {})
    # Force string output without writing into the caller's hash.
    sanitized = sanitize_xhtml(html, options.merge(:to_tree => false))
    # Wrap in a namespaced <div> so REXML sees one root element, then strip
    # the wrapper off the serialized result.
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
    doc.to_s.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    sanitized.escapeHTML
  end

  # Sanitize a string, parsed using HTML parsing rules.
  #
  # :call-seq:
  #    sanitize_html( string )                 -> string
  #    sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # Recognised options are :encoding, :treebuilder and :to_tree (symbol or
  # string keys); any other key is ignored.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_html(html, options = {})
    sanitize_fragment(HTMLParser, html, options)
  end

  # Sanitize a REXML tree. The output is a string.
  #
  # :call-seq:
  #    sanitize_rexml(tree)                    -> string
  #
  def sanitize_rexml(tree)
    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
      :sanitize => true})
  end

  private

  # Shared implementation behind sanitize_xhtml and sanitize_html.
  #
  # +parser+ is the parser class to call parse_fragment on, +options+ is the
  # caller-supplied option hash, and +parser_options+ holds extra settings
  # passed straight through to the parser.
  #
  # The settings are stored in instance variables, exactly as the public
  # methods did before this helper was extracted, in case an including class
  # reads them.
  def sanitize_fragment(parser, html, options, parser_options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
    @to_tree = false
    options.each do |name, value|
      case name.to_s
      when 'treebuilder'
        # Looked up through the included HTML5 namespace, like the default
        # tree builder above and the tree walker in sanitize_rexml.
        @treebuilder = TreeBuilders.get_tree_builder(value)
      when 'encoding', 'to_tree'
        instance_variable_set("@#{name}", value)
      end
    end

    # utf-8 input has its named entities expanded to utf-8 characters; any
    # other encoding gets numeric character references instead.
    source = @encoding == 'utf-8' ? html.to_utf8 : html.to_ncr
    parsed = parser.parse_fragment(source,
      parser_options.merge(:tokenizer => HTMLSanitizer,
                           :encoding => @encoding,
                           :tree => @treebuilder))
    @to_tree ? parsed : parsed.to_s
  end
end
require 'rexml/element'
module REXML #:nodoc:
  class Element

    # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
    #
    # :call-seq:
    #    tree.to_ncr  -> REXML::Element
    #
    # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
    # access the resulting REXML document.
    #
    # Every descendant element has its text nodes and attribute values rewritten
    # in place; the receiver itself is returned. This walks the whole tree and can
    # be SLOW. It will often be faster to serialize to a string and then use
    # String.to_ncr instead.
    #
    def to_ncr
      each_element do |child|
        child.texts.each { |text| text.value = text.to_s.to_ncr }
        child.attributes.each do |attr_name, attr_value|
          child.attributes[attr_name] = attr_value.to_ncr
        end
        # Recurse only when there is something below this child.
        child.to_ncr if child.has_elements?
      end
      self
    end

    # Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
    #
    # :call-seq:
    #    tree.to_utf8  -> REXML::Element
    #
    # Every descendant element has its text nodes and attribute values rewritten
    # in place; the receiver itself is returned. This walks the whole tree and can
    # be SLOW. It will often be faster to serialize to a string and then use
    # String.to_utf8 instead.
    #
    def to_utf8
      each_element do |child|
        child.texts.each { |text| text.value = text.to_s.to_utf8 }
        child.attributes.each do |attr_name, attr_value|
          child.attributes[attr_name] = attr_value.to_utf8
        end
        # Recurse only when there is something below this child.
        child.to_utf8 if child.has_elements?
      end
      self
    end

  end
end
module HTML5 #:nodoc: all
  module TreeWalkers

    private

    class << self
      # Look up a tree walker class by name (case-insensitive).
      #
      # 'rexml' loads html5/treewalkers/rexml on demand and returns its
      # walker; 'rexml2' returns the walker defined below. Any other name
      # raises a RuntimeError.
      def [](name)
        walker_name = name.to_s.downcase
        if walker_name == 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        elsif walker_name == 'rexml2'
          REXML2::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      alias :get_tree_walker :[]
    end

    module REXML2
      # Tree walker over REXML nodes that passes attribute values and text
      # through String#to_utf8 before reporting them.
      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

        private

        # Describe +node+ as an array whose first entry is the node type,
        # followed by type-specific details.
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            # An element without a name is reported as a document fragment.
            return [:DOCUMENT_FRAGMENT] unless node.name
            attrs = node.attributes.map { |attr_name, attr_value| [attr_name, attr_value.to_utf8] }
            has_content = node.has_elements? || node.has_text?
            [:ELEMENT, node.name, attrs, has_content]
          when ::REXML::Text
            [:TEXT, node.value.to_utf8]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name, node.public, node.system]
          when ::REXML::XMLDecl
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+, or nil when it has none.
        def first_child(node)
          node.children.first
        end

        # The sibling following +node+, as reported by REXML.
        def next_sibling(node)
          node.next_sibling
        end

        # The parent of +node+, as reported by REXML.
        def parent(node)
          node.parent
        end

      end
    end
  end
end

View file

@ -169,7 +169,7 @@ module Sanitizer
node.attributes.delete attr; next
end
if ATTR_VAL_IS_URI.include?(attr)
val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
node.attributes.delete attr; next
end
@ -206,4 +206,23 @@ module Sanitizer
clean.join(' ')
end
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
# ensure well-formedness.
#
# :call-seq:
# safe_xhtml_sanitize(string) -> string
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def safe_xhtml_sanitize(html, options = {})
  # Drop invalid byte sequences first, then sanitize the markup.
  clean = xhtml_sanitize(html.purify)
  # Wrap the fragment in a namespaced <div> so REXML sees a single root
  # element, and let REXML reparse it as a well-formedness check.
  wrapper = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{clean}</div>")
  serialized = wrapper.to_s
  # Peel the wrapper <div> back off the serialized document.
  serialized.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
  # Not well-formed: fall back to the sanitized text with markup escaped.
  clean.escapeHTML
end
end

View file

@ -2,6 +2,26 @@
class String
# A method to allow byte-oriented operations in both Ruby 1.8 and Ruby 1.9
#
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT"
#--
def as_bytes
  # Strings without force_encoding (Ruby 1.8) have no encoding to change.
  return self unless respond_to?(:force_encoding)
  # Retags the receiver in place; force_encoding returns the receiver.
  force_encoding("ASCII-8BIT")
end
#++
# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
#
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8"
#--
def as_utf8
  # Strings without force_encoding (Ruby 1.8) have no encoding to change.
  return self unless respond_to?(:force_encoding)
  # Retags the receiver in place; force_encoding returns the receiver.
  force_encoding("UTF-8")
end
#++
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
#
# :call-seq:
@ -11,12 +31,16 @@ class String
#--
# Returns a new string built from the characters of the receiver that match
# UTF8_REGEX, after check_ncrs has removed unacceptable numeric character
# references.
def purify
  text = check_ncrs
  if text.respond_to?(:encoding)
    # Encoding-aware Ruby (1.9+): split into characters, match each one as
    # raw bytes against UTF8_REGEX, then tag the joined result as UTF-8.
    text.split(//).map { |c| c.as_bytes }.grep(UTF8_REGEX).join.as_utf8
  else
    # Ruby 1.8: the /u flag makes the split character-wise.
    text.split(//u).grep(UTF8_REGEX).join
  end
end
# Returns a copy of the receiver in which every numeric character reference
# (hexadecimal &#x...; first, then decimal &#...;) is kept only if the
# character it denotes matches UTF8_REGEX; all others are removed.
def check_ncrs
  # The referenced character is compared as raw bytes so the match also
  # works on encoding-aware Rubies.
  text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
  text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' }
end
UTF8_REGEX = /\A(
@ -42,7 +66,7 @@ class String
#--
def is_utf8?
#expand NCRs to utf-8
text = self.check_ncrs
text = self.check_ncrs.as_bytes
# You might think this is faster, but it isn't
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)