Merge branch 'bzr/golem' of /Users/distler/Sites/code/instiki
This commit is contained in:
commit
f23d892bf9
142 changed files with 519 additions and 843 deletions
|
@ -30,6 +30,7 @@ module Engines
|
|||
|
||||
class Textile < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
|
||||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
|
@ -39,6 +40,7 @@ module Engines
|
|||
|
||||
class Markdown < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly (without math)
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -56,6 +58,7 @@ module Engines
|
|||
|
||||
class MarkdownMML < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -77,6 +80,7 @@ module Engines
|
|||
|
||||
class MarkdownPNG < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
# If the request is for S5, call Maruku accordingly
|
||||
if @content.options[:mode] == :s5
|
||||
my_content = Maruku.new(@content.delete("\r").to_utf8,
|
||||
|
@ -108,6 +112,7 @@ module Engines
|
|||
|
||||
class Mixed < AbstractEngine
|
||||
def mask
|
||||
@content.as_utf8
|
||||
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
|
||||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
|
@ -117,6 +122,7 @@ module Engines
|
|||
|
||||
# Engine that renders the content through RDocSupport::RDocFormatter.
class RDoc < AbstractEngine
  # Calls +as_utf8+ on the content, then returns the formatter's HTML.
  # The HTML is returned directly; the original bound it to a local that
  # was never read.
  def mask
    @content.as_utf8
    RDocSupport::RDocFormatter.new(@content).to_html
  end
end
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
require 'chunks/chunk'
|
||||
require 'sanitize'
|
||||
require 'sanitizer'
|
||||
|
||||
# This chunk allows certain parts of a wiki page to be hidden from the
|
||||
# rest of the rendering pipeline. It should be run at the beginning
|
||||
|
@ -17,7 +17,7 @@ require 'sanitize'
|
|||
|
||||
class NoWiki < Chunk::Abstract
|
||||
|
||||
include Sanitize
|
||||
include Sanitizer
|
||||
|
||||
NOWIKI_PATTERN = Regexp.new('<nowiki>(.*?)</nowiki>', Regexp::MULTILINE)
|
||||
# Returns NOWIKI_PATTERN, the regexp matching <nowiki>...</nowiki> spans.
def self.pattern
  NOWIKI_PATTERN
end
|
||||
|
@ -26,7 +26,7 @@ class NoWiki < Chunk::Abstract
|
|||
|
||||
# Builds the chunk via the superclass initializer, then stores the sanitized
# first capture group of the match as both the plain text and the unmask text.
#
# Only safe_xhtml_sanitize is called: the earlier safe_sanitize_xhtml
# assignment was overwritten on the very next statement, so it only cost a
# second sanitization pass.
def initialize(match_data, content)
  super
  @plain_text = @unmask_text = safe_xhtml_sanitize(match_data[1])
end
|
||||
|
||||
end
|
||||
|
|
262
lib/sanitize.rb
262
lib/sanitize.rb
|
@ -1,262 +0,0 @@
|
|||
# == Introduction
|
||||
#
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes. Its genesis is {described here}[http://golem.ph.utexas.edu/~distler/blog/archives/001181.html].
|
||||
#
|
||||
# Uses the {HTML5lib parser}[http://code.google.com/p/html5lib/], so that the parsing behaviour should
|
||||
# resemble that of browsers.
|
||||
#
|
||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||
# sanitize_rexml() sanitizes a REXML tree, returning a string
|
||||
# safe_sanitize_xhtml() makes extra-sure that the result is well-formed XHTML
|
||||
# by running the output of sanitize_xhtml() through REXML
|
||||
#
|
||||
# == Files
|
||||
#
|
||||
# {sanitize.rb}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/lib/sanitize.rb],
|
||||
# {HTML5lib}[http://golem.ph.utexas.edu/~distler/code/instiki/svn/vendor/plugins/HTML5lib/]
|
||||
#
|
||||
# == Author
|
||||
#
|
||||
# {Jacques Distler}[http://golem.ph.utexas.edu/~distler/]
|
||||
#
|
||||
# == License
|
||||
#
|
||||
# Ruby License
|
||||
|
||||
module Sanitize

  require 'html5/html5parser'
  require 'html5/liberalxmlparser'
  require 'html5/treewalkers'
  require 'html5/treebuilders'
  require 'html5/serializer'
  require 'html5/sanitizer'
  require 'stringsupport.rb'

  include HTML5

  # Sanitize a string, parsed using XHTML parsing rules.
  #
  # :call-seq:
  #    sanitize_xhtml(string)                    -> string
  #    sanitize_xhtml(string, {:encoding => 'iso-8859-1', :to_tree => true}) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_xhtml(html, options = {})
    # XHTML is case-sensitive, so the parser must not lowercase names.
    parse_sanitized_fragment(XHTMLParser, html, options,
                             :lowercase_element_name => false,
                             :lowercase_attr_name => false)
  end

  # Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
  # ensure well-formedness.
  #
  # :call-seq:
  #    safe_sanitize_xhtml(string)                    -> string
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  #
  # If the sanitized markup is not well-formed, it is returned HTML-escaped instead.
  def safe_sanitize_xhtml(html, options = {})
    # merge, rather than assigning into +options+, so the caller's hash is untouched.
    sanitized = sanitize_xhtml(html, options.merge(:to_tree => false))
    begin
      doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
      # Serialize the wrapper's children instead of regexp-stripping the wrapper:
      # REXML writes an empty wrapper as a self-closed <div .../>, which the
      # open/close-tag regexp never matched, so empty input leaked the wrapper.
      doc.root.children.map { |child| child.to_s }.join
    rescue REXML::ParseException
      sanitized.escapeHTML
    end
  end

  # Sanitize a string, parsed using HTML parsing rules.
  #
  # :call-seq:
  #    sanitize_html( string )                     -> string
  #    sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document
  #
  # Unless otherwise specified, the string is assumed to be utf-8 encoded.
  # By default, the output is a string. But, optionally, you can return a REXML tree.
  #
  # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
  # (REXML trees are always utf-8 encoded.)
  def sanitize_html(html, options = {})
    parse_sanitized_fragment(HTMLParser, html, options)
  end

  # Sanitize a REXML tree. The output is a string.
  #
  # :call-seq:
  #    sanitize_rexml(tree)                    -> string
  #
  def sanitize_rexml(tree)
    tokens = TreeWalkers.get_tree_walker('rexml2').new(tree)
    XHTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
      :space_before_trailing_solidus => true,
      :inject_meta_charset => false,
      :sanitize => true})
  end

  private

  # Shared implementation of sanitize_xhtml and sanitize_html.
  #
  # +parser+ is the parser class to call +parse_fragment+ on, +options+ is the
  # caller's option hash (only :encoding, :treebuilder and :to_tree are
  # honoured; other keys are ignored) and +parser_options+ are extra settings
  # passed straight through to the parser.
  #
  # Settings are kept in locals. The original wrote them to @encoding,
  # @treebuilder and @to_tree on whatever object mixed this module in.
  def parse_sanitized_fragment(parser, html, options, parser_options = {})
    encoding    = 'utf-8'
    treebuilder = TreeBuilders::REXML::TreeBuilder
    to_tree     = false

    options.each do |name, value|
      case name.to_s
      when 'encoding'
        encoding = value
      when 'treebuilder'
        # NOTE(review): the original called HTML5lib::TreeBuilders here. Nothing
        # else in this file uses an HTML5lib namespace (the library is included
        # as HTML5), so the lookup now goes through HTML5::TreeBuilders.
        # Confirm get_tree_builder is defined there.
        treebuilder = HTML5::TreeBuilders.get_tree_builder(value)
      when 'to_tree'
        to_tree = value
      end
    end

    # utf-8 input is converted with to_utf8; any other encoding with to_ncr.
    source = encoding == 'utf-8' ? html.to_utf8 : html.to_ncr
    parsed = parser.parse_fragment(source,
      parser_options.merge(:tokenizer => HTMLSanitizer,
                           :encoding => encoding,
                           :tree => treebuilder))

    to_tree ? parsed : parsed.to_s
  end

end
|
||||
|
||||
require 'rexml/element'
|
||||
module REXML #:nodoc:
  class Element

    # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
    #
    # :call-seq:
    #    tree.to_ncr  -> REXML::Element
    #
    # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
    # access the resulting REXML document.
    #
    # Note that this method needs to traverse the entire tree, converting text nodes and attributes
    # for each element. This can be SLOW. It will often be faster to serialize to a string and then
    # use String.to_ncr instead.
    #
    # NOTE(review): only descendant elements are visited; the receiver's own
    # text nodes and attributes are left as they are. Confirm that is intended.
    def to_ncr
      each_element do |el|
        # Walk the text nodes once; calling el.texts per index rebuilt the list each time.
        el.texts.each { |text| text.value = text.to_s.to_ncr }
        el.attributes.each { |name, val| el.attributes[name] = val.to_ncr }
        el.to_ncr if el.has_elements?
      end
      self
    end

    # Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8
    #
    # :call-seq:
    #    tree.to_utf8  -> REXML::Element
    #
    # Note that this method needs to traverse the entire tree, converting text nodes and attributes
    # for each element. This can be SLOW. It will often be faster to serialize to a string and then
    # use String.to_utf8 instead.
    #
    # NOTE(review): only descendant elements are visited; the receiver's own
    # text nodes and attributes are left as they are. Confirm that is intended.
    def to_utf8
      each_element do |el|
        # Walk the text nodes once; calling el.texts per index rebuilt the list each time.
        el.texts.each { |text| text.value = text.to_s.to_utf8 }
        el.attributes.each { |name, val| el.attributes[name] = val.to_utf8 }
        el.to_utf8 if el.has_elements?
      end
      self
    end

  end
end
|
||||
|
||||
module HTML5 #:nodoc: all
  module TreeWalkers

    # The bare +private+ that used to sit here did nothing: it does not apply
    # to methods defined inside +class << self+ or inside the nested REXML2 module.
    class << self
      # Look up a tree-walker class by name (case-insensitive).
      #
      # 'rexml' requires html5/treewalkers/rexml and returns REXML::TreeWalker;
      # 'rexml2' returns the REXML2::TreeWalker defined below. Any other name
      # raises a RuntimeError.
      def [](name)
        case name.to_s.downcase
        when 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        when 'rexml2'
          REXML2::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      alias :get_tree_walker :[]
    end

    module REXML2
      # Tree walker over REXML nodes. Attribute values and text are passed
      # through String#to_utf8 as they are reported.
      class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker

        private

        # Describe +node+ as an array whose first entry is the node kind,
        # followed by kind-specific details.
        def node_details(node)
          case node
          when ::REXML::Document
            [:DOCUMENT]
          when ::REXML::Element
            if node.name
              [:ELEMENT, node.name,
                node.attributes.map { |name, value| [name, value.to_utf8] },
                node.has_elements? || node.has_text?]
            else
              # An element without a name is reported as a document fragment.
              [:DOCUMENT_FRAGMENT]
            end
          when ::REXML::Text
            [:TEXT, node.value.to_utf8]
          when ::REXML::Comment
            [:COMMENT, node.string]
          when ::REXML::DocType
            [:DOCTYPE, node.name, node.public, node.system]
          when ::REXML::XMLDecl
            [nil]
          else
            [:UNKNOWN, node.class.inspect]
          end
        end

        # First child of +node+ (nil when it has none).
        def first_child(node)
          node.children.first
        end

        # Sibling following +node+, as reported by REXML.
        def next_sibling(node)
          node.next_sibling
        end

        # Parent of +node+, as reported by REXML.
        def parent(node)
          node.parent
        end
      end
    end
  end
end
|
|
@ -169,7 +169,7 @@ module Sanitizer
|
|||
node.attributes.delete attr; next
|
||||
end
|
||||
if ATTR_VAL_IS_URI.include?(attr)
|
||||
val_unescaped = val.unescapeHTML.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
||||
val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
||||
node.attributes.delete attr; next
|
||||
end
|
||||
|
@ -206,4 +206,23 @@ module Sanitizer
|
|||
|
||||
clean.join(' ')
|
||||
end
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
|
||||
# ensure well-formedness.
|
||||
#
|
||||
# :call-seq:
|
||||
# safe_xhtml_sanitize(string) -> string
|
||||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
#
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
# Purifies and sanitizes +html+, then reparses the result with REXML to make
# sure it is well-formed. Markup that fails to reparse is returned
# HTML-escaped instead.
#
# +options+ is accepted but not consulted.
def safe_xhtml_sanitize(html, options = {})
  sanitized = xhtml_sanitize(html.purify)
  # The rescue is scoped to the reparse only, so +sanitized+ is always set
  # when the fallback runs.
  begin
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
    # Serialize the wrapper's children instead of regexp-stripping the wrapper:
    # REXML writes an empty wrapper as a self-closed <div .../>, which the
    # open/close-tag regexp never matched, so empty input leaked the wrapper.
    doc.root.children.map { |child| child.to_s }.join
  rescue REXML::ParseException
    sanitized.escapeHTML
  end
end
|
||||
|
||||
end
|
||||
|
|
|
@ -2,6 +2,26 @@
|
|||
|
||||
class String
|
||||
|
||||
# A method to allow byte-oriented operations in both Ruby 1.8 and Ruby 1.9
|
||||
#
|
||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT"
|
||||
#--
|
||||
# Tags the receiver as ASCII-8BIT in place and returns it. Where strings have
# no force_encoding (Ruby 1.8) the receiver is returned untouched. A frozen
# receiver cannot be re-tagged in place, so a re-tagged copy is returned
# instead of raising.
def as_bytes
  return self unless respond_to?(:force_encoding)
  target = frozen? ? dup : self
  target.force_encoding("ASCII-8BIT")
end
|
||||
|
||||
#++
|
||||
# A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9
|
||||
#
|
||||
# Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8"
|
||||
#--
|
||||
# Tags the receiver as UTF-8 in place and returns it. Where strings have no
# force_encoding (Ruby 1.8) the receiver is returned untouched. A frozen
# receiver cannot be re-tagged in place, so a re-tagged copy is returned
# instead of raising.
def as_utf8
  return self unless respond_to?(:force_encoding)
  target = frozen? ? dup : self
  target.force_encoding("UTF-8")
end
|
||||
|
||||
#++
|
||||
# Take a string, and remove any invalid substrings, returning a valid utf-8 string.
|
||||
#
|
||||
# :call-seq:
|
||||
|
@ -11,12 +31,16 @@ class String
|
|||
#--
|
||||
# Runs check_ncrs, then keeps only the characters that match UTF8_REGEX and
# joins them back together.
#
# The leftover unconditional split(//u).grep(UTF8_REGEX).join pass that ran
# before the branch is gone; its result was discarded.
def purify
  text = check_ncrs
  if text.respond_to?(:encoding)
    # Encoding-aware Ruby: match each character as raw bytes, then tag the
    # joined result as UTF-8.
    text.split(//).map { |char| char.as_bytes }.grep(UTF8_REGEX).join.as_utf8
  else
    # Ruby 1.8: split on UTF-8 character boundaries.
    text.split(//u).grep(UTF8_REGEX).join
  end
end
|
||||
|
||||
# Returns a copy of the string in which every hexadecimal (&#x...;) or
# decimal (&#...;) numeric character reference is kept only if the character
# it names, packed as UTF-8 bytes, matches UTF8_REGEX. All others are removed.
#
# References too large for pack('U*') are removed rather than raising
# RangeError. The two leftover pre-change gsub passes are gone; their results
# were discarded or overwritten.
def check_ncrs
  keep_if_valid = lambda do |ncr, codepoint|
    begin
      [codepoint].pack('U*').as_bytes =~ UTF8_REGEX ? ncr : ''
    rescue RangeError
      # pack('U*') rejects values it cannot encode; treat them as invalid.
      ''
    end
  end

  text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |ncr| keep_if_valid.call(ncr, $1.hex) }
  text.gsub(/&#(\d+);/) { |ncr| keep_if_valid.call(ncr, $1.to_i) }
end
|
||||
|
||||
UTF8_REGEX = /\A(
|
||||
|
@ -42,7 +66,7 @@ class String
|
|||
#--
|
||||
def is_utf8?
|
||||
#expand NCRs to utf-8
|
||||
text = self.check_ncrs
|
||||
text = self.check_ncrs.as_bytes
|
||||
|
||||
# You might think this is faster, but it isn't
|
||||
#pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue