Updated to Latest HTML5lib
Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer.
This commit is contained in:
parent
8badd0766a
commit
3bf560c3b3
|
@ -29,6 +29,8 @@ module Sanitize
|
|||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_xhtml(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
|
@ -55,6 +57,8 @@ module Sanitize
|
|||
#
|
||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||
# (REXML trees are always utf-8 encoded.)
|
||||
def sanitize_html(html, options = {})
|
||||
@encoding = 'utf-8'
|
||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||
|
|
|
@ -2,6 +2,12 @@
|
|||
|
||||
class String
|
||||
|
||||
# Check whether a string is valid utf-8
|
||||
#
|
||||
# :call-seq:
|
||||
# string.is_utf8? -> boolean
|
||||
#
|
||||
# returns true if the sequence of bytes in string is valid utf-8
|
||||
def is_utf8?
|
||||
self =~ /^(
|
||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||
|
@ -2138,10 +2144,21 @@ class String
|
|||
'zeetrf' => 'ℨ'
|
||||
}
|
||||
|
||||
# Converts XHTML+MathML named entities to Numeric Character References
|
||||
#
|
||||
# :call-seq:
|
||||
# string.to_ncr -> string
|
||||
#
|
||||
def to_ncr
|
||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||
end
|
||||
|
||||
# Converts XHTML+MathML named entities to Numeric Character References
|
||||
#
|
||||
# :call-seq:
|
||||
# string.to_ncr! -> str or nil
|
||||
#
|
||||
# Substitution is done in-place.
|
||||
def to_ncr!
|
||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||
end
|
||||
|
@ -2159,6 +2176,14 @@ end
|
|||
require 'rexml/element'
|
||||
module REXML
|
||||
class Element
|
||||
|
||||
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
|
||||
#
|
||||
# :call-seq:
|
||||
# elt.to_ncr -> REXML::Element
|
||||
#
|
||||
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||
# access the resulting REXML document.
|
||||
def to_ncr
|
||||
XPath.each(self, '//*') { |el|
|
||||
el.texts.each_index {|i|
|
||||
|
|
|
@ -41,14 +41,14 @@ module HTML5lib
|
|||
super(parser, tree)
|
||||
|
||||
# for special handling of whitespace in <pre>
|
||||
@processSpaceCharactersPre = false
|
||||
@processSpaceCharactersDropNewline = false
|
||||
end
|
||||
|
||||
def processSpaceCharactersPre(data)
|
||||
def processSpaceCharactersDropNewline(data)
|
||||
#Sometimes (start of <pre> blocks) we want to drop leading newlines
|
||||
@processSpaceCharactersPre = false
|
||||
@processSpaceCharactersDropNewline = false
|
||||
if (data.length > 0 and data[0] == ?\n and
|
||||
@tree.openElements[-1].name == 'pre' and
|
||||
%w[pre textarea].include?(@tree.openElements[-1].name) and
|
||||
not @tree.openElements[-1].hasContent)
|
||||
data = data[1..-1]
|
||||
end
|
||||
|
@ -56,8 +56,8 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def processSpaceCharacters(data)
|
||||
if @processSpaceCharactersPre
|
||||
processSpaceCharactersPre(data)
|
||||
if @processSpaceCharactersDropNewline
|
||||
processSpaceCharactersDropNewline(data)
|
||||
else
|
||||
super(data)
|
||||
end
|
||||
|
@ -98,7 +98,7 @@ module HTML5lib
|
|||
def startTagCloseP(name, attributes)
|
||||
endTagP('p') if in_scope?('p')
|
||||
@tree.insertElement(name, attributes)
|
||||
@processSpaceCharactersPre = true if name == 'pre'
|
||||
@processSpaceCharactersDropNewline = true if name == 'pre'
|
||||
end
|
||||
|
||||
def startTagForm(name, attributes)
|
||||
|
@ -248,6 +248,7 @@ module HTML5lib
|
|||
# XXX Form element pointer checking here as well...
|
||||
@tree.insertElement(name, attributes)
|
||||
@parser.tokenizer.contentModelFlag = :RCDATA
|
||||
@processSpaceCharactersDropNewline = true
|
||||
end
|
||||
|
||||
# iframe, noembed, noframes, noscript (if scripting enabled)
|
||||
|
@ -312,7 +313,7 @@ module HTML5lib
|
|||
|
||||
def endTagBlock(name)
|
||||
#Put us back in the right whitespace handling mode
|
||||
@processSpaceCharactersPre = false if name == 'pre'
|
||||
@processSpaceCharactersDropNewline = false if name == 'pre'
|
||||
|
||||
@tree.generateImpliedEndTags if in_scope?(name)
|
||||
|
||||
|
|
137
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
137
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
|
@ -34,7 +34,7 @@ module HTML5lib
|
|||
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
||||
|
||||
# List of where new lines occur
|
||||
@new_lines = []
|
||||
@new_lines = [0]
|
||||
|
||||
# Raw Stream
|
||||
@raw_stream = open_stream(source)
|
||||
|
@ -55,26 +55,28 @@ module HTML5lib
|
|||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = @raw_stream.read
|
||||
unless @char_encoding == 'utf-8'
|
||||
if @char_encoding == 'windows-1252'
|
||||
@win1252 = true
|
||||
elsif @char_encoding != 'utf-8'
|
||||
begin
|
||||
require 'iconv'
|
||||
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
|
||||
begin
|
||||
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
|
||||
rescue
|
||||
@win1252 = true
|
||||
end
|
||||
rescue LoadError
|
||||
rescue Exception
|
||||
@win1252 = true
|
||||
end
|
||||
end
|
||||
|
||||
# Normalize newlines and null characters
|
||||
uString.gsub!(/\r\n?/, "\n")
|
||||
uString.gsub!("\x00", [0xFFFD].pack('U'))
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
@data_stream = uString
|
||||
|
||||
@queue = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
reset
|
||||
@tell = 0
|
||||
end
|
||||
|
||||
# Produces a file object from source.
|
||||
|
@ -136,10 +138,10 @@ module HTML5lib
|
|||
def detect_bom
|
||||
bom_dict = {
|
||||
"\xef\xbb\xbf" => 'utf-8',
|
||||
"\xff\xfe" => 'utf16le',
|
||||
"\xfe\xff" => 'utf16be',
|
||||
"\xff\xfe\x00\x00" => 'utf32le',
|
||||
"\x00\x00\xfe\xff" => 'utf32be'
|
||||
"\xff\xfe" => 'utf-16le',
|
||||
"\xfe\xff" => 'utf-16be',
|
||||
"\xff\xfe\x00\x00" => 'utf-32le',
|
||||
"\x00\x00\xfe\xff" => 'utf-32be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
|
@ -175,68 +177,72 @@ module HTML5lib
|
|||
return parser.get_encoding
|
||||
end
|
||||
|
||||
def determine_new_lines
|
||||
# Looks through the stream to find where new lines occur so
|
||||
# the position method can tell where it is.
|
||||
@new_lines.push(0)
|
||||
(0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
|
||||
end
|
||||
|
||||
# Returns (line, col) of the current position in the stream.
|
||||
def position
|
||||
# Generate list of new lines first time around
|
||||
determine_new_lines if @new_lines.empty?
|
||||
line = 0
|
||||
tell = @tell
|
||||
@new_lines.each do |pos|
|
||||
break unless pos < tell
|
||||
break unless pos < @tell
|
||||
line += 1
|
||||
end
|
||||
col = tell - @new_lines[line-1] - 1
|
||||
col = @tell - @new_lines[line-1] - 1
|
||||
return [line, col]
|
||||
end
|
||||
|
||||
# Resets the position in the stream back to the start.
|
||||
def reset
|
||||
@tell = 0
|
||||
end
|
||||
|
||||
# Read one character from the stream or queue if available. Return
|
||||
# EOF when EOF is reached.
|
||||
def char
|
||||
unless @queue.empty?
|
||||
return @queue.shift
|
||||
else
|
||||
c = @data_stream[@tell]
|
||||
@tell += 1
|
||||
c = @data_stream[@tell - 1]
|
||||
|
||||
case c
|
||||
when 0xC2 .. 0xDF
|
||||
if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
|
||||
@tell += 1
|
||||
@data_stream[@tell-2..@tell-1]
|
||||
else
|
||||
[0xFFFD].pack('U')
|
||||
when 0x01 .. 0x7F
|
||||
if c == 0x0D
|
||||
# normalize newlines
|
||||
@tell += 1 if @data_stream[@tell] == 0x0A
|
||||
c = 0x0A
|
||||
end
|
||||
when 0xE0 .. 0xEF
|
||||
if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
|
||||
@tell += 2
|
||||
@data_stream[@tell-3..@tell-1]
|
||||
|
||||
# record where newlines occur so that the position method
|
||||
# can tell where it is
|
||||
@new_lines << @tell-1 if c == 0x0A
|
||||
|
||||
c.chr
|
||||
|
||||
when 0x80 .. 0xBF
|
||||
if !@win1252
|
||||
[0xFFFD].pack('U') # invalid utf-8
|
||||
elsif c <= 0x9f
|
||||
[ENTITIES_WINDOWS1252[c-0x80]].pack('U')
|
||||
else
|
||||
[0xFFFD].pack('U')
|
||||
"\xC2" + c.chr # convert to utf-8
|
||||
end
|
||||
when 0xF0 .. 0xF3
|
||||
if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
|
||||
@tell += 3
|
||||
@data_stream[@tell-4..@tell-1]
|
||||
|
||||
when 0xC0 .. 0xFF
|
||||
if @win1252
|
||||
"\xC3" + (c-64).chr # convert to utf-8
|
||||
elsif @data_stream[@tell-1 .. -1] =~ /^
|
||||
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
||||
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
||||
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
||||
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
||||
)/x
|
||||
@tell += $1.length - 1
|
||||
$1
|
||||
else
|
||||
[0xFFFD].pack('U')
|
||||
[0xFFFD].pack('U') # invalid utf-8
|
||||
end
|
||||
|
||||
when 0x00
|
||||
[0xFFFD].pack('U') # null characters are invalid
|
||||
|
||||
else
|
||||
begin
|
||||
c.chr
|
||||
rescue
|
||||
:EOF
|
||||
end
|
||||
:EOF
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -247,28 +253,19 @@ module HTML5lib
|
|||
def chars_until(characters, opposite=false)
|
||||
char_stack = [char]
|
||||
|
||||
unless char_stack[0] == :EOF
|
||||
while (characters.include? char_stack[-1]) == opposite
|
||||
unless @queue.empty?
|
||||
# First from the queue
|
||||
char_stack.push(@queue.shift)
|
||||
break if char_stack[-1] == :EOF
|
||||
else
|
||||
# Then the rest
|
||||
begin
|
||||
@tell += 1
|
||||
char_stack.push(@data_stream[@tell-1].chr)
|
||||
rescue
|
||||
char_stack.push(:EOF)
|
||||
break
|
||||
end
|
||||
end
|
||||
end
|
||||
while char_stack.last != :EOF
|
||||
break unless (characters.include?(char_stack.last)) == opposite
|
||||
char_stack.push(char)
|
||||
end
|
||||
|
||||
# Put the character stopped on back to the front of the queue
|
||||
# from where it came.
|
||||
@queue.insert(0, char_stack.pop)
|
||||
c = char_stack.pop
|
||||
if c == :EOF or @data_stream[@tell-1] == c[0]
|
||||
@tell -= 1
|
||||
else
|
||||
@queue.insert(0, c)
|
||||
end
|
||||
return char_stack.join('')
|
||||
end
|
||||
end
|
||||
|
|
|
@ -68,7 +68,6 @@ module HTML5lib
|
|||
# to return we yield the token which pauses processing until the next token
|
||||
# is requested.
|
||||
def each
|
||||
@stream.reset
|
||||
@tokenQueue = []
|
||||
# Start processing. When EOF is reached @state will return false
|
||||
# instead of true and the loop will terminate.
|
||||
|
|
|
@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
|
|||
include HTML5lib
|
||||
|
||||
def sanitize_xhtml stream
|
||||
XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
|
||||
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
|
||||
end
|
||||
|
||||
def sanitize_html stream
|
||||
HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
|
||||
HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
|
||||
end
|
||||
|
||||
def sanitize_rexml stream
|
||||
|
@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
|
|||
sanitize_html("<p>𝒵 𝔸</p>")
|
||||
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
|
||||
sanitize_rexml("<p>𝒵 𝔸</p>")
|
||||
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
|
||||
sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
|
||||
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
|
||||
sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
|
||||
end
|
||||
end
|
||||
|
|
26
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
26
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
|
@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
|||
include HTML5lib
|
||||
|
||||
def test_char_ascii
|
||||
stream = HTMLInputStream.new("'")
|
||||
stream = HTMLInputStream.new("'", :encoding=>'ascii')
|
||||
assert_equal('ascii', stream.char_encoding)
|
||||
assert_equal("'", stream.char)
|
||||
end
|
||||
|
@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
|||
end
|
||||
|
||||
def test_char_utf8
|
||||
stream = HTMLInputStream.new("\xe2\x80\x98")
|
||||
stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
|
||||
assert_equal('utf-8', stream.char_encoding)
|
||||
assert_equal("\xe2\x80\x98", stream.char)
|
||||
end
|
||||
|
||||
def test_char_win1252
|
||||
stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
|
||||
assert_equal('windows-1252', stream.char_encoding)
|
||||
assert_equal("\xc2\xa2", stream.char)
|
||||
assert_equal("\xc3\x85", stream.char)
|
||||
assert_equal("\xc3\xb1", stream.char)
|
||||
assert_equal("\xe2\x80\x99", stream.char)
|
||||
assert_equal("\xe2\x80\xa0", stream.char)
|
||||
end
|
||||
|
||||
def test_bom
|
||||
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
|
||||
assert_equal('utf-8', stream.char_encoding)
|
||||
|
@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
|||
begin
|
||||
require 'iconv'
|
||||
|
||||
def test_char_win1252
|
||||
stream = HTMLInputStream.new("\x91")
|
||||
assert_equal('windows-1252', stream.char_encoding)
|
||||
assert_equal("\xe2\x80\x98", stream.char)
|
||||
end
|
||||
|
||||
def test_utf_16
|
||||
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
|
||||
assert(stream.char_encoding, 'utf-16-le')
|
||||
|
@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
|||
assert_equal(0, stream.instance_eval {@tell})
|
||||
assert_equal("a\nbb\n", stream.chars_until('c'))
|
||||
assert_equal(6, stream.instance_eval {@tell})
|
||||
assert_equal([3,1], stream.position)
|
||||
assert_equal([3,0], stream.position)
|
||||
assert_equal("ccc\ndddd", stream.chars_until('x'))
|
||||
assert_equal(14, stream.instance_eval {@tell})
|
||||
assert_equal([4,5], stream.position)
|
||||
assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
|
||||
assert_equal([4,4], stream.position)
|
||||
assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue