Updated to Latest HTML5lib
Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer.
This commit is contained in:
parent
8badd0766a
commit
3bf560c3b3
7 changed files with 127 additions and 93 deletions
|
@ -29,6 +29,8 @@ module Sanitize
|
||||||
#
|
#
|
||||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||||
|
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||||
|
# (REXML trees are always utf-8 encoded.)
|
||||||
def sanitize_xhtml(html, options = {})
|
def sanitize_xhtml(html, options = {})
|
||||||
@encoding = 'utf-8'
|
@encoding = 'utf-8'
|
||||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||||
|
@ -55,6 +57,8 @@ module Sanitize
|
||||||
#
|
#
|
||||||
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
|
||||||
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
# By default, the output is a string. But, optionally, you can return a REXML tree.
|
||||||
|
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
|
||||||
|
# (REXML trees are always utf-8 encoded.)
|
||||||
def sanitize_html(html, options = {})
|
def sanitize_html(html, options = {})
|
||||||
@encoding = 'utf-8'
|
@encoding = 'utf-8'
|
||||||
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
@treebuilder = TreeBuilders::REXML::TreeBuilder
|
||||||
|
|
|
@ -2,6 +2,12 @@
|
||||||
|
|
||||||
class String
|
class String
|
||||||
|
|
||||||
|
# Check whether a string is valid utf-8
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# string.is_utf8? -> boolean
|
||||||
|
#
|
||||||
|
# returns true if the sequence of bytes in string is valid utf-8
|
||||||
def is_utf8?
|
def is_utf8?
|
||||||
self =~ /^(
|
self =~ /^(
|
||||||
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
||||||
|
@ -2138,10 +2144,21 @@ class String
|
||||||
'zeetrf' => 'ℨ'
|
'zeetrf' => 'ℨ'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Converts XHTML+MathML named entities to Numeric Character References
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# string.to_ncr -> string
|
||||||
|
#
|
||||||
def to_ncr
|
def to_ncr
|
||||||
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Converts XHTML+MathML named entities to Numeric Character References
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# string.to_ncr! -> str or nil
|
||||||
|
#
|
||||||
|
# Substitution is done in-place.
|
||||||
def to_ncr!
|
def to_ncr!
|
||||||
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
|
||||||
end
|
end
|
||||||
|
@ -2159,6 +2176,14 @@ end
|
||||||
require 'rexml/element'
|
require 'rexml/element'
|
||||||
module REXML
|
module REXML
|
||||||
class Element
|
class Element
|
||||||
|
|
||||||
|
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
|
||||||
|
#
|
||||||
|
# :call-seq:
|
||||||
|
# elt.to_ncr -> REXML::Element
|
||||||
|
#
|
||||||
|
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
|
||||||
|
# access the resulting REXML document.
|
||||||
def to_ncr
|
def to_ncr
|
||||||
XPath.each(self, '//*') { |el|
|
XPath.each(self, '//*') { |el|
|
||||||
el.texts.each_index {|i|
|
el.texts.each_index {|i|
|
||||||
|
|
|
@ -41,14 +41,14 @@ module HTML5lib
|
||||||
super(parser, tree)
|
super(parser, tree)
|
||||||
|
|
||||||
# for special handling of whitespace in <pre>
|
# for special handling of whitespace in <pre>
|
||||||
@processSpaceCharactersPre = false
|
@processSpaceCharactersDropNewline = false
|
||||||
end
|
end
|
||||||
|
|
||||||
def processSpaceCharactersPre(data)
|
def processSpaceCharactersDropNewline(data)
|
||||||
#Sometimes (start of <pre> blocks) we want to drop leading newlines
|
#Sometimes (start of <pre> blocks) we want to drop leading newlines
|
||||||
@processSpaceCharactersPre = false
|
@processSpaceCharactersDropNewline = false
|
||||||
if (data.length > 0 and data[0] == ?\n and
|
if (data.length > 0 and data[0] == ?\n and
|
||||||
@tree.openElements[-1].name == 'pre' and
|
%w[pre textarea].include?(@tree.openElements[-1].name) and
|
||||||
not @tree.openElements[-1].hasContent)
|
not @tree.openElements[-1].hasContent)
|
||||||
data = data[1..-1]
|
data = data[1..-1]
|
||||||
end
|
end
|
||||||
|
@ -56,8 +56,8 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
def processSpaceCharacters(data)
|
def processSpaceCharacters(data)
|
||||||
if @processSpaceCharactersPre
|
if @processSpaceCharactersDropNewline
|
||||||
processSpaceCharactersPre(data)
|
processSpaceCharactersDropNewline(data)
|
||||||
else
|
else
|
||||||
super(data)
|
super(data)
|
||||||
end
|
end
|
||||||
|
@ -98,7 +98,7 @@ module HTML5lib
|
||||||
def startTagCloseP(name, attributes)
|
def startTagCloseP(name, attributes)
|
||||||
endTagP('p') if in_scope?('p')
|
endTagP('p') if in_scope?('p')
|
||||||
@tree.insertElement(name, attributes)
|
@tree.insertElement(name, attributes)
|
||||||
@processSpaceCharactersPre = true if name == 'pre'
|
@processSpaceCharactersDropNewline = true if name == 'pre'
|
||||||
end
|
end
|
||||||
|
|
||||||
def startTagForm(name, attributes)
|
def startTagForm(name, attributes)
|
||||||
|
@ -248,6 +248,7 @@ module HTML5lib
|
||||||
# XXX Form element pointer checking here as well...
|
# XXX Form element pointer checking here as well...
|
||||||
@tree.insertElement(name, attributes)
|
@tree.insertElement(name, attributes)
|
||||||
@parser.tokenizer.contentModelFlag = :RCDATA
|
@parser.tokenizer.contentModelFlag = :RCDATA
|
||||||
|
@processSpaceCharactersDropNewline = true
|
||||||
end
|
end
|
||||||
|
|
||||||
# iframe, noembed noframes, noscript(if scripting enabled)
|
# iframe, noembed noframes, noscript(if scripting enabled)
|
||||||
|
@ -312,7 +313,7 @@ module HTML5lib
|
||||||
|
|
||||||
def endTagBlock(name)
|
def endTagBlock(name)
|
||||||
#Put us back in the right whitespace handling mode
|
#Put us back in the right whitespace handling mode
|
||||||
@processSpaceCharactersPre = false if name == 'pre'
|
@processSpaceCharactersDropNewline = false if name == 'pre'
|
||||||
|
|
||||||
@tree.generateImpliedEndTags if in_scope?(name)
|
@tree.generateImpliedEndTags if in_scope?(name)
|
||||||
|
|
||||||
|
|
137
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
137
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
|
@ -34,7 +34,7 @@ module HTML5lib
|
||||||
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
||||||
|
|
||||||
# List of where new lines occur
|
# List of where new lines occur
|
||||||
@new_lines = []
|
@new_lines = [0]
|
||||||
|
|
||||||
# Raw Stream
|
# Raw Stream
|
||||||
@raw_stream = open_stream(source)
|
@raw_stream = open_stream(source)
|
||||||
|
@ -55,26 +55,28 @@ module HTML5lib
|
||||||
|
|
||||||
# Read bytes from stream decoding them into Unicode
|
# Read bytes from stream decoding them into Unicode
|
||||||
uString = @raw_stream.read
|
uString = @raw_stream.read
|
||||||
unless @char_encoding == 'utf-8'
|
if @char_encoding == 'windows-1252'
|
||||||
|
@win1252 = true
|
||||||
|
elsif @char_encoding != 'utf-8'
|
||||||
begin
|
begin
|
||||||
require 'iconv'
|
require 'iconv'
|
||||||
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
|
begin
|
||||||
|
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
|
||||||
|
rescue
|
||||||
|
@win1252 = true
|
||||||
|
end
|
||||||
rescue LoadError
|
rescue LoadError
|
||||||
rescue Exception
|
@win1252 = true
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Normalize newlines and null characters
|
|
||||||
uString.gsub!(/\r\n?/, "\n")
|
|
||||||
uString.gsub!("\x00", [0xFFFD].pack('U'))
|
|
||||||
|
|
||||||
# Convert the unicode string into a list to be used as the data stream
|
# Convert the unicode string into a list to be used as the data stream
|
||||||
@data_stream = uString
|
@data_stream = uString
|
||||||
|
|
||||||
@queue = []
|
@queue = []
|
||||||
|
|
||||||
# Reset position in the list to read from
|
# Reset position in the list to read from
|
||||||
reset
|
@tell = 0
|
||||||
end
|
end
|
||||||
|
|
||||||
# Produces a file object from source.
|
# Produces a file object from source.
|
||||||
|
@ -136,10 +138,10 @@ module HTML5lib
|
||||||
def detect_bom
|
def detect_bom
|
||||||
bom_dict = {
|
bom_dict = {
|
||||||
"\xef\xbb\xbf" => 'utf-8',
|
"\xef\xbb\xbf" => 'utf-8',
|
||||||
"\xff\xfe" => 'utf16le',
|
"\xff\xfe" => 'utf-16le',
|
||||||
"\xfe\xff" => 'utf16be',
|
"\xfe\xff" => 'utf-16be',
|
||||||
"\xff\xfe\x00\x00" => 'utf32le',
|
"\xff\xfe\x00\x00" => 'utf-32le',
|
||||||
"\x00\x00\xfe\xff" => 'utf32be'
|
"\x00\x00\xfe\xff" => 'utf-32be'
|
||||||
}
|
}
|
||||||
|
|
||||||
# Go to beginning of file and read in 4 bytes
|
# Go to beginning of file and read in 4 bytes
|
||||||
|
@ -175,68 +177,72 @@ module HTML5lib
|
||||||
return parser.get_encoding
|
return parser.get_encoding
|
||||||
end
|
end
|
||||||
|
|
||||||
def determine_new_lines
|
|
||||||
# Looks through the stream to find where new lines occur so
|
|
||||||
# the position method can tell where it is.
|
|
||||||
@new_lines.push(0)
|
|
||||||
(0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
|
|
||||||
end
|
|
||||||
|
|
||||||
# Returns (line, col) of the current position in the stream.
|
# Returns (line, col) of the current position in the stream.
|
||||||
def position
|
def position
|
||||||
# Generate list of new lines first time around
|
|
||||||
determine_new_lines if @new_lines.empty?
|
|
||||||
line = 0
|
line = 0
|
||||||
tell = @tell
|
|
||||||
@new_lines.each do |pos|
|
@new_lines.each do |pos|
|
||||||
break unless pos < tell
|
break unless pos < @tell
|
||||||
line += 1
|
line += 1
|
||||||
end
|
end
|
||||||
col = tell - @new_lines[line-1] - 1
|
col = @tell - @new_lines[line-1] - 1
|
||||||
return [line, col]
|
return [line, col]
|
||||||
end
|
end
|
||||||
|
|
||||||
# Resets the position in the stream back to the start.
|
|
||||||
def reset
|
|
||||||
@tell = 0
|
|
||||||
end
|
|
||||||
|
|
||||||
# Read one character from the stream or queue if available. Return
|
# Read one character from the stream or queue if available. Return
|
||||||
# EOF when EOF is reached.
|
# EOF when EOF is reached.
|
||||||
def char
|
def char
|
||||||
unless @queue.empty?
|
unless @queue.empty?
|
||||||
return @queue.shift
|
return @queue.shift
|
||||||
else
|
else
|
||||||
|
c = @data_stream[@tell]
|
||||||
@tell += 1
|
@tell += 1
|
||||||
c = @data_stream[@tell - 1]
|
|
||||||
case c
|
case c
|
||||||
when 0xC2 .. 0xDF
|
when 0x01 .. 0x7F
|
||||||
if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
|
if c == 0x0D
|
||||||
@tell += 1
|
# normalize newlines
|
||||||
@data_stream[@tell-2..@tell-1]
|
@tell += 1 if @data_stream[@tell] == 0x0A
|
||||||
else
|
c = 0x0A
|
||||||
[0xFFFD].pack('U')
|
|
||||||
end
|
end
|
||||||
when 0xE0 .. 0xEF
|
|
||||||
if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
|
# record where newlines occur so that the position method
|
||||||
@tell += 2
|
# can tell where it is
|
||||||
@data_stream[@tell-3..@tell-1]
|
@new_lines << @tell-1 if c == 0x0A
|
||||||
|
|
||||||
|
c.chr
|
||||||
|
|
||||||
|
when 0x80 .. 0xBF
|
||||||
|
if !@win1252
|
||||||
|
[0xFFFD].pack('U') # invalid utf-8
|
||||||
|
elsif c <= 0x9f
|
||||||
|
[ENTITIES_WINDOWS1252[c-0x80]].pack('U')
|
||||||
else
|
else
|
||||||
[0xFFFD].pack('U')
|
"\xC2" + c.chr # convert to utf-8
|
||||||
end
|
end
|
||||||
when 0xF0 .. 0xF3
|
|
||||||
if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
|
when 0xC0 .. 0xFF
|
||||||
@tell += 3
|
if @win1252
|
||||||
@data_stream[@tell-4..@tell-1]
|
"\xC3" + (c-64).chr # convert to utf-8
|
||||||
|
elsif @data_stream[@tell-1 .. -1] =~ /^
|
||||||
|
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||||
|
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||||
|
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||||
|
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
||||||
|
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
||||||
|
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
||||||
|
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
||||||
|
)/x
|
||||||
|
@tell += $1.length - 1
|
||||||
|
$1
|
||||||
else
|
else
|
||||||
[0xFFFD].pack('U')
|
[0xFFFD].pack('U') # invalid utf-8
|
||||||
end
|
end
|
||||||
|
|
||||||
|
when 0x00
|
||||||
|
[0xFFFD].pack('U') # null characters are invalid
|
||||||
|
|
||||||
else
|
else
|
||||||
begin
|
:EOF
|
||||||
c.chr
|
|
||||||
rescue
|
|
||||||
:EOF
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -247,28 +253,19 @@ module HTML5lib
|
||||||
def chars_until(characters, opposite=false)
|
def chars_until(characters, opposite=false)
|
||||||
char_stack = [char]
|
char_stack = [char]
|
||||||
|
|
||||||
unless char_stack[0] == :EOF
|
while char_stack.last != :EOF
|
||||||
while (characters.include? char_stack[-1]) == opposite
|
break unless (characters.include?(char_stack.last)) == opposite
|
||||||
unless @queue.empty?
|
char_stack.push(char)
|
||||||
# First from the queue
|
|
||||||
char_stack.push(@queue.shift)
|
|
||||||
break if char_stack[-1] == :EOF
|
|
||||||
else
|
|
||||||
# Then the rest
|
|
||||||
begin
|
|
||||||
@tell += 1
|
|
||||||
char_stack.push(@data_stream[@tell-1].chr)
|
|
||||||
rescue
|
|
||||||
char_stack.push(:EOF)
|
|
||||||
break
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Put the character stopped on back to the front of the queue
|
# Put the character stopped on back to the front of the queue
|
||||||
# from where it came.
|
# from where it came.
|
||||||
@queue.insert(0, char_stack.pop)
|
c = char_stack.pop
|
||||||
|
if c == :EOF or @data_stream[@tell-1] == c[0]
|
||||||
|
@tell -= 1
|
||||||
|
else
|
||||||
|
@queue.insert(0, c)
|
||||||
|
end
|
||||||
return char_stack.join('')
|
return char_stack.join('')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -68,7 +68,6 @@ module HTML5lib
|
||||||
# to return we yield the token which pauses processing until the next token
|
# to return we yield the token which pauses processing until the next token
|
||||||
# is requested.
|
# is requested.
|
||||||
def each
|
def each
|
||||||
@stream.reset
|
|
||||||
@tokenQueue = []
|
@tokenQueue = []
|
||||||
# Start processing. When EOF is reached @state will return false
|
# Start processing. When EOF is reached @state will return false
|
||||||
# instead of true and the loop will terminate.
|
# instead of true and the loop will terminate.
|
||||||
|
|
|
@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5lib
|
||||||
|
|
||||||
def sanitize_xhtml stream
|
def sanitize_xhtml stream
|
||||||
XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
|
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
|
||||||
end
|
end
|
||||||
|
|
||||||
def sanitize_html stream
|
def sanitize_html stream
|
||||||
HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
|
HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
|
||||||
end
|
end
|
||||||
|
|
||||||
def sanitize_rexml stream
|
def sanitize_rexml stream
|
||||||
|
@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
|
||||||
sanitize_html("<p>𝒵 𝔸</p>")
|
sanitize_html("<p>𝒵 𝔸</p>")
|
||||||
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
|
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
|
||||||
sanitize_rexml("<p>𝒵 𝔸</p>")
|
sanitize_rexml("<p>𝒵 𝔸</p>")
|
||||||
|
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
|
||||||
|
sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
|
||||||
|
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
|
||||||
|
sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
26
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
26
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
|
@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5lib
|
||||||
|
|
||||||
def test_char_ascii
|
def test_char_ascii
|
||||||
stream = HTMLInputStream.new("'")
|
stream = HTMLInputStream.new("'", :encoding=>'ascii')
|
||||||
assert_equal('ascii', stream.char_encoding)
|
assert_equal('ascii', stream.char_encoding)
|
||||||
assert_equal("'", stream.char)
|
assert_equal("'", stream.char)
|
||||||
end
|
end
|
||||||
|
@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_char_utf8
|
def test_char_utf8
|
||||||
stream = HTMLInputStream.new("\xe2\x80\x98")
|
stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
|
||||||
assert_equal('utf-8', stream.char_encoding)
|
assert_equal('utf-8', stream.char_encoding)
|
||||||
assert_equal("\xe2\x80\x98", stream.char)
|
assert_equal("\xe2\x80\x98", stream.char)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_char_win1252
|
||||||
|
stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
|
||||||
|
assert_equal('windows-1252', stream.char_encoding)
|
||||||
|
assert_equal("\xc2\xa2", stream.char)
|
||||||
|
assert_equal("\xc3\x85", stream.char)
|
||||||
|
assert_equal("\xc3\xb1", stream.char)
|
||||||
|
assert_equal("\xe2\x80\x99", stream.char)
|
||||||
|
assert_equal("\xe2\x80\xa0", stream.char)
|
||||||
|
end
|
||||||
|
|
||||||
def test_bom
|
def test_bom
|
||||||
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
|
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
|
||||||
assert_equal('utf-8', stream.char_encoding)
|
assert_equal('utf-8', stream.char_encoding)
|
||||||
|
@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
||||||
begin
|
begin
|
||||||
require 'iconv'
|
require 'iconv'
|
||||||
|
|
||||||
def test_char_win1252
|
|
||||||
stream = HTMLInputStream.new("\x91")
|
|
||||||
assert_equal('windows-1252', stream.char_encoding)
|
|
||||||
assert_equal("\xe2\x80\x98", stream.char)
|
|
||||||
end
|
|
||||||
|
|
||||||
def test_utf_16
|
def test_utf_16
|
||||||
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
|
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
|
||||||
assert(stream.char_encoding, 'utf-16-le')
|
assert(stream.char_encoding, 'utf-16-le')
|
||||||
|
@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
|
||||||
assert_equal(0, stream.instance_eval {@tell})
|
assert_equal(0, stream.instance_eval {@tell})
|
||||||
assert_equal("a\nbb\n", stream.chars_until('c'))
|
assert_equal("a\nbb\n", stream.chars_until('c'))
|
||||||
assert_equal(6, stream.instance_eval {@tell})
|
assert_equal(6, stream.instance_eval {@tell})
|
||||||
assert_equal([3,1], stream.position)
|
assert_equal([3,0], stream.position)
|
||||||
assert_equal("ccc\ndddd", stream.chars_until('x'))
|
assert_equal("ccc\ndddd", stream.chars_until('x'))
|
||||||
assert_equal(14, stream.instance_eval {@tell})
|
assert_equal(14, stream.instance_eval {@tell})
|
||||||
assert_equal([4,5], stream.position)
|
assert_equal([4,4], stream.position)
|
||||||
assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
|
assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Add table
Reference in a new issue