Updated to Latest HTML5lib

Synced with latest HTML5lib.
Added some RDoc-compatible documentation to the sanitizer.
This commit is contained in:
Jacques Distler 2007-06-08 17:26:00 -05:00
parent 8badd0766a
commit 3bf560c3b3
7 changed files with 127 additions and 93 deletions

View file

@ -29,6 +29,8 @@ module Sanitize
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def sanitize_xhtml(html, options = {})
@encoding = 'utf-8'
@treebuilder = TreeBuilders::REXML::TreeBuilder
@ -55,6 +57,8 @@ module Sanitize
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
# By default, the output is a string. But, optionally, you can return a REXML tree.
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def sanitize_html(html, options = {})
@encoding = 'utf-8'
@treebuilder = TreeBuilders::REXML::TreeBuilder

View file

@ -2,6 +2,12 @@
class String
# Check whether a string is valid utf-8
#
# :call-seq:
# string.is_utf8? -> boolean
#
# returns true if the sequence of bytes in string is valid utf-8
def is_utf8?
self =~ /^(
[\x09\x0A\x0D\x20-\x7E] # ASCII
@ -2138,10 +2144,21 @@ class String
'zeetrf' => 'ℨ'
}
# Converts XHTML+MathML named entities to Numeric Character References
#
# :call-seq:
# string.to_ncr -> string
#
# Returns a new string; the receiver is not modified. The five predefined
# XML entities (lt, gt, amp, quot, apos) are left exactly as written. Every
# other reference of the form &name; (letters and digits only) is replaced
# by whatever String#convert_to_ncr returns for it.
def to_ncr
  gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
    # Capture group 1 is only set for the predefined XML entities.
    if Regexp.last_match(1)
      entity
    else
      entity.convert_to_ncr
    end
  end
end
# Converts XHTML+MathML named entities to Numeric Character References
#
# :call-seq:
# string.to_ncr! -> str or nil
#
# Substitution is done in-place. As with String#gsub!, the receiver is
# returned when at least one entity reference was matched, and nil when the
# string contains none. The predefined XML entities (lt, gt, amp, quot,
# apos) are matched but written back unchanged; every other &name; is
# replaced by the result of String#convert_to_ncr.
def to_ncr!
  gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/) do |entity|
    # Capture group 1 is only set for the predefined XML entities.
    next entity if Regexp.last_match(1)

    entity.convert_to_ncr
  end
end
@ -2159,6 +2176,14 @@ end
require 'rexml/element'
module REXML
class Element
# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
#
# :call-seq:
# elt.to_ncr -> REXML::Element
#
# REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
# access the resulting REXML document.
def to_ncr
XPath.each(self, '//*') { |el|
el.texts.each_index {|i|

View file

@ -41,14 +41,14 @@ module HTML5lib
super(parser, tree)
# for special handling of whitespace in <pre>
@processSpaceCharactersPre = false
@processSpaceCharactersDropNewline = false
end
def processSpaceCharactersPre(data)
def processSpaceCharactersDropNewline(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersPre = false
@processSpaceCharactersDropNewline = false
if (data.length > 0 and data[0] == ?\n and
@tree.openElements[-1].name == 'pre' and
%w[pre textarea].include?(@tree.openElements[-1].name) and
not @tree.openElements[-1].hasContent)
data = data[1..-1]
end
@ -56,8 +56,8 @@ module HTML5lib
end
def processSpaceCharacters(data)
if @processSpaceCharactersPre
processSpaceCharactersPre(data)
if @processSpaceCharactersDropNewline
processSpaceCharactersDropNewline(data)
else
super(data)
end
@ -98,7 +98,7 @@ module HTML5lib
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@processSpaceCharactersPre = true if name == 'pre'
@processSpaceCharactersDropNewline = true if name == 'pre'
end
def startTagForm(name, attributes)
@ -248,6 +248,7 @@ module HTML5lib
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
@processSpaceCharactersDropNewline = true
end
# iframe, noembed, noframes, noscript (if scripting enabled)
@ -312,7 +313,7 @@ module HTML5lib
def endTagBlock(name)
#Put us back in the right whitespace handling mode
@processSpaceCharactersPre = false if name == 'pre'
@processSpaceCharactersDropNewline = false if name == 'pre'
@tree.generateImpliedEndTags if in_scope?(name)

View file

@ -34,7 +34,7 @@ module HTML5lib
options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur
@new_lines = []
@new_lines = [0]
# Raw Stream
@raw_stream = open_stream(source)
@ -55,26 +55,28 @@ module HTML5lib
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
unless @char_encoding == 'utf-8'
if @char_encoding == 'windows-1252'
@win1252 = true
elsif @char_encoding != 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
rescue
@win1252 = true
end
rescue LoadError
rescue Exception
@win1252 = true
end
end
# Normalize newlines and null characters
uString.gsub!(/\r\n?/, "\n")
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = []
# Reset position in the list to read from
reset
@tell = 0
end
# Produces a file object from source.
@ -136,10 +138,10 @@ module HTML5lib
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf16le',
"\xfe\xff" => 'utf16be',
"\xff\xfe\x00\x00" => 'utf32le',
"\x00\x00\xfe\xff" => 'utf32be'
"\xff\xfe" => 'utf-16le',
"\xfe\xff" => 'utf-16be',
"\xff\xfe\x00\x00" => 'utf-32le',
"\x00\x00\xfe\xff" => 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
@ -175,68 +177,72 @@ module HTML5lib
return parser.get_encoding
end
def determine_new_lines
  # Scans the whole data stream once and records line-start offsets in
  # @new_lines, so that the position method can work out where it is.
  # Offset 0 is recorded first, followed by the index of every newline.
  @new_lines << 0
  (0...@data_stream.length).each do |offset|
    next unless @data_stream[offset] == ?\n

    @new_lines << offset
  end
end
# Returns (line, col) of the current position in the stream.
def position
# Generate list of new lines first time around
determine_new_lines if @new_lines.empty?
line = 0
tell = @tell
@new_lines.each do |pos|
break unless pos < tell
break unless pos < @tell
line += 1
end
col = tell - @new_lines[line-1] - 1
col = @tell - @new_lines[line-1] - 1
return [line, col]
end
# Resets the position in the stream back to the start.
#
# Only the read offset (@tell) is rewound to 0; no other instance state is
# modified by this method.
def reset
@tell = 0
end
# Read one character from the stream or queue if available. Return
# EOF when EOF is reached.
def char
unless @queue.empty?
return @queue.shift
else
c = @data_stream[@tell]
@tell += 1
c = @data_stream[@tell - 1]
case c
when 0xC2 .. 0xDF
if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
@tell += 1
@data_stream[@tell-2..@tell-1]
else
[0xFFFD].pack('U')
when 0x01 .. 0x7F
if c == 0x0D
# normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A
c = 0x0A
end
when 0xE0 .. 0xEF
if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
@tell += 2
@data_stream[@tell-3..@tell-1]
else
[0xFFFD].pack('U')
end
when 0xF0 .. 0xF3
if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
@tell += 3
@data_stream[@tell-4..@tell-1]
else
[0xFFFD].pack('U')
end
else
begin
# record where newlines occur so that the position method
# can tell where it is
@new_lines << @tell-1 if c == 0x0A
c.chr
rescue
:EOF
when 0x80 .. 0xBF
if !@win1252
[0xFFFD].pack('U') # invalid utf-8
elsif c <= 0x9f
[ENTITIES_WINDOWS1252[c-0x80]].pack('U')
else
"\xC2" + c.chr # convert to utf-8
end
when 0xC0 .. 0xFF
if @win1252
"\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)/x
@tell += $1.length - 1
$1
else
[0xFFFD].pack('U') # invalid utf-8
end
when 0x00
[0xFFFD].pack('U') # null characters are invalid
else
:EOF
end
end
end
@ -247,28 +253,19 @@ module HTML5lib
def chars_until(characters, opposite=false)
char_stack = [char]
unless char_stack[0] == :EOF
while (characters.include? char_stack[-1]) == opposite
unless @queue.empty?
# First from the queue
char_stack.push(@queue.shift)
break if char_stack[-1] == :EOF
else
# Then the rest
begin
@tell += 1
char_stack.push(@data_stream[@tell-1].chr)
rescue
char_stack.push(:EOF)
break
end
end
end
while char_stack.last != :EOF
break unless (characters.include?(char_stack.last)) == opposite
char_stack.push(char)
end
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, char_stack.pop)
c = char_stack.pop
if c == :EOF or @data_stream[@tell-1] == c[0]
@tell -= 1
else
@queue.insert(0, c)
end
return char_stack.join('')
end
end

View file

@ -68,7 +68,6 @@ module HTML5lib
# to return we yield the token which pauses processing until the next token
# is requested.
def each
@stream.reset
@tokenQueue = []
# Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate.

View file

@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
include HTML5lib
def sanitize_xhtml stream
XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
end
def sanitize_html stream
HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
end
def sanitize_rexml stream
@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
sanitize_rexml("<p>&#x1d4b5; &#x1d538;</p>")
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
end
end

View file

@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib
def test_char_ascii
stream = HTMLInputStream.new("'")
stream = HTMLInputStream.new("'", :encoding=>'ascii')
assert_equal('ascii', stream.char_encoding)
assert_equal("'", stream.char)
end
@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
end
def test_char_utf8
stream = HTMLInputStream.new("\xe2\x80\x98")
stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
assert_equal('utf-8', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_char_win1252
  # No encoding is passed in; the stream is expected to report windows-1252
  # and to hand back each input byte re-encoded as UTF-8.
  stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
  assert_equal('windows-1252', stream.char_encoding)

  expected_chars = [
    "\xc2\xa2",     # 0xA2 -> U+00A2
    "\xc3\x85",     # 0xC5 -> U+00C5
    "\xc3\xb1",     # 0xF1 -> U+00F1
    "\xe2\x80\x99", # 0x92 -> U+2019 (windows-1252 specific mapping)
    "\xe2\x80\xa0"  # 0x86 -> U+2020 (windows-1252 specific mapping)
  ]
  expected_chars.each do |utf8|
    assert_equal(utf8, stream.char)
  end
end
def test_bom
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
assert_equal('utf-8', stream.char_encoding)
@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
begin
require 'iconv'
def test_char_win1252
stream = HTMLInputStream.new("\x91")
assert_equal('windows-1252', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_utf_16
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
assert(stream.char_encoding, 'utf-16-le')
@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
assert_equal(0, stream.instance_eval {@tell})
assert_equal("a\nbb\n", stream.chars_until('c'))
assert_equal(6, stream.instance_eval {@tell})
assert_equal([3,1], stream.position)
assert_equal([3,0], stream.position)
assert_equal("ccc\ndddd", stream.chars_until('x'))
assert_equal(14, stream.instance_eval {@tell})
assert_equal([4,5], stream.position)
assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
assert_equal([4,4], stream.position)
assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
end
end