a6429f8c22
Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
701 lines
20 KiB
Ruby
Executable file
701 lines
20 KiB
Ruby
Executable file
require 'stringio'
|
|
require 'html5/constants'
|
|
|
|
module HTML5
|
|
|
|
# Provides a unicode stream of characters to the HTMLTokenizer.
|
|
|
|
# This class takes care of character encoding and removing or replacing
|
|
# incorrect byte-sequences and also provides column and line tracking.
|
|
|
|
class HTMLInputStream
|
|
|
|
attr_accessor :queue, :char_encoding, :errors
|
|
|
|
# Initialises the HTMLInputStream.
|
|
#
|
|
# HTMLInputStream.new(source, options = {}) -> Normalized stream from source
|
|
# for use by the HTML5Lib.
|
|
#
|
|
# source can be either an IO-like object (anything responding to #read) or a string.
|
|
#
|
|
# The optional :encoding option must be a string that indicates
|
|
# the encoding. If specified, that encoding will be used,
|
|
# regardless of any BOM or later declaration (such as in a meta
|
|
# element)
|
|
#
|
|
# :parse_meta option - Look for a <meta> element containing encoding information
|
|
|
|
def initialize(source, options = {})
|
|
@encoding = nil
|
|
@parse_meta = true
|
|
@chardet = true
|
|
|
|
options.each {|name, value| instance_variable_set("@#{name}", value) }
|
|
|
|
# partial Ruby 1.9 support
|
|
if @encoding and source.respond_to? :force_encoding
|
|
source.force_encoding(@encoding) rescue nil
|
|
end
|
|
|
|
# Raw Stream
|
|
@raw_stream = open_stream(source)
|
|
|
|
# Encoding Information
|
|
#Number of bytes to use when looking for a meta element with
|
|
#encoding information
|
|
@NUM_BYTES_META = 512
|
|
#Number of bytes to use when using detecting encoding using chardet
|
|
@NUM_BYTES_CHARDET = 256
|
|
#Number of bytes to use when reading content
|
|
@NUM_BYTES_BUFFER = 1024
|
|
|
|
#Encoding to use if no other information can be found
|
|
@DEFAULT_ENCODING = 'windows-1252'
|
|
|
|
#Detect encoding iff no explicit "transport level" encoding is supplied
|
|
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
|
|
@char_encoding = detect_encoding
|
|
else
|
|
@char_encoding = @encoding
|
|
end
|
|
|
|
# Read bytes from stream decoding them into Unicode
|
|
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
|
|
if @char_encoding == 'windows-1252'
|
|
@win1252 = true
|
|
elsif @char_encoding != 'utf-8'
|
|
require 'iconv'
|
|
begin
|
|
@buffer << @raw_stream.read unless @raw_stream.eof?
|
|
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
|
rescue
|
|
@win1252 = true
|
|
end
|
|
end
|
|
|
|
@queue = []
|
|
@errors = []
|
|
|
|
# Reset position in the list to read from
|
|
@tell = 0
|
|
@line = @col = 0
|
|
@line_lengths = []
|
|
end
|
|
|
|
# Produces a file object from source.
|
|
#
|
|
# source can be either a file object or a string; a string is used as the content itself and is never opened as a filename.
|
|
def open_stream(source)
  # Anything that already responds to #read is handed back untouched;
  # everything else is treated as string data and wrapped in a StringIO.
  source.respond_to?(:read) ? source : StringIO.new(source)
end
|
|
|
|
# Works out the character encoding of @raw_stream.
#
# Sources are tried in order: byte order mark, <meta> declaration (only when
# @parse_meta is set), the optional chardet gem (only when @chardet is set),
# and finally @DEFAULT_ENCODING. 'iso-8859-1' is reported as 'windows-1252'.
#
# Returns the encoding name as a String.
def detect_encoding
  # First look for a BOM
  # This will also read past the BOM if present
  encoding = detect_bom

  # If there is no BOM need to look for meta elements with encoding
  # information
  encoding = detect_encoding_meta if encoding.nil? && @parse_meta

  # Guess with chardet, if available
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      buffers = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if !buffer or buffer.empty?
        buffers << buffer
        detector.feed(buffer)
        break if detector.instance_eval { @done }
        # Integer rather than Fixnum: Fixnum was folded into Integer in
        # Ruby 2.4 and removed in 3.2, where it raises NameError.
        detector.instance_eval {
          @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
        }
      end
      detector.close
      encoding = detector.result['encoding']
      # Hand everything consumed for detection back to the stream.
      seek(buffers * '', 0)
    rescue LoadError
      # chardet is optional; without it we fall through to the default
    end
  end

  # If all else fails use the default encoding
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute for equivalent encoding
  encoding = 'windows-1252' if 'iso-8859-1' == encoding.downcase

  encoding
end
|
|
|
|
# Attempts to detect a BOM at the start of the stream. If
|
|
# an encoding can be determined from the BOM return the name of the
|
|
# encoding otherwise return nil
|
|
# Reads up to four bytes from @raw_stream and matches them against the known
# byte order marks. Afterwards the stream is repositioned (via #seek) just
# past the BOM when one was found, or back to where it started otherwise.
#
# Returns the encoding name, or nil when there is no BOM or no data.
def detect_bom
  # Signatures are compared as byte values, not as String keys: on Ruby 1.9+
  # a literal such as "\xff\xfe" carries the source encoding and never
  # equals the binary string returned by read(4).
  # Order matters: UTF-32LE starts with the UTF-16LE signature, so the
  # four-byte marks must be tried before the two-byte ones.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE],             'utf-16le'],
    [[0xFE, 0xFF],             'utf-16be']
  ]

  string = @raw_stream.read(4)
  return nil unless string

  bytes = string.unpack('C*')
  signature, encoding = signatures.find do |sig, _name|
    bytes[0, sig.length] == sig
  end

  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  seek(string, signature ? signature.length : 0)

  encoding
end
|
|
|
|
# Makes buffer[n..-1] readable from @raw_stream again. +buffer+ is data that
# was just read from the stream for sniffing purposes.
#
# Three strategies, in order:
# 1. the stream already supports #unget (it was wrapped by an earlier call);
# 2. the stream supports #seek -- note this seeks to the absolute offset +n+,
#    so it presumes +buffer+ was read from the very start of the stream;
# 3. otherwise the stream is wrapped in a SimpleDelegator that keeps a
#    push-back buffer and serves reads from it first.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # pipes and sockets cannot seek: fall through to the wrapper below
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Serves pushed-back data first, then the underlying stream.
    def read(chars=-1)
      if chars == -1 or chars > @data.length
        # The request outruns the push-back buffer: drain it and top up
        # from the real stream.
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read.to_s if chars == -1
        return result + __getobj__.read(chars-result.length).to_s
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Slice from index 0: starting at 1 silently dropped the first
        # pushed-back character.
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Pushes +data+ back so that the next read returns it first.
    def unget(data)
      if !@data or @data.empty?
        @data = data
      else
        # Whatever is still pending comes *after* the data being returned,
        # so the new data goes in front.
        @data = data + @data
      end
    end

    # The wrapper is only exhausted once the push-back buffer is empty too.
    def eof?
      (@data.nil? || @data.empty?) && __getobj__.eof?
    end
  end

  @raw_stream.unget(buffer[n .. -1])
end
|
|
|
|
# Report the encoding declared by the meta element
|
|
# Sniffs the first @NUM_BYTES_META bytes for a <meta> encoding declaration
# and then hands those bytes back to the stream.
#
# Returns whatever EncodingParser#get_encoding reports (nil when nothing
# usable was declared).
def detect_encoding_meta
  # read returns nil on an exhausted stream; normalise to '' so that the
  # push-back in #seek never has to slice nil.
  buffer = @raw_stream.read(@NUM_BYTES_META) || ''
  parser = EncodingParser.new(buffer)
  seek(buffer, 0)
  parser.get_encoding
end
|
|
|
|
# Returns (line, col) of the current position in the stream.
|
|
def position
  # Start from the reader's own counters and walk backwards over every
  # character that is still waiting in the unget queue.
  row = @line
  column = @col

  if @queue && @queue.last != :EOF
    @queue.reverse_each do |queued|
      unless queued == "\n"
        column -= 1
        next
      end

      # Un-reading a newline moves to the end of the previous line; that is
      # only consistent when we are at the very start of the current one.
      row -= 1
      raise RuntimeError.new("col=#{column}") unless column == 0
      column = @line_lengths[row]
    end
  end

  # Lines are reported 1-based, columns 0-based.
  [row + 1, column]
end
|
|
|
|
# Read one character from the stream or queue if available. Return
|
|
# EOF when EOF is reached.
|
|
# Returns the next character of the stream as a String, or :EOF once the
# input is exhausted. Characters pushed back with #unget are served first.
#
# CR and CRLF are normalised to "\n", NUL is replaced by U+FFFD (and
# "null-character" is recorded in @errors), and @line / @col /
# @line_lengths are updated on the way.
#
# Two code paths exist because String#[] returns an Integer byte on Ruby 1.8
# and a one-character String on Ruby 1.9+.
def char
  return @queue.shift unless @queue.empty?

  # Refill while fewer than three unread units remain and the stream still
  # has data.
  if @tell + 3 > @buffer.length && !@raw_stream.eof?
    # read next block; #read may return nil at end of input, hence to_s
    @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER).to_s
    @tell = 0
  end

  c = @buffer[@tell]
  @tell += 1

  case c

  when String
    # partial Ruby 1.9 support
    # NOTE(review): single-byte values 0x80-0xFF are mapped here as
    # Windows-1252 / ISO-8859-1 without consulting @win1252.
    case c
    when "\0"
      @errors.push("null-character")
      c = "\uFFFD" # null characters are invalid
    when "\r"
      # normalize newlines
      @tell += 1 if @buffer[@tell] == "\n"
      c = "\n"
    when "\x80" .. "\x9F"
      c = ENTITIES_WINDOWS1252[c.ord-0x80].chr('utf-8')
    when "\xA0" .. "\xFF"
      if c.encoding == Encoding::ASCII_8BIT
        c = c.encode('utf-8','iso-8859-1')
      end
    end

    # update position in stream
    if c == "\x0a"
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c

  when 0x01..0x7F
    if c == 0x0D
      # normalize newlines
      @tell += 1 if @buffer[@tell] == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0a
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    if !@win1252
      [0xFFFD].pack('U') # invalid utf-8
    elsif c <= 0x9f
      [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
    else
      "\xC2" + c.chr # convert to utf-8
    end

  when 0xC0..0xFF
    if instance_variable_defined?("@win1252") && @win1252
      "\xC3" + (c - 64).chr # convert to utf-8
    # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
    # The n flag pins the pattern to binary; without it the byte escapes are
    # rejected as an "invalid multibyte escape" in a UTF-8 source file.
    elsif @buffer[@tell - 1..@tell + 3] =~ /^
        ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
        |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
        )/xn
      @tell += $1.length - 1
      $1
    else
      [0xFFFD].pack('U') # invalid utf-8
    end

  when 0x00
    @errors.push("null-character")
    [0xFFFD].pack('U') # null characters are invalid

  else
    # nil: we ran off the end of the buffer
    :EOF
  end
end
|
|
|
|
# Returns a string of characters from the stream up to but not
|
|
# including any character in characters or EOF. characters can be
|
|
# any container that responds to include?.
|
|
def chars_until(characters, opposite=false)
  # Consume characters while membership in +characters+ equals +opposite+:
  # by default that means "until one of them shows up"; with opposite=true
  # it means "for as long as they keep coming".
  collected = []
  current = char

  until current == :EOF || characters.include?(current) != opposite
    collected << current
    current = char
  end

  # The character we stopped on was not consumed by the caller, so it goes
  # back to the front of the queue (EOF is never queued).
  @queue.insert(0, current) unless current == :EOF
  collected.join('')
end
|
|
|
|
# Pushes +characters+ back onto the front of the queue so that #char returns
# them again, in order. :EOF is ignored.
def unget(characters)
  return if characters == :EOF

  # Array-like input (and Strings on Ruby 1.8, which respond to to_a) goes
  # in as one batch.
  return @queue.unshift(*characters.to_a) if characters.respond_to?(:to_a)

  # Otherwise walk the string backwards, so that unshifting one character at
  # a time leaves them in their original order.
  characters.reverse.each_char { |ch| @queue.unshift(ch) }
end
|
|
end
|
|
|
|
# String-like object with an associated position and various extra methods
|
|
# If the position is ever greater than the string length then an exception is raised
|
|
class EncodingBytes < String
  # Index of the current byte; starts at -1, i.e. before the first byte.
  attr_accessor :position

  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the position one step at a time, yielding the element found
  # there. An EOF raised inside the block simply ends the iteration.
  def each
    until @position >= length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # The byte under the cursor as a one-character String.
  # Raises EOF once the cursor is at or beyond the end.
  def current_byte
    raise EOF if @position >= length
    self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars=SPACE_CHARACTERS)
    @position += 1 while chars.include?(current_byte)
  end

  # Checks whether +bytes+ occurs exactly at the cursor (after downcasing
  # the candidate when +lower+ is set). On a match the cursor moves to the
  # byte after it and true is returned; otherwise the cursor stays put and
  # false is returned.
  def match_bytes(bytes, lower=false)
    stop = position + bytes.length
    candidate = self[position...stop]
    candidate.downcase! if lower
    return false unless candidate == bytes

    @position = stop
    true
  end

  # Searches forward from the cursor for +bytes+ and parks the cursor on the
  # last byte of the first occurrence. Raises EOF when there is none.
  def jump_to(bytes)
    offset = self[position..-1].index(bytes)
    raise EOF unless offset

    @position += offset + bytes.length - 1
    true
  end

  # Moves the cursor forward until it rests on one of +byte_list+
  # (current_byte raises EOF if the end is reached first).
  def find_next(byte_list)
    @position += 1 until byte_list.include?(current_byte)
  end
end
|
|
|
|
# Mini parser for detecting character encoding from meta elements
|
|
class EncodingParser
  # data - the leading bytes of the document (coerced with #to_s)
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Prefixes are tried in this order at every position, so the longer, more
  # specific ones must come before the bare '<'.
  @@method_dispatch = [
    ['<!--', :handle_comment],
    ['<meta', :handle_meta],
    ['</', :handle_possible_end_tag],
    ['<!', :handle_other],
    ['<?', :handle_other],
    ['<', :handle_possible_start_tag]
  ]

  # Scans the data for an encoding declaration.
  #
  # Returns the stripped encoding name, or nil when none was found. Any
  # UTF-16 / UTF-32 declaration is reported as 'utf-8'.
  def get_encoding
    @data.each do |_byte|
      keep_parsing = true
      @@method_dispatch.each do |(key, method)|
        if @data.match_bytes(key, true)
          # Handlers return false once an encoding has been settled on.
          keep_parsing = send(method)
          break
        end
      end
      break unless keep_parsing
    end

    unless @encoding.nil?
      @encoding = @encoding.strip
      if ["UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE"].include?(@encoding.upcase)
        @encoding = 'utf-8'
      end
    end

    @encoding
  end

  # Skip over comments
  def handle_comment
    @data.jump_to('-->')
  end

  # Inspects the attributes of a <meta element. Returns false (stop parsing)
  # once a valid encoding has been stored in @encoding, true otherwise.
  def handle_meta
    # if we have <meta not followed by a space so just keep going
    return true unless SPACE_CHARACTERS.include?(@data.current_byte)

    # We have a valid meta element we want to search for attributes
    while true
      # Try to find the next attribute after the current position
      attr = get_attribute

      return true if attr.nil?

      if attr[0] == 'charset'
        tentative_encoding = attr[1]
        if HTML5.is_valid_encoding(tentative_encoding)
          @encoding = tentative_encoding
          return false
        end
      elsif attr[0] == 'content'
        content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentative_encoding = content_parser.parse
        if HTML5.is_valid_encoding(tentative_encoding)
          @encoding = tentative_encoding
          return false
        end
      end
    end
  end

  def handle_possible_start_tag
    handle_possible_tag(false)
  end

  def handle_possible_end_tag
    @data.position += 1
    handle_possible_tag(true)
  end

  # Skips over a start or end tag together with its attributes.
  # Always returns true (keep parsing).
  def handle_possible_tag(end_tag)
    unless ASCII_LETTERS.include?(@data.current_byte)
      # If the next byte is not an ascii letter either ignore this
      # fragment (possible start tag case) or treat it according to
      # handle_other
      if end_tag
        @data.position -= 1
        handle_other
      end
      return true
    end

    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # return to the first step in the overall "two step" algorithm
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      {} until get_attribute.nil?
    end
    true
  end

  # Skips to the next '>'.
  def handle_other
    @data.jump_to('>')
  end

  # Return a [name, value] pair for the next attribute in the stream,
  # if one is found, or nil. Names and values are ASCII-lowercased.
  # EOF raised by the underlying EncodingBytes propagates to the caller.
  def get_attribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.current_byte == '<'
      @data.position -= 1
      return nil
    elsif @data.current_byte == '>'
      return nil
    end

    attr_name = []
    attr_value = []
    space_found = false

    # Step 5 attribute name
    while true
      byte = @data.current_byte
      # '=' only terminates a name that has at least one character. The old
      # test `and attr_name` was always true because an Array is truthy.
      if byte == '=' && !attr_name.empty?
        break
      elsif SPACE_CHARACTERS.include?(byte)
        space_found = true
        break
      elsif ['/', '<', '>'].include?(byte)
        return [attr_name.join(''), '']
      else
        attr_name.push(fold_case(byte))
      end
      # Step 6
      @data.position += 1
    end

    # Step 7
    if space_found
      @data.skip
      # Step 8
      unless @data.current_byte == '='
        @data.position -= 1
        return [attr_name.join(''), '']
      end
    end

    # XXX need to advance position in both spaces and value case
    # Step 9
    @data.position += 1
    # Step 10
    @data.skip

    # Step 11
    first = @data.current_byte
    if ["'", '"'].include?(first)
      # 11.1 quoted value: collect everything up to the matching quote
      quote_char = first
      while true
        @data.position += 1
        byte = @data.current_byte
        # 11.3
        if byte == quote_char
          @data.position += 1
          return [attr_name.join(''), attr_value.join('')]
        end
        # 11.4 / 11.5
        attr_value.push(fold_case(byte))
      end
    elsif ['>', '<'].include?(first)
      return [attr_name.join(''), '']
    else
      attr_value.push(fold_case(first))
    end

    # Unquoted value: runs until whitespace or a tag delimiter
    while true
      @data.position += 1
      byte = @data.current_byte
      if (SPACE_CHARACTERS + ['>', '<']).include?(byte)
        return [attr_name.join(''), attr_value.join('')]
      end
      attr_value.push(fold_case(byte))
    end
  end

  private

  # Lowercases ASCII uppercase letters and returns anything else unchanged.
  def fold_case(byte)
    ASCII_UPPERCASE.include?(byte) ? byte.downcase : byte
  end
end
|
|
|
|
# Extracts the charset parameter from the value of a
# <meta content="..."> attribute.
class ContentAttrParser
  # data - an EncodingBytes-like cursor over the attribute value
  def initialize(data)
    @data = data
  end

  # Returns the charset value found after the first ";" (quoted or
  # unquoted), or nil when the value has no usable charset parameter.
  def parse
    # Skip to the first ";"
    @data.position = 0
    @data.jump_to(';')
    @data.position += 1
    @data.skip

    # Check if the attr name is charset
    # otherwise return
    @data.jump_to('charset')
    @data.position += 1
    @data.skip

    # If there is no = sign keep looking for attrs
    return nil unless @data.current_byte == '='

    @data.position += 1
    @data.skip

    opener = @data.current_byte
    if ['"', "'"].include?(opener)
      # Look for an encoding between matching quote marks
      @data.position += 1
      start = @data.position
      @data.jump_to(opener)
      @data[start...@data.position]
    else
      # Unquoted value
      start = @data.position
      begin
        @data.find_next(SPACE_CHARACTERS)
        @data[start...@data.position]
      rescue EOF
        # Return the whole remaining value
        @data[start..-1]
      end
    end
  rescue EOF
    # Ran off the end before finding a complete charset parameter
    nil
  end
end
|
|
|
|
# Determine if a string is a supported encoding
|
|
def self.is_valid_encoding(encoding)
  # Only Strings can name an encoding; nil and everything else is rejected.
  return false unless encoding.kind_of?(String)

  # Names are matched against ENCODINGS lowercased and stripped of
  # surrounding whitespace.
  ENCODINGS.include?(encoding.downcase.strip)
end
|
|
|
|
end
|