Sync with Latest HTML5lib

Some more tweaks
This commit is contained in:
Jacques Distler 2007-06-06 08:12:03 -05:00
parent fd183eac04
commit 8846b2cda5
5 changed files with 48 additions and 27 deletions

View file

@ -59,7 +59,8 @@ module HTML5lib
begin begin
require 'iconv' require 'iconv'
uString = Iconv.iconv('utf-8', @char_encoding, uString)[0] uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
rescue rescue LoadError
rescue Exception
end end
end end
@ -206,21 +207,36 @@ module HTML5lib
unless @queue.empty? unless @queue.empty?
return @queue.shift return @queue.shift
else else
begin @tell += 1
@tell += 1 c = @data_stream[@tell - 1]
c = @data_stream[@tell - 1] case c
case c when 0xC2 .. 0xDF
when 0xC2 .. 0xDF if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
@tell += 1 @tell += 1
c.chr + @data_stream[@tell-1].chr @data_stream[@tell-2..@tell-1]
when 0xE0 .. 0xF0
@tell += 2
c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr
else else
c.chr [0xFFFD].pack('U')
end
when 0xE0 .. 0xEF
if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
@tell += 2
@data_stream[@tell-3..@tell-1]
else
[0xFFFD].pack('U')
end
when 0xF0 .. 0xF3
if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
@tell += 3
@data_stream[@tell-4..@tell-1]
else
[0xFFFD].pack('U')
end
else
begin
c.chr
rescue
:EOF
end end
rescue
return :EOF
end end
end end
end end

View file

@ -1,5 +1,4 @@
require 'html5lib/constants' require 'html5lib/constants'
require 'jcode'
module HTML5lib module HTML5lib
@ -309,7 +308,7 @@ class HTMLSerializer
if @quote_attr_values or v.empty? if @quote_attr_values or v.empty?
quote_attr = true quote_attr = true
else else
quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? {|c| v.include?(c)} quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end end
v = v.gsub("&", "&amp;") v = v.gsub("&", "&amp;")
if encoding if encoding

View file

@ -15,7 +15,7 @@ begin
rescue LoadError rescue LoadError
class JSON class JSON
def self.parse json def self.parse json
json.gsub! /"\s*:/, '"=>' json.gsub!(/"\s*:/, '"=>')
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')} json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
eval json eval json
end end

View file

@ -22,22 +22,28 @@ class HTMLInputStreamTest < Test::Unit::TestCase
assert_equal("\xe2\x80\x98", stream.char) assert_equal("\xe2\x80\x98", stream.char)
end end
def test_char_win1252
stream = HTMLInputStream.new("\x91")
assert_equal('windows-1252', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_bom def test_bom
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'") stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
assert_equal('utf-8', stream.char_encoding) assert_equal('utf-8', stream.char_encoding)
assert_equal("'", stream.char) assert_equal("'", stream.char)
end end
def test_utf_16 begin
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025) require 'iconv'
assert(stream.char_encoding, 'utf-16-le')
assert_equal(1025, stream.chars_until(' ',true).length) def test_char_win1252
stream = HTMLInputStream.new("\x91")
assert_equal('windows-1252', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_utf_16
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
assert(stream.char_encoding, 'utf-16-le')
assert_equal(1025, stream.chars_until(' ',true).length)
end
rescue LoadError
puts "iconv not found, skipping iconv tests"
end end
def test_newlines def test_newlines

View file

@ -11,7 +11,7 @@ class TokenizerTestParser
debug = nil debug = nil
for token in @tokenizer for token in @tokenizer
debug = token.inspect if token[:type] == :ParseError debug = token.inspect if token[:type] == :ParseError
send ('process' + token[:type].to_s), token send(('process' + token[:type].to_s), token)
end end
return @outputTokens return @outputTokens