Updated to Latest HTML5lib

Synced with latest HTML5lib.
Added some RDoc-compatible documentation to the sanitizer.
This commit is contained in:
Jacques Distler 2007-06-08 17:26:00 -05:00
parent 8badd0766a
commit 3bf560c3b3
7 changed files with 127 additions and 93 deletions

View file

@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
include HTML5lib
# Test helper: parses +stream+ as a fragment with XHTMLParser, using
# HTMLSanitizer as the tokenizer, joins the parse result into a single string
# and replaces every single quote with a double quote.
#
# NOTE(review): the two statements below differ only in the
# :encoding => 'utf-8' option. They look like the before/after lines of a diff
# whose +/- markers were lost -- confirm. As written, the first statement's
# value is discarded and only the second statement's value is returned.
def sanitize_xhtml stream
XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
end
# Test helper: parses +stream+ as a fragment with HTMLParser, using
# HTMLSanitizer as the tokenizer, joins the parse result into a single string
# and replaces every single quote with a double quote.
#
# NOTE(review): the two statements below differ only in the
# :encoding => 'utf-8' option. They look like the before/after lines of a diff
# whose +/- markers were lost -- confirm. As written, the first statement's
# value is discarded and only the second statement's value is returned.
def sanitize_html stream
HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
end
def sanitize_rexml stream
@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
sanitize_rexml("<p>&#x1d4b5; &#x1d538;</p>")
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
end
end

View file

@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib
# Expects a stream built from a lone apostrophe to report 'ascii' as its
# char_encoding and to hand the apostrophe back from #char.
#
# NOTE(review): +stream+ is assigned twice. The first assignment (no :encoding
# option) is overwritten before it is used; it looks like the pre-change line
# of a diff whose +/- markers were lost -- confirm.
def test_char_ascii
stream = HTMLInputStream.new("'")
stream = HTMLInputStream.new("'", :encoding=>'ascii')
assert_equal('ascii', stream.char_encoding)
assert_equal("'", stream.char)
end
@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
end
# Expects a stream built from the three bytes E2 80 98 to report 'utf-8' as
# its char_encoding and to return those same three bytes from a single #char
# call.
#
# NOTE(review): +stream+ is assigned twice. The first assignment (no :encoding
# option) is overwritten before it is used; it looks like the pre-change line
# of a diff whose +/- markers were lost -- confirm.
def test_char_utf8
stream = HTMLInputStream.new("\xe2\x80\x98")
stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
assert_equal('utf-8', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
# Builds a stream from five raw bytes (A2 C5 F1 92 86) without an :encoding
# option, expects char_encoding to be 'windows-1252', and then expects each
# successive #char call to return the listed multi-byte sequence, in order.
def test_char_win1252
  input = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
  assert_equal('windows-1252', input.char_encoding)

  # One expected sequence per input byte, in input order.
  expected_sequences = [
    "\xc2\xa2",
    "\xc3\x85",
    "\xc3\xb1",
    "\xe2\x80\x99",
    "\xe2\x80\xa0"
  ]

  # map + last keeps the method's value equal to the final assertion's result.
  expected_sequences.map { |sequence| assert_equal(sequence, input.char) }.last
end
def test_bom
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
assert_equal('utf-8', stream.char_encoding)
@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
begin
require 'iconv'
# Builds a stream from the single byte 0x91 without an :encoding option,
# expects char_encoding to be 'windows-1252' and expects #char to return the
# three bytes E2 80 98.
#
# NOTE(review): SOURCE contains a second test_char_win1252 (five-byte input);
# this one looks like the version the commit removed -- confirm.
def test_char_win1252
  input = HTMLInputStream.new("\x91")

  detected_encoding = input.char_encoding
  assert_equal('windows-1252', detected_encoding)

  first_char = input.char
  assert_equal("\xe2\x80\x98", first_char)
end
def test_utf_16
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
assert(stream.char_encoding, 'utf-16-le')
@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
assert_equal(0, stream.instance_eval {@tell})
assert_equal("a\nbb\n", stream.chars_until('c'))
assert_equal(6, stream.instance_eval {@tell})
assert_equal([3,1], stream.position)
assert_equal([3,0], stream.position)
assert_equal("ccc\ndddd", stream.chars_until('x'))
assert_equal(14, stream.instance_eval {@tell})
assert_equal([4,5], stream.position)
assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
assert_equal([4,4], stream.position)
assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
end
end