Sync with latest HTML5lib

This commit is contained in:
Jacques Distler 2007-06-22 03:12:08 -05:00
parent bf572e295f
commit 8e92e4a3ab
41 changed files with 1334 additions and 564 deletions

View file

@ -20,20 +20,43 @@ module HTML5lib
when :EmptyTag when :EmptyTag
if token[:name].downcase == "meta" if token[:name].downcase == "meta"
if token[:data].any? {|name,value| name=='charset'} # replace charset with actual encoding
# replace charset with actual encoding token[:data].each_with_index do |(name,value),index|
attrs=Hash[*token[:data].flatten] if name == 'charset'
attrs['charset'] = @encoding token[:data][index][1]=@encoding
token[:data] = attrs.to_a.sort meta_found = true
meta_found = true end
end
# replace charset with actual encoding
has_http_equiv_content_type = false
content_index = -1
token[:data].each_with_index do |(name,value),i|
if name.downcase == 'charset'
token[:data][i] = ['charset', @encoding]
meta_found = true
break
elsif name == 'http-equiv' and value.downcase == 'content-type'
has_http_equiv_content_type = true
elsif name == 'content'
content_index = i
end
end
if not meta_found
if has_http_equiv_content_type and content_index >= 0
token[:data][content_index][1] =
'text/html; charset=%s' % @encoding
meta_found = true
end
end end
elsif token[:name].downcase == "head" and not meta_found elsif token[:name].downcase == "head" and not meta_found
# insert meta into empty head # insert meta into empty head
yield({:type => :StartTag, :name => "head", :data => {}}) yield(:type => :StartTag, :name => "head", :data => token[:data])
yield({:type => :EmptyTag, :name => "meta", yield(:type => :EmptyTag, :name => "meta",
:data => {"charset" => @encoding}}) :data => [["charset", @encoding]])
yield({:type => :EndTag, :name => "head"}) yield(:type => :EndTag, :name => "head")
meta_found = true meta_found = true
next next
end end
@ -42,8 +65,8 @@ module HTML5lib
if token[:name].downcase == "head" and pending.any? if token[:name].downcase == "head" and pending.any?
# insert meta into head (if necessary) and flush pending queue # insert meta into head (if necessary) and flush pending queue
yield pending.shift yield pending.shift
yield({:type => :EmptyTag, :name => "meta", yield(:type => :EmptyTag, :name => "meta",
:data => {"charset" => @encoding}}) if not meta_found :data => [["charset", @encoding]]) if not meta_found
yield pending.shift while pending.any? yield pending.shift while pending.any?
meta_found = true meta_found = true
state = :post_head state = :post_head

View file

@ -62,7 +62,8 @@ module HTML5lib
@errors = [] @errors = []
@tokenizer = @tokenizer.class unless Class === @tokenizer @tokenizer = @tokenizer.class unless Class === @tokenizer
@tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML) @tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML)
if innerHTML if innerHTML
case @innerHTML = container.downcase case @innerHTML = container.downcase
@ -99,10 +100,13 @@ module HTML5lib
case token[:type] case token[:type]
when :Characters, :SpaceCharacters, :Comment when :Characters, :SpaceCharacters, :Comment
@phase.send method, token[:data] @phase.send method, token[:data]
when :StartTag, :Doctype when :StartTag
@phase.send method, token[:name], token[:data] @phase.send method, token[:name], token[:data]
when :EndTag when :EndTag
@phase.send method, token[:name] @phase.send method, token[:name]
when :Doctype
@phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct]
else else
parseError(token[:data]) parseError(token[:data])
end end
@ -147,10 +151,6 @@ module HTML5lib
raise ParseError if @strict raise ParseError if @strict
end end
# This error is not an error
def atheistParseError
end
# HTML5 specific normalizations to the token stream # HTML5 specific normalizations to the token stream
def normalizeToken(token) def normalizeToken(token)
@ -160,9 +160,7 @@ module HTML5lib
# element. If it matches a void element atheists did the wrong # element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone. # thing and if it doesn't it's wrong for everyone.
if VOID_ELEMENTS.include?(token[:name]) unless VOID_ELEMENTS.include?(token[:name])
atheistParseError
else
parseError(_('Solidus (/) incorrectly placed in tag.')) parseError(_('Solidus (/) incorrectly placed in tag.'))
end end

View file

@ -5,7 +5,7 @@ module HTML5lib
handle_start 'html', 'head' handle_start 'html', 'head'
handle_end 'html' handle_end %w( html head body br ) => 'ImplyHead'
def processEOF def processEOF
startTagHead('head', {}) startTagHead('head', {})
@ -28,7 +28,7 @@ module HTML5lib
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def endTagHtml(name) def endTagImplyHead(name)
startTagHead('head', {}) startTagHead('head', {})
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end

View file

@ -5,15 +5,20 @@ module HTML5lib
# http://www.whatwg.org/specs/web-apps/current-work/#in-body # http://www.whatwg.org/specs/web-apps/current-work/#in-body
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image' handle_start 'html'
handle_start %w( base link meta script style ) => 'ProcessInHead'
handle_start 'title'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object ) handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead' handle_start 'input', 'textarea', 'select', 'isindex', %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP' handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting' handle_start %w( b big em font i s small strike strong tt u ) => 'Formatting'
handle_start 'nobr'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting' handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
@ -33,7 +38,9 @@ module HTML5lib
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced' handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None' handle_end 'br'
handle_end %w( area basefont bgsound embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp' handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
@ -73,11 +80,11 @@ module HTML5lib
@tree.insertText(data) @tree.insertText(data)
end end
def startTagScriptStyle(name, attributes) def startTagProcessInHead(name, attributes)
@parser.phases[:inHead].processStartTag(name, attributes) @parser.phases[:inHead].processStartTag(name, attributes)
end end
def startTagFromHead(name, attributes) def startTagTitle(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved.")) @parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes) @parser.phases[:inHead].processStartTag(name, attributes)
end end
@ -120,7 +127,12 @@ module HTML5lib
@tree.openElements.reverse.each_with_index do |node, i| @tree.openElements.reverse.each_with_index do |node, i|
if stopName.include?(node.name) if stopName.include?(node.name)
(i + 1).times { @tree.openElements.pop } poppedNodes = (0..i).collect { @tree.openElements.pop }
if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')])
end
break break
end end
@ -142,15 +154,19 @@ module HTML5lib
def startTagHeading(name, attributes) def startTagHeading(name, attributes)
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@parser.parseError(_("Unexpected start tag (#{name})."))
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) } # Uncomment the following for IE7 behavior:
# HEADING_ELEMENTS.each do |element|
break # if in_scope?(element)
end # @parser.parseError(_("Unexpected start tag (#{name})."))
end #
# remove_open_elements_until do |element|
# HEADING_ELEMENTS.include?(element.name)
# end
#
# break
# end
# end
@tree.insertElement(name, attributes) @tree.insertElement(name, attributes)
end end
@ -170,6 +186,12 @@ module HTML5lib
addFormattingElement(name, attributes) addFormattingElement(name, attributes)
end end
# Start-tag handler for <nobr>: rebuilds the active formatting elements,
# closes a <nobr> that is still in scope by processing an end tag for it,
# then registers the new element as a formatting element.
def startTagNobr(name, attributes)
  @tree.reconstructActiveFormattingElements
  if in_scope?('nobr')
    # An already-open nobr is closed before the new one is added.
    processEndTag('nobr')
  end
  addFormattingElement(name, attributes)
end
def startTagButton(name, attributes) def startTagButton(name, attributes)
if in_scope?('button') if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).')) @parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
@ -497,6 +519,13 @@ module HTML5lib
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end end
# End-tag handler for </br>: reports a parse error and then treats the tag
# as a <br> element, i.e. inserts the element and immediately pops it off
# the stack of open elements again.
def endTagBr(name)
  message = _("Unexpected end tag (br). Treated as br element.")
  @parser.parseError(message)
  @tree.reconstructActiveFormattingElements
  @tree.insertElement(name, {})
  @tree.openElements.pop
end
def endTagNone(name) def endTagNone(name)
# This handles elements with no end tag. # This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag")) @parser.parseError(_("This tag (#{name}) has no end tag"))

View file

@ -5,7 +5,9 @@ module HTML5lib
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta ) handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head', 'html', %w( title style script ) handle_end 'head'
handle_end %w( html body br ) => 'ImplyAfterHead'
handle_end %w( title style script )
def processEOF def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name) if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@ -63,7 +65,11 @@ module HTML5lib
def startTagBaseLinkMeta(name, attributes) def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes) element = @tree.createElement(name, attributes)
appendToHead(element) if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
end end
def startTagOther(name, attributes) def startTagOther(name, attributes)
@ -80,7 +86,7 @@ module HTML5lib
@parser.phase = @parser.phases[:afterHead] @parser.phase = @parser.phases[:afterHead]
end end
def endTagHtml(name) def endTagImplyAfterHead(name)
anythingElse anythingElse
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end

View file

@ -89,10 +89,10 @@ module HTML5lib
def endTagOther(name) def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode.")) @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
@parser.insertFromTable = true @tree.insertFromTable = true
# Process the end tag in the "in body" mode # Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name) @parser.phases[:inBody].processEndTag(name)
@parser.insertFromTable = false @tree.insertFromTable = false
end end
protected protected

View file

@ -17,9 +17,95 @@ module HTML5lib
@tree.insertComment(data, @tree.document) @tree.insertComment(data, @tree.document)
end end
def processDoctype(name, error) def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Erroneous DOCTYPE.')) if error if name.downcase != 'html' or publicId or systemId
@parser.parseError(_('Erroneous DOCTYPE.'))
end
# XXX need to update DOCTYPE tokens
@tree.insertDoctype(name) @tree.insertDoctype(name)
publicId = publicId.to_s.upcase
if name.downcase != 'html'
# XXX quirks mode
else
if ["+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
"-//ietf//dtd html 2.0 level 1//en",
"-//ietf//dtd html 2.0 level 2//en",
"-//ietf//dtd html 2.0 strict level 1//en",
"-//ietf//dtd html 2.0 strict level 2//en",
"-//ietf//dtd html 2.0 strict//en",
"-//ietf//dtd html 2.0//en",
"-//ietf//dtd html 2.1e//en",
"-//ietf//dtd html 3.0//en",
"-//ietf//dtd html 3.0//en//",
"-//ietf//dtd html 3.2 final//en",
"-//ietf//dtd html 3.2//en",
"-//ietf//dtd html 3//en",
"-//ietf//dtd html level 0//en",
"-//ietf//dtd html level 0//en//2.0",
"-//ietf//dtd html level 1//en",
"-//ietf//dtd html level 1//en//2.0",
"-//ietf//dtd html level 2//en",
"-//ietf//dtd html level 2//en//2.0",
"-//ietf//dtd html level 3//en",
"-//ietf//dtd html level 3//en//3.0",
"-//ietf//dtd html strict level 0//en",
"-//ietf//dtd html strict level 0//en//2.0",
"-//ietf//dtd html strict level 1//en",
"-//ietf//dtd html strict level 1//en//2.0",
"-//ietf//dtd html strict level 2//en",
"-//ietf//dtd html strict level 2//en//2.0",
"-//ietf//dtd html strict level 3//en",
"-//ietf//dtd html strict level 3//en//3.0",
"-//ietf//dtd html strict//en",
"-//ietf//dtd html strict//en//2.0",
"-//ietf//dtd html strict//en//3.0",
"-//ietf//dtd html//en",
"-//ietf//dtd html//en//2.0",
"-//ietf//dtd html//en//3.0",
"-//metrius//dtd metrius presentational//en",
"-//microsoft//dtd internet explorer 2.0 html strict//en",
"-//microsoft//dtd internet explorer 2.0 html//en",
"-//microsoft//dtd internet explorer 2.0 tables//en",
"-//microsoft//dtd internet explorer 3.0 html strict//en",
"-//microsoft//dtd internet explorer 3.0 html//en",
"-//microsoft//dtd internet explorer 3.0 tables//en",
"-//netscape comm. corp.//dtd html//en",
"-//netscape comm. corp.//dtd strict html//en",
"-//o'reilly and associates//dtd html 2.0//en",
"-//o'reilly and associates//dtd html extended 1.0//en",
"-//spyglass//dtd html 2.0 extended//en",
"-//sq//dtd html 2.0 hotmetal + extensions//en",
"-//sun microsystems corp.//dtd hotjava html//en",
"-//sun microsystems corp.//dtd hotjava strict html//en",
"-//w3c//dtd html 3 1995-03-24//en",
"-//w3c//dtd html 3.2 draft//en",
"-//w3c//dtd html 3.2 final//en",
"-//w3c//dtd html 3.2//en",
"-//w3c//dtd html 3.2s draft//en",
"-//w3c//dtd html 4.0 frameset//en",
"-//w3c//dtd html 4.0 transitional//en",
"-//w3c//dtd html experimental 19960712//en",
"-//w3c//dtd html experimental 970421//en",
"-//w3c//dtd w3 html//en",
"-//w3o//dtd w3 html 3.0//en",
"-//w3o//dtd w3 html 3.0//en//",
"-//w3o//dtd w3 html strict 3.0//en//",
"-//webtechs//dtd mozilla html 2.0//en",
"-//webtechs//dtd mozilla html//en",
"-/w3c/dtd html 4.0 transitional/en",
"html"].include?(publicId) or
(systemId == nil and
["-//w3c//dtd html 4.01 frameset//EN",
"-//w3c//dtd html 4.01 transitional//EN"].include?(publicId)) or
(systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")
#XXX quirks mode
end
end
@parser.phase = @parser.phases[:rootElement] @parser.phase = @parser.phases[:rootElement]
end end

View file

@ -101,7 +101,7 @@ module HTML5lib
@tree.insertComment(data, @tree.openElements[-1]) @tree.insertComment(data, @tree.openElements[-1])
end end
def processDoctype(name, error) def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.')) @parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
end end

View file

@ -33,9 +33,6 @@ module HTML5lib
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur
@new_lines = [0]
# Raw Stream # Raw Stream
@raw_stream = open_stream(source) @raw_stream = open_stream(source)
@ -77,6 +74,8 @@ module HTML5lib
# Reset position in the list to read from # Reset position in the list to read from
@tell = 0 @tell = 0
@line = @col = 0
@line_lengths = []
end end
# Produces a file object from source. # Produces a file object from source.
@ -112,7 +111,7 @@ module HTML5lib
require 'UniversalDetector' # gem install chardet require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding'] encoding = UniversalDetector::chardet(buffer)['encoding']
@raw_stream = open_stream(buffer) seek(buffer, 0)
rescue LoadError rescue LoadError
end end
end end
@ -122,7 +121,7 @@ module HTML5lib
encoding = @DEFAULT_ENCODING encoding = @DEFAULT_ENCODING
end end
#Substitute for equivalent encodings: #Substitute for equivalent encodings
encoding_sub = {'iso-8859-1' => 'windows-1252'} encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase) if encoding_sub.has_key?(encoding.downcase)
@ -145,7 +144,6 @@ module HTML5lib
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@raw_stream.seek(0)
string = @raw_stream.read(4) string = @raw_stream.read(4)
return nil unless string return nil unless string
@ -162,30 +160,80 @@ module HTML5lib
end end
end end
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
@raw_stream.seek(encoding ? seek : 0) seek(string, encoding ? seek : 0)
return encoding return encoding
end end
# Repositions @raw_stream so that the next read starts at offset +n+ of
# +buffer+. The visible callers pass the data they have just read from the
# stream as +buffer+.
#
# Three strategies are tried in order:
# 1. the stream already supports +unget+ (it was wrapped by an earlier call):
#    push buffer[n..-1] back;
# 2. the stream supports +seek+: seek to absolute offset +n+;
# 3. otherwise (no +seek+, or +seek+ raises Errno::ESPIPE): wrap the stream
#    in a SimpleDelegator that keeps pushed-back data in @data and serves
#    reads from it before touching the underlying stream.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # Stream cannot seek; fall through to the buffering wrapper below.
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Reads +chars+ characters (everything when -1), consuming pushed-back
    # data first and only then reading from the wrapped stream.
    def read(chars=-1)
      if chars == -1 or chars > @data.length
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read if chars == -1
        return result + __getobj__.read(chars-result.length)
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Fixed: start at index 0. The previous @data[1...chars] dropped the
        # first buffered character and returned one character too few.
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Pushes +data+ back so that it is returned by the next read.
    def unget(data)
      if !@data or @data.empty?
        @data = data
      else
        # Fixed: pushed-back data was read *before* whatever is still
        # buffered, so it has to go in front of it, not behind it.
        @data = data + @data
      end
    end
  end

  @raw_stream.unget(buffer[n .. -1])
end
# Report the encoding declared by the meta element # Report the encoding declared by the meta element
def detect_encoding_meta def detect_encoding_meta
parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META)) buffer = @raw_stream.read(@NUM_BYTES_META)
@raw_stream.seek(0) parser = EncodingParser.new(buffer)
seek(buffer, 0)
return parser.get_encoding return parser.get_encoding
end end
# Returns (line, col) of the current position in the stream. # Returns (line, col) of the current position in the stream.
def position def position
line = 0 line, col = @line, @col
@new_lines.each do |pos| @queue.reverse.each do |c|
break unless pos < @tell if c == "\n"
line += 1 line -= 1
raise RuntimeError.new("col=#{col}") unless col == 0
col = @line_lengths[line]
else
col -= 1
end
end end
col = @tell - @new_lines[line-1] - 1 return [line+1, col]
return [line, col]
end end
# Read one character from the stream or queue if available. Return # Read one character from the stream or queue if available. Return
@ -205,9 +253,14 @@ module HTML5lib
c = 0x0A c = 0x0A
end end
# record where newlines occur so that the position method # update position in stream
# can tell where it is if c == 0x0a
@new_lines << @tell-1 if c == 0x0A @line_lengths << @col
@line += 1
@col = 0
else
@col += 1
end
c.chr c.chr
@ -261,11 +314,7 @@ module HTML5lib
# Put the character stopped on back to the front of the queue # Put the character stopped on back to the front of the queue
# from where it came. # from where it came.
c = char_stack.pop c = char_stack.pop
if c == :EOF or @data_stream[@tell-1] == c[0] @queue.insert(0, c) unless c == :EOF
@tell -= 1
else
@queue.insert(0, c)
end
return char_stack.join('') return char_stack.join('')
end end
end end
@ -454,7 +503,7 @@ module HTML5lib
space_found = false space_found = false
#Step 5 attribute name #Step 5 attribute name
while true while true
if @data.current_byte == '=' and attr_name: if @data.current_byte == '=' and attr_name
break break
elsif SPACE_CHARACTERS.include?(@data.current_byte) elsif SPACE_CHARACTERS.include?(@data.current_byte)
space_found = true space_found = true

View file

@ -69,15 +69,22 @@ module HTML5lib
# ensure that non-void XHTML elements have content so that separate # ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted # open and close tags are emitted
if token[:type] == :EndTag and \ if token[:type] == :EndTag
not VOID_ELEMENTS.include? token[:name] and \ if VOID_ELEMENTS.include? token[:name]
token[:name] == @tree.openElements[-1].name and \ if @tree.openElements[-1].name != token["name"]:
not @tree.openElements[-1].hasContent token[:type] = :EmptyTag
@tree.insertText('') unless token["data"] ||= {}
@tree.openElements.any? {|e| end
e.attributes.keys.include? 'xmlns' and else
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' if token[:name] == @tree.openElements[-1].name and \
} not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
end
end end
return token return token

View file

@ -1,178 +1,2 @@
require 'html5lib/constants' require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'
module HTML5lib
class HTMLSerializer
CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
def self.serialize(stream, options = {})
new(options).serialize(stream, options[:encoding])
end
def initialize(options={})
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@minimize_boolean_attributes = true
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@omit_optional_tags = true
@sanitize = false
@strip_whitespace = false
@inject_meta_charset = true
options.each do |name, value|
next unless %w(quote_attr_values quote_char use_best_quote_char
minimize_boolean_attributes use_trailing_solidus
space_before_trailing_solidus omit_optional_tags sanitize
strip_whitespace inject_meta_charset).include? name.to_s
@use_best_quote_char = false if name.to_s == 'quote_char'
instance_variable_set("@#{name}", value)
end
@errors = []
end
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
require 'html5lib/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
result = []
treewalker.each do |token|
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
result << doctype
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
end
result << token[:data]
else
result << token[:data].
gsub("&", "&amp;").
gsub("<", "&lt;").
gsub(">", "&gt;")
end
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if CDATA_ELEMENTS.include?(name)
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
attributes << ' '
attributes << k
if not @minimize_boolean_attributes or \
(!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
and !BOOLEAN_ATTRIBUTES[:global].include?(k))
attributes << "="
if @quote_attr_values or v.empty?
quote_attr = true
else
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
if v.index("'") and !v.index('"')
quote_char = '"'
elsif v.index('"') and !v.index("'")
quote_char = "'"
end
end
if quote_char == "'"
v = v.gsub("'", "&#39;")
else
v = v.gsub('"', "&quot;")
end
attributes << quote_char << v << quote_char
else
attributes << v
end
end
end
if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
if @space_before_trailing_solidus
attributes << " /"
else
attributes << "/"
end
end
result << "<%s%s>" % [name, attributes.join('')]
elsif type == :EndTag
name = token[:name]
if CDATA_ELEMENTS.include?(name)
in_cdata = false
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
result << comment
else
serializeError(token[:data])
end
end
if encoding and encoding != 'utf-8'
require 'iconv'
Iconv.iconv(encoding, 'utf-8', result.join('')).first
else
result.join('')
end
end
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
if @strict
raise SerializeError
end
end
end
# Error in serialized tree
class SerializeError < Exception
end
end

View file

@ -0,0 +1,177 @@
require 'html5lib/constants'
module HTML5lib
# Serializes a stream of treewalker tokens into an HTML string.
#
# Each token is a Hash with a :type key (:Doctype, :Characters,
# :SpaceCharacters, :StartTag, :EmptyTag, :EndTag or :Comment) plus :name
# and/or :data depending on the type. Formatting is controlled by the
# options listed in #initialize.
class HTMLSerializer

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+, using options[:encoding] as the output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Returns +string+ with &, < and > replaced by their entities.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # Sets the default formatting options and then overrides them from
  # +options+ (a Hash keyed by option name, Symbol or String). Keys that do
  # not name one of the options initialised below are ignored.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true
    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # serializeError reads @strict; initialise it so that it is a known
    # option and can actually be switched on.
    @strict = false

    # instance_variables returns Strings on Ruby 1.8 but Symbols on 1.9+, so
    # normalise to Strings before comparing. Comparing against a String
    # directly silently discarded every option on newer Rubies.
    known_options = instance_variables.map { |ivar| ivar.to_s }

    options.each do |name, value|
      next unless known_options.include?("@#{name}")
      # An explicit quote character disables automatic quote selection.
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+ (anything responding to
  # +each+) and returns the resulting String. When +encoding+ is given and is
  # not 'utf-8', the result is converted from UTF-8 to that encoding.
  # Depending on the options, the token stream is first passed through the
  # meta-charset, whitespace, sanitizer and optional-tag filters.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    if encoding and @inject_meta_charset
      require 'html5lib/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5lib/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5lib/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5lib/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      type = token[:type]
      if type == :Doctype
        doctype = "<!DOCTYPE %s>" % token[:name]
        result << doctype

      elsif [:Characters, :SpaceCharacters].include? type
        # Whitespace and the content of RCDATA elements are emitted verbatim.
        if type == :SpaceCharacters or in_cdata
          if in_cdata and token[:data].include?("</")
            serializeError(_("Unexpected </ in CDATA"))
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      elsif [:StartTag, :EmptyTag].include? type
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = true
        elsif in_cdata
          serializeError(_("Unexpected child element of a CDATA element"))
        end
        attributes = []
        # Attributes are emitted in sorted order.
        token[:data].to_a.sort.each do |k, v|
          attributes << ' '
          attributes << k

          # Boolean attributes are written without a value when minimizing.
          if not @minimize_boolean_attributes or \
            (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
            and !BOOLEAN_ATTRIBUTES[:global].include?(k))
            attributes << "="
            if @quote_attr_values or v.empty?
              quote_attr = true
            else
              quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
            end
            v = v.gsub("&", "&amp;")
            v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
            if quote_attr
              quote_char = @quote_char
              if @use_best_quote_char
                # Pick the quote that does not occur in the value, if any.
                if v.index("'") and !v.index('"')
                  quote_char = '"'
                elsif v.index('"') and !v.index("'")
                  quote_char = "'"
                end
              end
              if quote_char == "'"
                v = v.gsub("'", "&#39;")
              else
                v = v.gsub('"', "&quot;")
              end
              attributes << quote_char << v << quote_char
            else
              attributes << v
            end
          end
        end
        if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
          if @space_before_trailing_solidus
            attributes << " /"
          else
            attributes << "/"
          end
        end
        result << "<%s%s>" % [name, attributes.join('')]

      elsif type == :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serializeError(_("Unexpected child element of a CDATA element"))
        end
        end_tag = "</#{name}>"
        result << end_tag

      elsif type == :Comment
        data = token[:data]
        serializeError(_("Comment contains --")) if data.index("--")
        comment = "<!--%s-->" % token[:data]
        result << comment

      else
        serializeError(token[:data])
      end
    end

    if encoding and encoding != 'utf-8'
      # NOTE(review): Iconv was dropped from the standard library in Ruby
      # 2.0; on newer Rubies this branch needs the iconv gem.
      require 'iconv'
      Iconv.iconv(encoding, 'utf-8', result.join('')).first
    else
      result.join('')
    end
  end

  alias :render :serialize

  # Records +data+ in @errors and raises SerializeError when @strict is set.
  def serializeError(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    if @strict
      raise SerializeError
    end
  end
end
# Error in serialized tree.
# Derives from StandardError (not Exception) so that a plain `rescue`
# clause catches it like any other application error.
class SerializeError < StandardError
end
end

View file

@ -0,0 +1,19 @@
require 'html5lib/serializer/htmlserializer'
module HTML5lib
# HTMLSerializer variant whose option defaults are overridden by DEFAULTS:
# attribute values are always quoted, boolean attributes are not minimized,
# void elements get a trailing solidus, "<" is escaped inside attribute
# values, and optional tags are kept. Options passed by the caller take
# precedence over these defaults.
class XHTMLSerializer < HTMLSerializer
  DEFAULTS = {
    :quote_attr_values           => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus        => true,
    :escape_lt_in_attrs          => true,
    :omit_optional_tags          => false
  }

  # Builds the serializer from DEFAULTS overlaid with +options+; DEFAULTS
  # itself is left unmodified.
  def initialize(options={})
    super(DEFAULTS.merge(options))
  end
end
end

View file

@ -41,19 +41,31 @@ module HTML5lib
:attributeValueUnQuoted => :attributeValueUnQuotedState, :attributeValueUnQuoted => :attributeValueUnQuotedState,
:bogusComment => :bogusCommentState, :bogusComment => :bogusCommentState,
:markupDeclarationOpen => :markupDeclarationOpenState, :markupDeclarationOpen => :markupDeclarationOpenState,
:commentStart => :commentStartState,
:commentStartDash => :commentStartDashState,
:comment => :commentState, :comment => :commentState,
:commentDash => :commentDashState, :commentEndDash => :commentEndDashState,
:commentEnd => :commentEndState, :commentEnd => :commentEndState,
:doctype => :doctypeState, :doctype => :doctypeState,
:beforeDoctypeName => :beforeDoctypeNameState, :beforeDoctypeName => :beforeDoctypeNameState,
:doctypeName => :doctypeNameState, :doctypeName => :doctypeNameState,
:afterDoctypeName => :afterDoctypeNameState, :afterDoctypeName => :afterDoctypeNameState,
:beforeDoctypePublicIdentifier => :beforeDoctypePublicIdentifierState,
:doctypePublicIdentifierDoubleQuoted => :doctypePublicIdentifierDoubleQuotedState,
:doctypePublicIdentifierSingleQuoted => :doctypePublicIdentifierSingleQuotedState,
:afterDoctypePublicIdentifier => :afterDoctypePublicIdentifierState,
:beforeDoctypeSystemIdentifier => :beforeDoctypeSystemIdentifierState,
:doctypeSystemIdentifierDoubleQuoted => :doctypeSystemIdentifierDoubleQuotedState,
:doctypeSystemIdentifierSingleQuoted => :doctypeSystemIdentifierSingleQuotedState,
:afterDoctypeSystemIdentifier => :afterDoctypeSystemIdentifierState,
:bogusDoctype => :bogusDoctypeState :bogusDoctype => :bogusDoctypeState
} }
# Setup the initial tokenizer state # Setup the initial tokenizer state
@contentModelFlag = :PCDATA @contentModelFlag = :PCDATA
@state = @states[:data] @state = @states[:data]
@escapeFlag = false
@lastFourChars = []
# The current token being created # The current token being created
@currentToken = nil @currentToken = nil
@ -133,24 +145,14 @@ module HTML5lib
# If the integer is between 127 and 160 (so 128 and bigger and 159 and # If the integer is between 127 and 160 (so 128 and bigger and 159 and
# smaller) we need to do the "windows trick". # smaller) we need to do the "windows trick".
if (127...160).include? charAsInt if (127...160).include? charAsInt
#XXX - removed parse error from windows 1252 entity for now @tokenQueue.push({:type => :ParseError, :data =>
#we may want to reenable this later _("Entity used with illegal number (windows-1252 reference).")})
#@tokenQueue.push({:type => :ParseError, :data =>
# _("Entity used with illegal number (windows-1252 reference).")})
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128] charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end end
# 0 is not a good number. if charAsInt > 0 and charAsInt <= 1114111
if charAsInt == 0
charAsInt = 65533
end
if charAsInt <= 0x10FFFF
char = [charAsInt].pack('U') char = [charAsInt].pack('U')
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity couldn't be converted to character.")})
end end
# Discard the ; if present. Otherwise, put it back on the queue and # Discard the ; if present. Otherwise, put it back on the queue and
@ -167,7 +169,10 @@ module HTML5lib
def consumeEntity def consumeEntity
char = nil char = nil
charStack = [@stream.char] charStack = [@stream.char]
if charStack[0] == "#" if SPACE_CHARACTERS.include?(charStack[0]) or
[:EOF, '<', '&'].include?(charStack[0])
@stream.queue+= charStack
elsif charStack[0] == "#"
# We might have a number entity here. # We might have a number entity here.
charStack += [@stream.char, @stream.char] charStack += [@stream.char, @stream.char]
if charStack.include? :EOF if charStack.include? :EOF
@ -194,10 +199,6 @@ module HTML5lib
_("Numeric entity expected but none found.")}) _("Numeric entity expected but none found.")})
end end
end end
# Break out if we reach the end of the file
elsif charStack[0] == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Entity expected. Got end of file instead.")})
else else
# At this point in the process might have named entity. Entities # At this point in the process might have named entity. Entities
# are stored in the global variable "entities". # are stored in the global variable "entities".
@ -267,14 +268,33 @@ module HTML5lib
# statements should be. # statements should be.
def dataState def dataState
data = @stream.char data = @stream.char
if data == "&" and (@contentModelFlag == :PCDATA or
@contentModelFlag == :RCDATA) if @contentModelFlag == :CDATA or @contentModelFlag == :RCDATA
@lastFourChars << data
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:entityData] @state = @states[:entityData]
elsif data == "<" and @contentModelFlag != :PLAINTEXT
@state = @states[:tagOpen] elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == false and @lastFourChars.join('') == "<!--"
@escapeFlag = true
@tokenQueue.push({:type => :Characters, :data => data})
elsif data == "<" and @escapeFlag == false and
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:tagOpen]
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false
@tokenQueue.push({:type => :Characters, :data => data})
elsif data == :EOF elsif data == :EOF
# Tokenization ends. # Tokenization ends.
return false return false
elsif SPACE_CHARACTERS.include? data elsif SPACE_CHARACTERS.include? data
# Directly after emitting a token you switch back to the "data # Directly after emitting a token you switch back to the "data
# state". At that point SPACE_CHARACTERS are important so they are # state". At that point SPACE_CHARACTERS are important so they are
@ -285,7 +305,7 @@ module HTML5lib
data + @stream.chars_until(SPACE_CHARACTERS, true)}) data + @stream.chars_until(SPACE_CHARACTERS, true)})
else else
@tokenQueue.push({:type => :Characters, :data => @tokenQueue.push({:type => :Characters, :data =>
data + @stream.chars_until(["&", "<"])}) data + @stream.chars_until(%w[& < > -])})
end end
return true return true
end end
@ -380,8 +400,6 @@ module HTML5lib
# emitting the end tag token. # emitting the end tag token.
@contentModelFlag = :PCDATA @contentModelFlag = :PCDATA
else else
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag after seeing '</'. None found.")})
@tokenQueue.push({:type => :Characters, :data => "</"}) @tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data] @state = @states[:data]
@ -391,29 +409,27 @@ module HTML5lib
end end
end end
if @contentModelFlag == :PCDATA data = @stream.char
data = @stream.char if data == :EOF
if data == :EOF @tokenQueue.push({:type => :ParseError, :data =>
@tokenQueue.push({:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")})
_("Expected closing tag. Unexpected end of file.")}) @tokenQueue.push({:type => :Characters, :data => "</"})
@tokenQueue.push({:type => :Characters, :data => "</"}) @state = @states[:data]
@state = @states[:data] elsif ASCII_LETTERS.include? data
elsif ASCII_LETTERS.include? data @currentToken = {:type => :EndTag, :name => data, :data => []}
@currentToken =\ @state = @states[:tagName]
{:type => :EndTag, :name => data, :data => []} elsif data == ">"
@state = @states[:tagName] @tokenQueue.push({:type => :ParseError, :data =>
elsif data == ">" _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
@tokenQueue.push({:type => :ParseError, :data => @state = @states[:data]
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")}) else
@state = @states[:data] # XXX data can be _'_...
else @tokenQueue.push({:type => :ParseError, :data =>
# XXX data can be _'_... _("Expected closing tag. Unexpected character '#{data}' found.")})
@tokenQueue.push({:type => :ParseError, :data => @stream.queue.push(data)
_("Expected closing tag. Unexpected character '" + data + "' found.")}) @state = @states[:bogusComment]
@stream.queue.push(data)
@state = @states[:bogusComment]
end
end end
return true return true
end end
@ -430,11 +446,6 @@ module HTML5lib
@stream.chars_until(ASCII_LETTERS, true) @stream.chars_until(ASCII_LETTERS, true)
elsif data == ">" elsif data == ">"
emitCurrentToken emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character when getting the tag name.")})
emitCurrentToken
elsif data == "/" elsif data == "/"
processSolidusInTag processSolidusInTag
@state = @states[:beforeAttributeName] @state = @states[:beforeAttributeName]
@ -459,11 +470,6 @@ module HTML5lib
emitCurrentToken emitCurrentToken
elsif data == "/" elsif data == "/"
processSolidusInTag processSolidusInTag
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected attribute name instead.")})
emitCurrentToken
else else
@currentToken[:data].push([data, ""]) @currentToken[:data].push([data, ""])
@state = @states[:attributeName] @state = @states[:attributeName]
@ -494,12 +500,6 @@ module HTML5lib
elsif data == "/" elsif data == "/"
processSolidusInTag processSolidusInTag
@state = @states[:beforeAttributeName] @state = @states[:beforeAttributeName]
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character in attribute name.")})
emitCurrentToken
leavingThisState = false
else else
@currentToken[:data][-1][0] += data @currentToken[:data][-1][0] += data
leavingThisState = false leavingThisState = false
@ -537,11 +537,6 @@ module HTML5lib
elsif data == "/" elsif data == "/"
processSolidusInTag processSolidusInTag
@state = @states[:beforeAttributeName] @state = @states[:beforeAttributeName]
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected = or end of tag.")})
emitCurrentToken
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected = or end of tag.")}) _("Unexpected end of file. Expected = or end of tag.")})
@ -566,11 +561,6 @@ module HTML5lib
@state = @states[:attributeValueSingleQuoted] @state = @states[:attributeValueSingleQuoted]
elsif data == ">" elsif data == ">"
emitCurrentToken emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected attribute value.")})
emitCurrentToken
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected attribute value.")}) _("Unexpected end of file. Expected attribute value.")})
@ -624,11 +614,6 @@ module HTML5lib
processEntityInAttribute processEntityInAttribute
elsif data == ">" elsif data == ">"
emitCurrentToken emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character in attribute value.")})
emitCurrentToken
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in attribute value.")}) _("Unexpected end of file in attribute value.")})
@ -658,14 +643,15 @@ module HTML5lib
charStack = [@stream.char, @stream.char] charStack = [@stream.char, @stream.char]
if charStack == ["-", "-"] if charStack == ["-", "-"]
@currentToken = {:type => :Comment, :data => ""} @currentToken = {:type => :Comment, :data => ""}
@state = @states[:comment] @state = @states[:commentStart]
else else
5.times { charStack.push(@stream.char) } 5.times { charStack.push(@stream.char) }
# Put in explicit :EOF check # Put in explicit :EOF check
if ((not charStack.include? :EOF) and if ((not charStack.include? :EOF) and
charStack.join("").upcase == "DOCTYPE") charStack.join("").upcase == "DOCTYPE")
@currentToken =\ @currentToken =\
{:type => :Doctype, :name => "", :data => true} {:type => :Doctype, :name => "",
:publicId => nil, :systemId => nil, :correct => true}
@state = @states[:doctype] @state = @states[:doctype]
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
@ -677,10 +663,52 @@ module HTML5lib
return true return true
end end
# Tokenizer state entered once "<!--" has been consumed and an empty
# Comment token is waiting in @currentToken.
#
# Reads one character from @stream and reacts to it:
# * "-"   -> switch to the commentStartDash state
# * ">"   -> "<!-->": queue a parse error, emit the comment token and go
#            back to the data state
# * :EOF  -> queue a parse error, emit the comment token and go back to the
#            data state
# * other -> append the character, plus everything up to the next "-", to
#            the comment data and switch to the comment state
#
# Always returns true.
def commentStartState
  data = @stream.char
  if data == "-"
    @state = @states[:commentStartDash]
  elsif data == ">"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Incorrect comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif data == :EOF
    # End of input is reported as the :EOF symbol (as in the other states);
    # the previous comparison against a constant named EOF never matched it.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:data] += data + @stream.chars_until("-")
    @state = @states[:comment]
  end
  return true
end
# Tokenizer state entered after "<!--" followed by a single "-".
#
# Reads one character from @stream and reacts to it:
# * "-"   -> switch to the commentEnd state
# * ">"   -> "<!--->": queue a parse error, emit the comment token and go
#            back to the data state
# * :EOF  -> queue a parse error, emit the comment token and go back to the
#            data state
# * other -> append "-", the character, and everything up to the next "-"
#            to the comment data, then switch to the comment state
#
# Always returns true.
def commentStartDashState
  data = @stream.char
  if data == "-"
    @state = @states[:commentEnd]
  elsif data == ">"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Incorrect comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif data == :EOF
    # End of input is reported as the :EOF symbol (as in the other states);
    # the previous comparison against a constant named EOF never matched it.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # The "-" that led into this state did not start a closing sequence, so
    # it belongs to the comment text and has to be restored here.
    @currentToken[:data] += "-" + data + @stream.chars_until("-")
    @state = @states[:comment]
  end
  return true
end
def commentState def commentState
data = @stream.char data = @stream.char
if data == "-" if data == "-"
@state = @states[:commentDash] @state = @states[:commentEndDash]
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in comment.")}) _("Unexpected end of file in comment.")})
@ -692,7 +720,7 @@ module HTML5lib
return true return true
end end
def commentDashState def commentEndDashState
data = @stream.char data = @stream.char
if data == "-" if data == "-"
@state = @states[:commentEnd] @state = @states[:commentEnd]
@ -752,19 +780,16 @@ module HTML5lib
def beforeDoctypeNameState def beforeDoctypeNameState
data = @stream.char data = @stream.char
if SPACE_CHARACTERS.include? data if SPACE_CHARACTERS.include? data
elsif ASCII_LOWERCASE.include? data
@currentToken[:name] = data.upcase
@state = @states[:doctypeName]
elsif data == ">" elsif data == ">"
# Character needs to be consumed per the specification so don't
# invoke emitCurrentTokenWithParseError with :data as argument.
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected > character. Expected DOCTYPE name.")}) _("Unexpected > character. Expected DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected DOCTYPE name.")}) _("Unexpected end of file. Expected DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
else else
@ -776,33 +801,21 @@ module HTML5lib
def doctypeNameState def doctypeNameState
data = @stream.char data = @stream.char
needsDoctypeCheck = false
if SPACE_CHARACTERS.include? data if SPACE_CHARACTERS.include? data
@state = @states[:afterDoctypeName] @state = @states[:afterDoctypeName]
needsDoctypeCheck = true
elsif data == ">" elsif data == ">"
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
elsif data == :EOF elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE name.")}) _("Unexpected end of file in DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
else else
# We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters.
if ASCII_LOWERCASE.include? data
data = data.upcase
end
@currentToken[:name] += data @currentToken[:name] += data
needsDoctypeCheck = true
end end
# After some iterations through this state it should eventually say
# "HTML". Otherwise there's an error.
if needsDoctypeCheck and @currentToken[:name] == "HTML"
@currentToken[:data] = false
end
return true return true
end end
@ -814,16 +827,195 @@ module HTML5lib
@state = @states[:data] @state = @states[:data]
elsif data == :EOF elsif data == :EOF
@currentToken[:data] = true @currentToken[:data] = true
# XXX EMIT
@stream.queue.push(data) @stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")}) _("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
charStack = [data]
5.times { charStack << stream.char }
token = charStack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
if token == "public"
@state = @states[:beforeDoctypePublicIdentifier]
elsif token == "system"
@state = @states[:beforeDoctypeSystemIdentifier]
else
@stream.queue += charStack
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
@state = @states[:bogusDoctype]
end
end
return true
end
def beforeDoctypePublicIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@currentToken[:publicId] = ""
@state = @states[:doctypePublicIdentifierDoubleQuoted]
elsif data == "'"
@currentToken[:publicId] = ""
@state = @states[:doctypePublicIdentifierSingleQuoted]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected space or '>'. Got '" + data + "'")}) _("Unexpected character in DOCTYPE.")})
@currentToken[:data] = true @state = @states[:bogusDoctype]
end
return true
end
# Tokenizer state for the inside of a double-quoted DOCTYPE public
# identifier.
#
# A closing double quote switches to the afterDoctypePublicIdentifier state.
# End of file queues a parse error, marks the token as not correct, emits it
# and returns to the data state. Any other character is appended to the
# token's :publicId.
#
# Always returns true.
def doctypePublicIdentifierDoubleQuotedState
  ch = @stream.char
  case ch
  when "\""
    @state = @states[:afterDoctypePublicIdentifier]
  when :EOF
    eof_error = {:type => :ParseError,
                 :data => _("Unexpected end of file in DOCTYPE.")}
    @tokenQueue.push(eof_error)
    @currentToken[:correct] = false
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:publicId] += ch
  end
  true
end
# Tokenizer state for the inside of a single-quoted DOCTYPE public
# identifier.
#
# A closing single quote switches to the afterDoctypePublicIdentifier state.
# End of file queues a parse error, marks the token as not correct, emits it
# and returns to the data state. Any other character is appended to the
# token's :publicId.
#
# Always returns true.
def doctypePublicIdentifierSingleQuotedState
  ch = @stream.char

  # Closing quote: the identifier is complete.
  if ch == "'"
    @state = @states[:afterDoctypePublicIdentifier]
    return true
  end

  # Ordinary character: keep collecting the identifier.
  unless ch == :EOF
    @currentToken[:publicId] += ch
    return true
  end

  # Input ended inside the quoted identifier.
  message = _("Unexpected end of file in DOCTYPE.")
  @tokenQueue.push({:type => :ParseError, :data => message})
  @currentToken[:correct] = false
  @tokenQueue.push(@currentToken)
  @state = @states[:data]
  true
end
# Tokenizer state entered after the closing quote of a DOCTYPE public
# identifier.
#
# * space character -> ignored, state unchanged
# * '"' or "'"      -> start an empty :systemId and switch to the matching
#                      quoted system identifier state
# * ">"             -> emit the DOCTYPE token and return to the data state
# * :EOF            -> queue a parse error, mark the token as not correct,
#                      emit it and return to the data state
# * anything else   -> queue a parse error and switch to the bogusDoctype
#                      state
#
# Always returns true.
def afterDoctypePublicIdentifierState
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\""
    @currentToken[:systemId] = ""
    @state = @states[:doctypeSystemIdentifierDoubleQuoted]
  when "'"
    @currentToken[:systemId] = ""
    @state = @states[:doctypeSystemIdentifierSingleQuoted]
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when :EOF
    eof_error = {:type => :ParseError,
                 :data => _("Unexpected end of file in DOCTYPE.")}
    @tokenQueue.push(eof_error)
    @currentToken[:correct] = false
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    char_error = {:type => :ParseError,
                  :data => _("Unexpected character in DOCTYPE.")}
    @tokenQueue.push(char_error)
    @state = @states[:bogusDoctype]
  end
  true
end
# Tokenizer state that waits for the opening quote of a DOCTYPE system
# identifier.
#
# * space character -> ignored, state unchanged
# * '"' or "'"      -> start an empty :systemId and switch to the matching
#                      quoted system identifier state
# * ">"             -> queue a parse error, mark the token as not correct,
#                      emit it and return to the data state
# * :EOF            -> queue a parse error, mark the token as not correct,
#                      emit it and return to the data state
# * anything else   -> queue a parse error and switch to the bogusDoctype
#                      state
#
# Always returns true.
def beforeDoctypeSystemIdentifierState
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\""
    @currentToken[:systemId] = ""
    @state = @states[:doctypeSystemIdentifierDoubleQuoted]
  when "'"
    @currentToken[:systemId] = ""
    @state = @states[:doctypeSystemIdentifierSingleQuoted]
  when ">"
    # The DOCTYPE closed before any identifier was given.
    premature_close = {:type => :ParseError,
                       :data => _("Unexpected character in DOCTYPE.")}
    @tokenQueue.push(premature_close)
    @currentToken[:correct] = false
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when :EOF
    eof_error = {:type => :ParseError,
                 :data => _("Unexpected end of file in DOCTYPE.")}
    @tokenQueue.push(eof_error)
    @currentToken[:correct] = false
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    char_error = {:type => :ParseError,
                  :data => _("Unexpected character in DOCTYPE.")}
    @tokenQueue.push(char_error)
    @state = @states[:bogusDoctype]
  end
  true
end
# Tokenizer state for the inside of a double-quoted DOCTYPE system
# identifier.
#
# A closing double quote switches to the afterDoctypeSystemIdentifier state.
# End of file queues a parse error, marks the token as not correct, emits it
# and returns to the data state. Any other character is appended to the
# token's :systemId.
#
# Always returns true.
def doctypeSystemIdentifierDoubleQuotedState
  ch = @stream.char
  case ch
  when "\""
    @state = @states[:afterDoctypeSystemIdentifier]
  when :EOF
    eof_error = {:type => :ParseError,
                 :data => _("Unexpected end of file in DOCTYPE.")}
    @tokenQueue.push(eof_error)
    @currentToken[:correct] = false
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:systemId] += ch
  end
  true
end
# Tokenizer state for the inside of a single-quoted DOCTYPE system
# identifier.
#
# A closing single quote switches to the afterDoctypeSystemIdentifier state.
# End of file queues a parse error, marks the token as not correct, emits it
# and returns to the data state. Any other character is appended to the
# token's :systemId.
#
# Always returns true.
def doctypeSystemIdentifierSingleQuotedState
  ch = @stream.char

  # Closing quote: the identifier is complete.
  if ch == "'"
    @state = @states[:afterDoctypeSystemIdentifier]
    return true
  end

  # Ordinary character: keep collecting the identifier.
  unless ch == :EOF
    @currentToken[:systemId] += ch
    return true
  end

  # Input ended inside the quoted identifier.
  message = _("Unexpected end of file in DOCTYPE.")
  @tokenQueue.push({:type => :ParseError, :data => message})
  @currentToken[:correct] = false
  @tokenQueue.push(@currentToken)
  @state = @states[:data]
  true
end
def afterDoctypeSystemIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == ">"
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected character in DOCTYPE.")})
@state = @states[:bogusDoctype] @state = @states[:bogusDoctype]
end end
return true return true
@ -839,6 +1031,7 @@ module HTML5lib
@stream.queue.push(data) @stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in bogus doctype.")}) _("Unexpected end of file in bogus doctype.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken) @tokenQueue.push(@currentToken)
@state = @states[:data] @state = @states[:data]
end end

View file

@ -27,6 +27,9 @@ module HTML5lib
childNodes << node childNodes << node
hpricot.children << node.hpricot hpricot.children << node.hpricot
end end
if (oldparent = node.hpricot.parent) != nil
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
end
node.hpricot.parent = hpricot node.hpricot.parent = hpricot
node.parent = self node.parent = self
end end

View file

@ -9,7 +9,7 @@ module HTML5lib
def node_details(node) def node_details(node)
case node case node
when ::Hpricot::Elem when ::Hpricot::Elem
if !node.name if node.name.empty?
[:DOCUMENT_FRAGMENT] [:DOCUMENT_FRAGMENT]
else else
[:ELEMENT, node.name, [:ELEMENT, node.name,

View file

@ -5,12 +5,20 @@
$:.unshift File.dirname(__FILE__),'lib' $:.unshift File.dirname(__FILE__),'lib'
def parse(opts, args) def parse(opts, args)
encoding = nil
f = args[-1] f = args[-1]
if f if f
begin begin
require 'open-uri' if f[0..6] == 'http://' if f[0..6] == 'http://'
f = open(f) require 'open-uri'
f = URI.parse(f).open
encoding = f.charset
elsif f == '-'
f = $stdin
else
f = open(f)
end
rescue rescue
end end
else else
@ -29,22 +37,28 @@ def parse(opts, args)
p = HTML5lib::HTMLParser.new(:tree=>treebuilder) p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
end end
if opts.parsemethod == :parse
args = [f, encoding]
else
args = [f, 'div', encoding]
end
if opts.profile if opts.profile
require 'profiler' require 'profiler'
Profiler__::start_profile Profiler__::start_profile
p.send(opts.parsemethod,f) p.send(opts.parsemethod, *args)
Profiler__::stop_profile Profiler__::stop_profile
Profiler__::print_profile($stderr) Profiler__::print_profile($stderr)
elsif opts.time elsif opts.time
require 'time' require 'time'
t0 = Time.new t0 = Time.new
document = p.send(opts.parsemethod,f) document = p.send(opts.parsemethod, *args)
t1 = Time.new t1 = Time.new
printOutput(p, document, opts) printOutput(p, document, opts)
t2 = Time.new t2 = Time.new
puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1] puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
else else
document = p.send(opts.parsemethod,f) document = p.send(opts.parsemethod, *args)
printOutput(p, document, opts) printOutput(p, document, opts)
end end
end end
@ -59,7 +73,7 @@ def printOutput(parser, document, opts)
require 'html5lib/treewalkers' require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer' require 'html5lib/serializer'
print HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite when :hilite
print document.hilite print document.hilite
when :tree when :tree
@ -93,26 +107,35 @@ options.serializer = {
require 'optparse' require 'optparse'
opts = OptionParser.new do |opts| opts = OptionParser.new do |opts|
opts.on("-p", "--[no-]profile", "Profile the run") do |profile| opts.separator ""
options.profile = profile opts.separator "Parse Options:"
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.on("-b", "--treebuilder NAME") do |treebuilder| opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder options.treebuilder = treebuilder
end end
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
opts.on("-f", "--fragment", "Parse as a fragment") do |parse| opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
options.parsemethod = :parseFragment options.parsemethod = :parseFragment
end end
opts.separator ""
opts.separator "Filter Options:"
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.separator ""
opts.separator "Output Options:"
opts.on("--tree", "output as debug tree") do |tree| opts.on("--tree", "output as debug tree") do |tree|
options.output = :tree options.output = :tree
end end
@ -130,26 +153,56 @@ opts = OptionParser.new do |opts|
options.output = :hilite options.output = :hilite
end end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.encoding = encoding options.error = error
end end
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject| opts.separator ""
options.serializer[:inject_meta_charset] = inject opts.separator "Serialization Options:"
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit| opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
options.serializer[:omit_optional_tags] = omit options.serializer[:omit_optional_tags] = omit
end end
opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
options.serializer[:quote_attr_values] = quote
end
opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
options.serializer[:use_best_quote_char] = best
end
opts.on("--quote-char C", "Use specified quote character") do |c|
options.serializer[:quote_char] = c
end
opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
options.serializer[:minimize_boolean_attributes] = min
end
opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
options.serializer[:use_trailing_solidus] = slash
end
opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.separator ""
opts.separator "Other Options:"
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
end
opts.on_tail("-h", "--help", "Show this message") do opts.on_tail("-h", "--help", "Show this message") do
puts opts puts opts
exit exit

View file

@ -322,12 +322,14 @@ Windows-1252
#encoding #encoding
Windows-1252 Windows-1252
#data <!-- 4096 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> #data
<!-- 4096 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
<meta charset="ISO-8859-9"> <meta charset="ISO-8859-9">
#encoding #encoding
Windows-1252 Windows-1252
#data <!-- 4097 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz--> #data
<!-- 4097 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz-->
<meta charset="ISO-8859-9"> <meta charset="ISO-8859-9">
#encoding #encoding
Windows-1252 Windows-1252

View file

@ -35,7 +35,7 @@
{ {
"name": "div_background_image_unicode_encoded", "name": "div_background_image_unicode_encoded",
"input": "<div style=\"background-image:\a5\a2\006C\0028'\006a\0061\a6\0061\a3\0063\a2\0069\a0\a4\003a\0061\006c\0065\a2\a4\0028.1027\0058.1053\0053\0027\0029'\0029\">foo</div>", "input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
"output": "<div style=''>foo</div>" "output": "<div style=''>foo</div>"
}, },
@ -48,14 +48,14 @@
{ {
"name": "double_open_angle_brackets", "name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <", "input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": "<img src='http://ha.ckers.org/scriptlet.html'/>&lt;", "output": "<img src='http://ha.ckers.org/scriptlet.html'/>",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
{ {
"name": "double_open_angle_brackets_2", "name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <", "input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;", "output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
@ -110,7 +110,7 @@
{ {
"name": "no_closing_script_tags", "name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>", "input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
@ -123,7 +123,7 @@
{ {
"name": "non_alpha_non_digit_2", "name": "non_alpha_non_digit_2",
"input": "<a onclick!\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>", "input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
"output": "<a>foo</a>", "output": "<a>foo</a>",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
@ -137,7 +137,7 @@
{ {
"name": "non_alpha_non_digit_II", "name": "non_alpha_non_digit_II",
"input": "<a href!\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>", "input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
"output": "<a>foo</a>", "output": "<a>foo</a>",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
@ -351,7 +351,7 @@
{ {
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2", "name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<", "input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;", "output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },
@ -365,7 +365,7 @@
{ {
"name": "should_sanitize_unclosed_script", "name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>", "input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
"rexml": "Ill-formed XHTML!" "rexml": "Ill-formed XHTML!"
}, },

View file

@ -7,7 +7,8 @@
{"description": "proper attribute value non-quoting", {"description": "proper attribute value non-quoting",
"input": [["StartTag", "span", {"title": "foo"}]], "input": [["StartTag", "span", {"title": "foo"}]],
"expected": ["<span title=foo>"] "expected": ["<span title=foo>"],
"xhtml": ["<span title=\"foo\">"]
}, },
{"description": "proper attribute value quoting (with >)", {"description": "proper attribute value quoting (with >)",
@ -17,7 +18,8 @@
{"description": "proper attribute value quoting (with <)", {"description": "proper attribute value quoting (with <)",
"input": [["StartTag", "span", {"title": "foo<bar"}]], "input": [["StartTag", "span", {"title": "foo<bar"}]],
"expected": ["<span title=\"foo<bar\">"] "expected": ["<span title=\"foo<bar\">"],
"xhtml": ["<span title=\"foo&lt;bar\">"]
}, },
{"description": "proper attribute value quoting (with \")", {"description": "proper attribute value quoting (with \")",
@ -67,12 +69,14 @@
{"description": "void element (as EmptyTag token)", {"description": "void element (as EmptyTag token)",
"input": [["EmptyTag", "img", {}]], "input": [["EmptyTag", "img", {}]],
"expected": ["<img>"] "expected": ["<img>"],
"xhtml": ["<img />"]
}, },
{"description": "void element (as StartTag token)", {"description": "void element (as StartTag token)",
"input": [["StartTag", "img", {}]], "input": [["StartTag", "img", {}]],
"expected": ["<img>"] "expected": ["<img>"],
"xhtml": ["<img />"]
}, },
{"description": "doctype in error", {"description": "doctype in error",
@ -80,6 +84,17 @@
"expected": ["<!DOCTYPE foo>"] "expected": ["<!DOCTYPE foo>"]
}, },
{"description": "character data",
"options": {"encoding":"utf-8"},
"input": [["Characters", "a<b>c&d"]],
"expected": ["a&lt;b&gt;c&amp;d"]
},
{"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"]
},
{"description": "doctype", {"description": "doctype",
"input": [["Doctype", "HTML"]], "input": [["Doctype", "HTML"]],
"expected": ["<!DOCTYPE HTML>"] "expected": ["<!DOCTYPE HTML>"]

View file

@ -9,31 +9,57 @@
{"description": "empytag head", {"description": "empytag head",
"options": {"inject_meta_charset": true, "encoding":"utf-8"}, "options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["EmptyTag", "head", {}]], "input": [["EmptyTag", "head", {}]],
"expected": ["<head><meta charset=utf-8>"] "expected": ["<head><meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
}, },
{"description": "head w/title", {"description": "head w/title",
"options": {"inject_meta_charset": true, "encoding":"utf-8"}, "options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["StartTag","title",{}], ["Characters", "foo"],["EndTag", "title"], ["EndTag", "head"]], "input": [["StartTag", "head", {}], ["StartTag","title",{}], ["Characters", "foo"],["EndTag", "title"], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><title>foo</title>"] "expected": ["<head><meta charset=utf-8><title>foo</title>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
}, },
{"description": "head w/meta-charset", {"description": "head w/meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"}, "options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]], "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8>"] "expected": ["<head><meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ two meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
}, },
{"description": "head w/robots", {"description": "head w/robots",
"options": {"inject_meta_charset": true, "encoding":"utf-8"}, "options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EndTag", "head"]], "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><meta content=noindex name=robots>"] "expected": ["<head><meta charset=utf-8><meta content=noindex name=robots>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
}, },
{"description": "head w/robots & charset", {"description": "head w/robots & charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"}, "options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]], "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=noindex name=robots><meta charset=utf-8>"] "expected": ["<head><meta content=noindex name=robots><meta charset=utf-8>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"http-equiv":"content-type", "content":"text/html; charset=ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
},
{"description": "head w/robots & charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"http-equiv":"content-type", "content":"text/html; charset=ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
} }
]} ]}

View file

@ -9,13 +9,15 @@
{"description": "quote_attr_values=true", {"description": "quote_attr_values=true",
"options": {"quote_attr_values": true}, "options": {"quote_attr_values": true},
"input": [["StartTag", "button", {"disabled": "disabled"}]], "input": [["StartTag", "button", {"disabled": "disabled"}]],
"expected": ["<button disabled>"] "expected": ["<button disabled>"],
"xhtml": ["<button disabled=\"disabled\">"]
}, },
{"description": "quote_attr_values=true with irrelevant", {"description": "quote_attr_values=true with irrelevant",
"options": {"quote_attr_values": true}, "options": {"quote_attr_values": true},
"input": [["StartTag", "div", {"irrelevant": "irrelevant"}]], "input": [["StartTag", "div", {"irrelevant": "irrelevant"}]],
"expected": ["<div irrelevant>"] "expected": ["<div irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
}, },
{"description": "use_trailing_solidus=true with void element", {"description": "use_trailing_solidus=true with void element",
@ -33,13 +35,20 @@
{"description": "minimize_boolean_attributes=false", {"description": "minimize_boolean_attributes=false",
"options": {"minimize_boolean_attributes": false}, "options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "div", {"irrelevant": "irrelevant"}]], "input": [["StartTag", "div", {"irrelevant": "irrelevant"}]],
"expected": ["<div irrelevant=irrelevant>"] "expected": ["<div irrelevant=irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
}, },
{"description": "minimize_boolean_attributes=false with empty value", {"description": "minimize_boolean_attributes=false with empty value",
"options": {"minimize_boolean_attributes": false}, "options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "div", {"irrelevant": ""}]], "input": [["StartTag", "div", {"irrelevant": ""}]],
"expected": ["<div irrelevant=\"\">"] "expected": ["<div irrelevant=\"\">"]
},
{"description": "escape less than signs in attribute values",
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
} }
]} ]}

View file

@ -15,7 +15,7 @@
"contentModelFlags":["RCDATA", "CDATA"], "contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz", "lastStartTag":"baz",
"input":"</foo>bar</baz>", "input":"</foo>bar</baz>",
"output":["ParseError", ["Character", "</foo>bar"], ["EndTag", "baz"]]}, "output":[["Character", "</foo>bar"], ["EndTag", "baz"]]},
{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA", {"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
"contentModelFlags":["RCDATA", "CDATA"], "contentModelFlags":["RCDATA", "CDATA"],

View file

@ -0,0 +1,21 @@
{"tests": [
{"description":"Commented close tag in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!--</bar>--></bar>",
"output":[["Character", "foo<!--</bar>-->"], ["EndTag", "bar"]]},
{"description":"Bogus comment in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!-->baz</bar>",
"output":[["Character", "foo<!-->baz"], ["EndTag", "bar"]]},
{"description":"End tag surrounded by bogus comment in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!--></bar><!-->baz</bar>",
"output":[["Character", "foo<!-->"], ["EndTag", "bar"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "bar"]]}
]}

View file

@ -2,15 +2,15 @@
{"description":"Correct Doctype lowercase", {"description":"Correct Doctype lowercase",
"input":"<!DOCTYPE html>", "input":"<!DOCTYPE html>",
"output":[["DOCTYPE", "HTML", false]]}, "output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype uppercase", {"description":"Correct Doctype uppercase",
"input":"<!DOCTYPE HTML>", "input":"<!DOCTYPE HTML>",
"output":[["DOCTYPE", "HTML", false]]}, "output":[["DOCTYPE", "HTML", null, null, true]]},
{"description":"Correct Doctype mixed case", {"description":"Correct Doctype mixed case",
"input":"<!DOCTYPE HtMl>", "input":"<!DOCTYPE HtMl>",
"output":[["DOCTYPE", "HTML", false]]}, "output":[["DOCTYPE", "HtMl", null, null, true]]},
{"description":"Truncated doctype start", {"description":"Truncated doctype start",
"input":"<!DOC>", "input":"<!DOC>",
@ -18,7 +18,7 @@
{"description":"Doctype in error", {"description":"Doctype in error",
"input":"<!DOCTYPE foo>", "input":"<!DOCTYPE foo>",
"output":[["DOCTYPE", "FOO", true]]}, "output":[["DOCTYPE", "foo", null, null, true]]},
{"description":"Single Start Tag", {"description":"Single Start Tag",
"input":"<h>", "input":"<h>",
@ -84,17 +84,38 @@
"input":"<!-", "input":"<!-",
"output":["ParseError", ["Comment", "-"]]}, "output":["ParseError", ["Comment", "-"]]},
{"description":"Ampersand only", {"description":"Short comment",
"input":"<!-->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment two",
"input":"<!--->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment three",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"Ampersand EOF",
"input":"&", "input":"&",
"output":["ParseError", ["Character", "&"]]}, "output":[["Character", "&"]]},
{"description":"Ampersand ampersand EOF",
"input":"&&",
"output":[["Character", "&&"]]},
{"description":"Ampersand space EOF",
"input":"& ",
"output":[["Character", "& "]]},
{"description":"Unfinished entity", {"description":"Unfinished entity",
"input":"&f", "input":"&f",
"output":["ParseError", ["Character", "&"], ["Character", "f"]]}, "output":["ParseError", ["Character", "&f"]]},
{"description":"Ampersand, number sign", {"description":"Ampersand, number sign",
"input":"&#", "input":"&#",
"output":["ParseError", ["Character", "&"], ["Character", "#"]]}, "output":["ParseError", ["Character", "&#"]]},
{"description":"Unfinished numeric entity", {"description":"Unfinished numeric entity",
"input":"&#x", "input":"&#x",
@ -110,8 +131,7 @@
{"description":"Entity without trailing semicolon (1)", {"description":"Entity without trailing semicolon (1)",
"input":"I'm &notit", "input":"I'm &notit",
"output":[["Character","I'm "], "ParseError", ["Character", "¬"], "output":[["Character","I'm "], "ParseError", ["Character", "¬it"]]},
["Character", "it"]]},
{"description":"Entity without trailing semicolon (2)", {"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin", "input":"I'm &notin",

View file

@ -1,24 +1,44 @@
{"tests": [ {"tests": [
{"description":"Doctype without a name", {"description":"DOCTYPE without name",
"input":"<!DOCTYPE>", "input":"<!DOCTYPE>",
"output":["ParseError", "ParseError", ["DOCTYPE", "", true]]}, "output":["ParseError", "ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"Correct doctype without a space before name", {"description":"DOCTYPE without space before name",
"input":"<!DOCTYPEhtml>", "input":"<!DOCTYPEhtml>",
"output":["ParseError", ["DOCTYPE", "HTML", false]]}, "output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
{"description":"Incorrect doctype without a space before name", {"description":"Incorrect DOCTYPE without a space before name",
"input":"<!DOCTYPEfoo>", "input":"<!DOCTYPEfoo>",
"output":["ParseError", ["DOCTYPE", "FOO", true]]}, "output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
{"description":"Bogus doctype", {"description":"DOCTYPE with publicId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">", "input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":["ParseError", ["DOCTYPE", "HTML", true]]}, "output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
{"description":"DOCTYPE with EOF after PUBLIC",
"input":"<!DOCTYPE html PUBLIC",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC '",
"input":"<!DOCTYPE html PUBLIC '",
"output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC 'x",
"input":"<!DOCTYPE html PUBLIC 'x",
"output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
{"description":"DOCTYPE with systemId",
"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with publicId and systemId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"Incomplete doctype", {"description":"Incomplete doctype",
"input":"<!DOCTYPE html ", "input":"<!DOCTYPE html ",
"output":["ParseError", ["DOCTYPE", "HTML", true]]}, "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Numeric entity representing the NUL character", {"description":"Numeric entity representing the NUL character",
"input":"&#0000;", "input":"&#0000;",
@ -30,19 +50,19 @@
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;", "input":"&#2225222;",
"output":["ParseError", ["Character", "\uFFFD"]]}, "output":[["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;", "input":"&#x1010FFFF;",
"output":["ParseError", ["Character", "\uFFFD"]]}, "output":[["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'", {"description":"Numeric entity representing a Windows-1252 'codepoint'",
"input":"&#137;", "input":"&#137;",
"output":[["Character", "\u2030"]]}, "output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'", {"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
"input":"&#x89;", "input":"&#x89;",
"output":[["Character", "\u2030"]]}, "output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase", {"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;", "input":"&#xaBcD;",
@ -58,7 +78,7 @@
{"description":"StartTag containing <", {"description":"StartTag containing <",
"input":"<a<b>", "input":"<a<b>",
"output":["ParseError", ["StartTag", "a", { }], ["StartTag", "b", { }]]}, "output":[["StartTag", "a<b", { }]]},
{"description":"Non-void element containing trailing /", {"description":"Non-void element containing trailing /",
"input":"<h/>", "input":"<h/>",

View file

@ -226,7 +226,6 @@ Line1<br>Line2<br>Line3<br>Line4
<h1>Hello<h2>World <h1>Hello<h2>World
#errors #errors
4: missing document type declaration 4: missing document type declaration
13: h2 element start tag implying h1 element end tag
19: mismatched body element end tag (premature end of file?) 19: mismatched body element end tag (premature end of file?)
#document #document
| <html> | <html>
@ -234,8 +233,8 @@ Line1<br>Line2<br>Line3<br>Line4
| <body> | <body>
| <h1> | <h1>
| "Hello" | "Hello"
| <h2> | <h2>
| "World" | "World"
#data #data
<a><p>X<a>Y</a>Z</p></a> <a><p>X<a>Y</a>Z</p></a>
@ -307,13 +306,18 @@ Line1<br>Line2<br>Line3<br>Line4
#data #data
<!--><div>--<!--> <!--><div>--<!-->
#errors #errors
13: unexpected character after two '-' characters while parsing comment Incorrect comment token
18: missing document type declaration Missing document type declaration
Incorrect comment token
Unexpected end of file
#document #document
| <!-- ><div>--<! --> | <!-- -->
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <div>
| "--"
| <!-- -->
#data #data
<p><hr></p> <p><hr></p>
@ -638,10 +642,10 @@ Line1<br>Line2<br>Line3<br>Line4
| <html> | <html>
| <head> | <head>
| <script> | <script>
| " <!-- " | " <!-- </script> --> "
| " " | " "
| <body> | <body>
| "--> EOF" | "EOF"
#data #data
<b><p></b>TEST <b><p></b>TEST
@ -1248,15 +1252,13 @@ Line1<br>Line2<br>Line3<br>Line4
#data #data
<style><!--</style><meta><script>--><link></script> <style><!--</style><meta><script>--><link></script>
#errors #errors
7: missing document type declaration missing document type declaration
unexpected EOF
#document #document
| <html> | <html>
| <head> | <head>
| <style> | <style>
| "<!--" | "<!--</style><meta><script>--><link></script>"
| <meta>
| <script>
| "--><link>"
| <body> | <body>
#data #data
@ -1305,12 +1307,12 @@ Line1<br>Line2<br>Line3<br>Line4
#document #document
| <html> | <html>
| <head> | <head>
| <base>
| <link>
| <meta>
| <title> | <title>
| "<p>" | "<p>"
| <body> | <body>
| <base>
| <link>
| <meta>
| <p> | <p>
#data #data
@ -1381,12 +1383,11 @@ Line1<br>Line2<br>Line3<br>Line4
6: missing document type declaration 6: missing document type declaration
19: unexpected node at end of document 19: unexpected node at end of document
19: unexpected node after body element end tag 19: unexpected node after body element end tag
19: meta element start tag out of place
#document #document
| <html> | <html>
| <head> | <head>
| <meta>
| <body> | <body>
| <meta>
| <p> | <p>
#data #data
@ -1430,14 +1431,13 @@ Line1<br>Line2<br>Line3<br>Line4
<h1><h2> <h1><h2>
#errors #errors
4: missing document type declaration 4: missing document type declaration
8: h2 element start tag implying h1 element end tag
9: mismatched body element end tag (premature end of file?) 9: mismatched body element end tag (premature end of file?)
#document #document
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <h1> | <h1>
| <h2> | <h2>
#data #data
<a><p><a></a></p></a> <a><p><a></a></p></a>
@ -1630,8 +1630,7 @@ Line1<br>Line2<br>Line3<br>Line4
4: missing document type declaration 4: missing document type declaration
15: required tr element start tag implied by unexpected td element start tag 15: required tr element start tag implied by unexpected td element start tag
27: unexpected td element end tag implied other end tags 27: unexpected td element end tag implied other end tags
31: h3 element start tag implying h1 element end tag Unexpected EOF
36: mismatched h1 element end tag
#document #document
| <html> | <html>
| <head> | <head>
@ -1642,7 +1641,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <tr> | <tr>
| <td> | <td>
| <h3> | <h3>
| <h3> | <h3>
#data #data
<table><colgroup><col><colgroup><col><col><col><colgroup><col><col><thead><tr><td></table> <table><colgroup><col><colgroup><col><col><col><colgroup><col><col><thead><tr><td></table>
@ -1807,6 +1806,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <br>
#data #data
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea> <table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
@ -1924,6 +1924,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <br>
| <table> | <table>
| <tbody> | <tbody>
| <tr> | <tr>

View file

@ -1,5 +1,5 @@
#data #data
<!doctype html>Test <!DOCTYPE HTML>Test
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -63,7 +63,7 @@ frame element can't occur here
| "test" | "test"
#data #data
<!doctype html><frameset>test <!DOCTYPE HTML><frameset>test
#errors #errors
frameset can't contain text frameset can't contain text
Unexpected end of file Unexpected end of file
@ -74,7 +74,7 @@ Unexpected end of file
| <frameset> | <frameset>
#data #data
<!doctype html><frameset><!doctype html> <!DOCTYPE HTML><frameset><!DOCTYPE HTML>
#errors #errors
document type declaration can only occur at the start of a document document type declaration can only occur at the start of a document
Expected end tag </frameset> Expected end tag </frameset>
@ -85,7 +85,7 @@ Expected end tag </frameset>
| <frameset> | <frameset>
#data #data
<!doctype html><font><p><b>test</font> <!DOCTYPE HTML><font><p><b>test</font>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -99,7 +99,7 @@ Expected end tag </frameset>
| "test" | "test"
#data #data
<!DOCTYPE htmL><dt><div><dd> <!DOCTYPE HTML><dt><div><dd>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -151,7 +151,7 @@ Unexpected end of file.
| "</plaintext>" | "</plaintext>"
#data #data
<!doctype html><table><tr>TEST <!DOCTYPE HTML><table><tr>TEST
#errors #errors
TEST can't occur in <tr> TEST can't occur in <tr>
Unexpected end of file. Unexpected end of file.
@ -166,7 +166,7 @@ Unexpected end of file.
| <tr> | <tr>
#data #data
<!doctype html><body t1=1><body t2=2><body t3=3 t4=4> <!DOCTYPE HTML><body t1=1><body t2=2><body t3=3 t4=4>
#errors #errors
Unexpected start tag "body" Unexpected start tag "body"
Unexpected start tag "body" Unexpected start tag "body"
@ -193,21 +193,18 @@ Unexpected end tag.
| <body> | <body>
#data #data
<!doctype HtML></b test<b &=&amp>X <!DOCTYPE HTML></b test<b &=&amp>X
#errors #errors
Unexpected < in attribute Unexpected < in attribute
End tag contains attributes. End tag contains attributes.
Unexpected end tag. Unexpected end tag.
Named entity didn't end with ; Named entity didn't end with ;
Unexpected EOF. Missing closing tag.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <b> | "X"
| &="&"
| "X"
#data #data
<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt <!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt
@ -215,7 +212,7 @@ Unexpected EOF. Missing closing tag.
No space after literal DOCTYPE. No space after literal DOCTYPE.
Unexpected EOF in (end) tag name Unexpected EOF in (end) tag name
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE html>
| <html> | <html>
| <head> | <head>
| <script> | <script>
@ -294,7 +291,7 @@ Unfinished named entity.
#errors #errors
No space after literal DOCTYPE. No space after literal DOCTYPE.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE html>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -302,7 +299,7 @@ No space after literal DOCTYPE.
| <li> | <li>
#data #data
<!doctypehtml><p><dt> <!doctypeHTML><p><dt>
#errors #errors
No space after literal DOCTYPE. No space after literal DOCTYPE.
#document #document
@ -314,11 +311,11 @@ No space after literal DOCTYPE.
| <dt> | <dt>
#data #data
<!doctypehtml><p><dd> <!doctypehtmL><p><dd>
#errors #errors
No space after literal DOCTYPE. No space after literal DOCTYPE.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htmL>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -331,7 +328,7 @@ No space after literal DOCTYPE.
No space after literal DOCTYPE. No space after literal DOCTYPE.
Unexpected EOF. Unexpected EOF.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE html>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -339,7 +336,7 @@ Unexpected EOF.
| <form> | <form>
#data #data
<!doctype html><p><b><i><u></p> <p>X <!DOCTYPE HTML><p><b><i><u></p> <p>X
#errors #errors
Unexpected end tag </p>. Unexpected end tag </p>.
Unexpected end EOF. Missing closing tags. Unexpected end EOF. Missing closing tags.
@ -360,7 +357,7 @@ Unexpected end EOF. Missing closing tags.
| "X" | "X"
#data #data
<!doctype html><p></P>X <!DOCTYPE HTML><p></P>X
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -393,7 +390,7 @@ Invalid entity.
| "&AMp;" | "&AMp;"
#data #data
<!doctype html><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY> <!DOCTYPE HTML><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY>
#errors #errors
Unexpected end of file. Unexpected end of file.
#document #document
@ -404,7 +401,7 @@ Unexpected end of file.
| <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly> | <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly>
#data #data
<!doctype html>X</body>X <!DOCTYPE HTML>X</body>X
#errors #errors
Unexpected non-space characters in the after body phase. Unexpected non-space characters in the after body phase.
#document #document
@ -415,7 +412,7 @@ Unexpected non-space characters in the after body phase.
| "XX" | "XX"
#data #data
<!doctype html><!-- X <!DOCTYPE HTML><!-- X
#errors #errors
Unexpected end of file in comment. Unexpected end of file in comment.
#document #document
@ -426,7 +423,7 @@ Unexpected end of file in comment.
| <body> | <body>
#data #data
<!doctype html><table><caption>test TEST</caption><td>test <!DOCTYPE HTML><table><caption>test TEST</caption><td>test
#errors #errors
Unexpected <td> in table body phase. Unexpected <td> in table body phase.
Unexpected end of file. Unexpected end of file.
@ -444,7 +441,7 @@ Unexpected end of file.
| "test" | "test"
#data #data
<!doctype html><select><option><optgroup> <!DOCTYPE HTML><select><option><optgroup>
#errors #errors
Unexpected end of file. Missing closing tags. Unexpected end of file. Missing closing tags.
#document #document
@ -457,7 +454,7 @@ Unexpected end of file. Missing closing tags.
| <optgroup> | <optgroup>
#data #data
<!doctype html><select><optgroup><option></optgroup><option><select><option> <!DOCTYPE HTML><select><optgroup><option></optgroup><option><select><option>
#errors #errors
Unexpected start tag <select> in <select>. Unexpected start tag <select> in <select>.
Unexpected start tag <option>. Unexpected start tag <option>.
@ -472,7 +469,7 @@ Unexpected start tag <option>.
| <option> | <option>
#data #data
<!doctype html><select><optgroup><option><optgroup> <!DOCTYPE HTML><select><optgroup><option><optgroup>
#errors #errors
Unexpected end of file. Missing closing tags. Unexpected end of file. Missing closing tags.
#document #document
@ -486,7 +483,7 @@ Unexpected end of file. Missing closing tags.
| <optgroup> | <optgroup>
#data #data
<!doctype html><font><input><input></font> <!DOCTYPE HTML><font><input><input></font>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -498,7 +495,7 @@ Unexpected end of file. Missing closing tags.
| <input> | <input>
#data #data
<!DoctypE html><!-- XXX - XXX --> <!DOCTYPE HTML><!-- XXX - XXX -->
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -508,7 +505,7 @@ Unexpected end of file. Missing closing tags.
| <body> | <body>
#data #data
<!DoctypE html><!-- XXX - XXX <!DOCTYPE HTML><!-- XXX - XXX
#errors #errors
Unexpected EOF in comment. Unexpected EOF in comment.
#document #document
@ -519,7 +516,7 @@ Unexpected EOF in comment.
| <body> | <body>
#data #data
<!DoctypE html><!-- XXX - XXX - XXX --> <!DOCTYPE HTML><!-- XXX - XXX - XXX -->
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -583,7 +580,7 @@ Unexpected EOF.
| "X" | "X"
#data #data
<!doctype html><body><title>test</body></title> <!DOCTYPE HTML><body><title>test</body></title>
#errors #errors
Unexpected start tag that belongs in the head. Unexpected start tag that belongs in the head.
Expected closing tag after </. Expected closing tag after </.
@ -596,7 +593,7 @@ Expected closing tag after </.
| <body> | <body>
#data #data
<!doctype html><body><title>X</title><meta name=z><link rel=foo><style> <!DOCTYPE HTML><body><title>X</title><meta name=z><link rel=foo><style>
x { content:"</style" } </style> x { content:"</style" } </style>
#errors #errors
Unexpected start tag that belongs in head. Unexpected start tag that belongs in head.
@ -609,17 +606,17 @@ Expected closing tag after </.
| <head> | <head>
| <title> | <title>
| "X" | "X"
| <body>
| <meta> | <meta>
| name="z" | name="z"
| <link> | <link>
| rel="foo" | rel="foo"
| <body>
| <style> | <style>
| " | "
x { content:"</style" } " x { content:"</style" } "
#data #data
<!doctype html><select><optgroup></optgroup></select> <!DOCTYPE HTML><select><optgroup></optgroup></select>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -642,7 +639,7 @@ No doctype.
| <body> | <body>
#data #data
<!doctype html> <html> <!DOCTYPE HTML> <html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -652,7 +649,7 @@ No doctype.
| <body> | <body>
#data #data
<!doctype html><script> <!DOCTYPE HTML><script>
</script> <title>x</title> </head> </script> <title>x</title> </head>
#errors #errors
#document #document
@ -669,7 +666,7 @@ No doctype.
| <body> | <body>
#data #data
<!doctype html><html><body><html id=x> <!DOCTYPE HTML><html><body><html id=x>
#errors #errors
duplicate html start tag duplicate html start tag
#document #document
@ -680,7 +677,7 @@ duplicate html start tag
| <body> | <body>
#data #data
<!doctype html>X</body><html id="x"> <!DOCTYPE HTML>X</body><html id="x">
#errors #errors
Unexpected html start tag in the after body phase. Unexpected html start tag in the after body phase.
html needs to be the first start tag. html needs to be the first start tag.
@ -693,7 +690,7 @@ html needs to be the first start tag.
| "X" | "X"
#data #data
<!doctype html><head><html id=x> <!DOCTYPE HTML><head><html id=x>
#errors #errors
html start tag too late html start tag too late
#document #document
@ -704,7 +701,7 @@ html start tag too late
| <body> | <body>
#data #data
<!doctype html>X</html>X <!DOCTYPE HTML>X</html>X
#errors #errors
Unexpected non-space characters. Expected end of file. Unexpected non-space characters. Expected end of file.
Unexpected non-space characters in after body phase. Expected end of file. Unexpected non-space characters in after body phase. Expected end of file.
@ -716,7 +713,7 @@ Unexpected non-space characters in after body phase. Expected end of file.
| "XX" | "XX"
#data #data
<!doctype html>X</html> <!DOCTYPE HTML>X</html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
@ -726,7 +723,7 @@ Unexpected non-space characters in after body phase. Expected end of file.
| "X " | "X "
#data #data
<!doctype html>X</html><p>X <!DOCTYPE HTML>X</html><p>X
#errors #errors
Unexpected start tag <p> in trailing end phase. Unexpected start tag <p> in trailing end phase.
Unexpected start tag <p> in after body phase. Unexpected start tag <p> in after body phase.
@ -740,7 +737,7 @@ Unexpected start tag <p> in after body phase.
| "X" | "X"
#data #data
<!doctype html>X<p/x/y/z> <!DOCTYPE HTML>X<p/x/y/z>
#errors #errors
Solidus (/) incorrectly placed. Solidus (/) incorrectly placed.
Solidus (/) incorrectly placed. Solidus (/) incorrectly placed.
@ -757,7 +754,7 @@ Solidus (/) incorrectly placed.
| z="" | z=""
#data #data
<!doctype html><!--x-- <!DOCTYPE HTML><!--x--
#errors #errors
Unexpected end of file in comment. Unexpected end of file in comment.
#document #document
@ -768,7 +765,7 @@ Unexpected end of file in comment.
| <body> | <body>
#data #data
<!doctype html><table><tr><td></p></table> <!DOCTYPE HTML><table><tr><td></p></table>
#errors #errors
Unexpected </p> end tag. Unexpected </p> end tag.
#document #document

View file

@ -49,23 +49,23 @@ No DOCTYPE
| <script> | <script>
#data #data
<!DOCTYPE html><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
</pre></body></html> </pre></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <pre> | <pre>
#data #data
<!DOCTYPE html><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html> foo</pre></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -74,13 +74,13 @@ foo</pre></body></html>
#data #data
<!DOCTYPE html><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo foo
</pre></body></html> </pre></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -89,11 +89,11 @@ foo
" "
#data #data
<!DOCTYPE html><html><head></head><body><pre>x</pre><span> <!DOCTYPE htML><html><head></head><body><pre>x</pre><span>
</span></body></html> </span></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -104,11 +104,11 @@ foo
" "
#data #data
<!DOCTYPE html><html><head></head><body><pre>x <!DOCTYPE htML><html><head></head><body><pre>x
y</pre></body></html> y</pre></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -117,11 +117,11 @@ y</pre></body></html>
y" y"
#data #data
<!DOCTYPE html><html><head></head><body><pre>x<div> <!DOCTYPE htML><html><head></head><body><pre>x<div>
y</pre></body></html> y</pre></body></html>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -132,20 +132,20 @@ y</pre></body></html>
| y" | y"
#data #data
<!DOCTYPE html><HTML><META><HEAD></HEAD></HTML> <!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <meta> | <meta>
| <body> | <body>
#data #data
<!DOCTYPE html><HTML><HEAD><head></HEAD></HTML> <!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -171,23 +171,23 @@ y</pre></body></html>
| <body> | <body>
#data #data
<!DOCTYPE html><textarea> <!DOCTYPE htML><textarea>
</textarea> </textarea>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <textarea> | <textarea>
#data #data
<!DOCTYPE html><textarea> <!DOCTYPE htML><textarea>
foo</textarea> foo</textarea>
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -195,11 +195,11 @@ foo</textarea>
| "foo" | "foo"
#data #data
<!DOCTYPE html><html><head></head><body><ul><li><div><p><li></ul></body></html> <!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors #errors
Missing end tag (div) Missing end tag (div)
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE htML>
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -208,3 +208,29 @@ Missing end tag (div)
| <div> | <div>
| <p> | <p>
| <li> | <li>
#data
<!doctype html><nobr><nobr><nobr>
#errors
Unexpected end of file.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <nobr>
| <nobr>
| <nobr>
#data
<!doctype html><nobr><nobr></nobr><nobr>
#errors
Unexpected end of file.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <nobr>
| <nobr>
| <nobr>

View file

@ -0,0 +1,120 @@
#data
<style> <!-- </style>x
#errors
No DOCTYPE
Unexpected end of file
#document
| <html>
| <head>
| <style>
| " <!-- </style>x"
| <body>
#data
<style> <!-- </style> --> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!-- </style> --> "
| <body>
| "x"
#data
<style> <!--> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!--> "
| <body>
| "x"
#data
<style> <!---> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!---> "
| <body>
| "x"
#data
<iframe> <!---> </iframe>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <iframe>
| " <!---> "
| "x"
#data
<iframe> <!--- </iframe>->x</iframe> --> </iframe>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <iframe>
| " <!--- </iframe>->x</iframe> --> "
| "x"
#data
<script> <!-- </script> --> </script>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <script>
| " <!-- </script> --> "
| <body>
| "x"
#data
<title> <!-- </title> --> </title>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| " <!-- </title> --> "
| <body>
| "x"
#data
<textarea> <!--- </textarea>->x</textarea> --> </textarea>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <textarea>
| " <!--- </textarea>->x</textarea> --> "
| "x"
#data
<style> <!</-- </style>x
#errors
No DOCTYPE
Unexpected end of file
#document
| <html>
| <head>
| <style>
| " <!</-- "
| <body>
| "x"

View file

@ -0,0 +1,29 @@
#data
<!doctype html></head> <head>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| " "
| <body>
#data
<!doctype html></html> <head>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " "
#data
<!doctype html></body><meta>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <meta>

View file

@ -24,6 +24,7 @@ rescue LoadError
def self.parse json def self.parse json
json.gsub!(/"\s*:/, '"=>') json.gsub!(/"\s*:/, '"=>')
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')} json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
null = nil
eval json eval json
end end
end end

View file

@ -191,13 +191,13 @@ EOX
end end
def test_br def test_br
assert_xhtml_equal <<EOX assert_xhtml_equal <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head> <head><title>XLINK</title></head>
<body> <body>
<br/> <br/>
</body></html> </body></html>
EOX EOX1
end end
def xtest_strong def xtest_strong

View file

@ -12,7 +12,7 @@ begin
rescue LoadError rescue LoadError
end end
$CHECK_PARSER_ERRORS = ARGV.delete('-p') $CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO
puts 'Testing tree builders: ' + $tree_types_to_test * ', ' puts 'Testing tree builders: ' + $tree_types_to_test * ', '
@ -45,9 +45,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document)) actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [ assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input, '', 'Input:', input,
'Expected:', expected_output, '', 'Expected:', expected_output,
'Recieved:', actual_output '', 'Recieved:', actual_output
].join("\n") ].join("\n")
if $CHECK_PARSER_ERRORS if $CHECK_PARSER_ERRORS

View file

@ -30,7 +30,7 @@ class SanitizeTest < Test::Unit::TestCase
:use_trailing_solidus => true, :use_trailing_solidus => true,
:omit_optional_tags => false, :omit_optional_tags => false,
:inject_meta_charset => false, :inject_meta_charset => false,
:sanitize => true}).gsub(/^<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>$/, '\1') :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException rescue REXML::ParseException
return "Ill-formed XHTML!" return "Ill-formed XHTML!"
end end
@ -65,6 +65,7 @@ class SanitizeTest < Test::Unit::TestCase
elsif VOID_ELEMENTS.include?(tag_name) elsif VOID_ELEMENTS.include?(tag_name)
htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput xhtmloutput = htmloutput
htmloutput += '<br/>' if tag_name == 'br'
rexmloutput = "<#{tag_name} title='1' />" rexmloutput = "<#{tag_name} title='1' />"
end end
check_sanitization(input, htmloutput, xhtmloutput, rexmloutput) check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)

View file

@ -49,6 +49,18 @@ class Html5SerializeTestcase < Test::Unit::TestCase
elsif !expected.include?(result) elsif !expected.include?(result)
flunk("Expected: #{expected.inspect}, Received: #{result.inspect}") flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
end end
return if test_name == 'optionaltags'
result = HTML5lib::XHTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["xhtml"] || test["expected"]
if expected.length == 1
assert_equal(expected[0], result, test["description"])
elsif !expected.include?(result)
flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
end
end end
end end

View file

@ -52,13 +52,11 @@ class HTMLInputStreamTest < Test::Unit::TestCase
def test_newlines def test_newlines
stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd") stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
assert_equal(0, stream.instance_eval {@tell}) assert_equal([1,0], stream.position)
assert_equal("a\nbb\n", stream.chars_until('c')) assert_equal("a\nbb\n", stream.chars_until('c'))
assert_equal(6, stream.instance_eval {@tell})
assert_equal([3,0], stream.position) assert_equal([3,0], stream.position)
assert_equal("ccc\ndddd", stream.chars_until('x')) assert_equal("ccc\ndddd", stream.chars_until('x'))
assert_equal(14, stream.instance_eval {@tell})
assert_equal([4,4], stream.position) assert_equal([4,4], stream.position)
assert_equal([0,1,5,9], stream.instance_eval {@new_lines}) assert_equal([1,2,3], stream.instance_eval {@line_lengths})
end end
end end

View file

@ -30,9 +30,10 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
def tokenizer_test(data) def tokenizer_test(data)
(data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag| (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
message = [ message = [
'Description:', data['description'], '', 'Description:', data['description'],
'Input:', data['input'], '', 'Input:', data['input'],
'Content Model Flag:', content_model_flag ] * "\n" '', 'Content Model Flag:', content_model_flag,
'' ] * "\n"
assert_nothing_raised message do assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input']) tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])

View file

@ -11,9 +11,9 @@ $tree_types_to_test = {
'rexml' => 'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'], {:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']}, :walker => HTML5lib::TreeWalkers['rexml']},
# 'hpricot' => 'hpricot' =>
# {:builder => HTML5lib::TreeBuilders['hpricot'], {:builder => HTML5lib::TreeBuilders['hpricot'],
# :walker => HTML5lib::TreeWalkers['hpricot']}, :walker => HTML5lib::TreeWalkers['hpricot']},
} }
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', ' puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
@ -46,7 +46,7 @@ class TestTreeWalkers < Test::Unit::TestCase
output = [] output = []
indent = 0 indent = 0
concatenateCharacterTokens(tokens) do |token| concatenateCharacterTokens(tokens) do |token|
case token[:type] case token[:type]
when :StartTag, :EmptyTag when :StartTag, :EmptyTag
output << "#{' '*indent}<#{token[:name]}>" output << "#{' '*indent}<#{token[:name]}>"
indent += 2 indent += 2
@ -65,7 +65,7 @@ class TestTreeWalkers < Test::Unit::TestCase
output << "#{' '*indent}\"#{token[:data]}\"" output << "#{' '*indent}\"#{token[:data]}\""
else else
# TODO: what to do with errors? # TODO: what to do with errors?
end end
end end
return output.join("\n") return output.join("\n")
end end
@ -73,6 +73,7 @@ class TestTreeWalkers < Test::Unit::TestCase
html5lib_test_files('tree-construction').each do |test_file| html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '') test_name = File.basename(test_file).sub('.dat', '')
next if test_name == 'tests5' # TODO
File.read(test_file).split("#data\n").each_with_index do |data, index| File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty? next if data.empty?
@ -80,12 +81,11 @@ class TestTreeWalkers < Test::Unit::TestCase
innerHTML, input, expected_output, expected_errors = innerHTML, input, expected_output, expected_errors =
HTML5lib::TestSupport::parseTestcase(data) HTML5lib::TestSupport::parseTestcase(data)
rexml = $tree_types_to_test['rexml'] $tree_types_to_test.each do |tree_name, tree_class|
$tree_types_to_test.each do |tree_name, treeClass|
define_method "test_#{test_name}_#{index}_#{tree_name}" do define_method "test_#{test_name}_#{index}_#{tree_name}" do
parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder]) parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder])
if innerHTML if innerHTML
parser.parseFragment(input, innerHTML) parser.parseFragment(input, innerHTML)
@ -96,10 +96,13 @@ class TestTreeWalkers < Test::Unit::TestCase
document = parser.tree.getDocument document = parser.tree.getDocument
begin begin
output = sortattrs(convertTokens(treeClass[:walker].new(document))) output = sortattrs(convertTokens(tree_class[:walker].new(document)))
expected = sortattrs(expected_output) expected = sortattrs(expected_output)
errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n" assert_equal expected, output, [
assert_equal(expected, output, errorMsg) '', 'Input:', input,
'', 'Expected:', expected,
'', 'Recieved:', output
].join("\n")
rescue NotImplementedError rescue NotImplementedError
# Amnesty for those that confess... # Amnesty for those that confess...
end end

View file

@ -18,7 +18,8 @@ class TokenizerTestParser
end end
def processDoctype(token) def processDoctype(token)
@outputTokens.push(["DOCTYPE", token[:name], token[:data]]) @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
token[:systemId], token[:correct]])
end end
def processStartTag(token) def processStartTag(token)