Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer.
Fixed the string-handling to work in both
Ruby 1.8.x and 1.9.2. There are still,
inexplicably, two functional tests that
fail. But the rest seems to work quite well.
Jacques Distler 2009-11-30 16:28:18 -06:00
parent 79c8572053
commit a6429f8c22
142 changed files with 519 additions and 843 deletions
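
The string-handling fix mentioned above mostly comes down to branching on
whether String supports encodings, as in consume_number_entity in the file
below. A minimal sketch of that pattern (charAsInt stands for the numeric
character reference being converted, matching the variable name in the file):

    # Works on both Ruby 1.8.x (byte-oriented strings) and 1.9.x (encoded strings).
    if String.method_defined? :force_encoding
      char = charAsInt.chr('utf-8')   # Ruby 1.9: Integer#chr accepts an encoding
    else
      char = [charAsInt].pack('U')    # Ruby 1.8: pack the code point as UTF-8
    end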


@@ -0,0 +1,970 @@
require 'html5/constants'
require 'html5/inputstream'
module HTML5
# This class takes care of tokenizing HTML.
#
# * @current_token
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
attr_accessor :content_model_flag, :current_token
attr_reader :stream
# XXX need to fix documentation
def initialize(stream, options = {})
@stream = HTMLInputStream.new(stream, options)
# Setup the initial tokenizer state
@content_model_flag = :PCDATA
@state = :data_state
@escapeFlag = false
@lastFourChars = []
# The current token being created
@current_token = nil
# Tokens to be processed.
@token_queue = []
@lowercase_element_name = options[:lowercase_element_name] != false
@lowercase_attr_name = options[:lowercase_attr_name] != false
end
# This is where the magic happens.
#
# We do our usual processing through the states and when we have a token
# to return we yield the token which pauses processing until the next token
# is requested.
def each
@token_queue = []
# Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate.
while send @state
yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
yield @token_queue.shift until @token_queue.empty?
end
end
# Below are the various helper functions used by the tokenizer states.
# If the next character is a '>', convert the current_token into
# an EmptyTag
def process_solidus_in_tag
# We need to consume another character to make sure it's a ">"
data = @stream.char
if @current_token[:type] == :StartTag and data == ">"
@current_token[:type] = :EmptyTag
else
@token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
end
# The character we just consumed needs to be put back on the stack so it
# doesn't get lost...
@stream.unget(data)
end
# This function returns either U+FFFD or the character corresponding to the
# decimal or hexadecimal representation. It also discards the ";" if present.
# If the ";" is missing, a :ParseError token is pushed onto @token_queue.
def consume_number_entity(isHex)
# XXX More needs to be done here. For instance, #13 should probably be
# converted to #10 so we don't get \r (#13 is \r, right?) in the DOM and
# such. Thoughts on this appreciated.
allowed = DIGITS
radix = 10
if isHex
allowed = HEX_DIGITS
radix = 16
end
char_stack = []
# Consume all the characters that are in range while making sure we
# don't hit an EOF.
c = @stream.char
while allowed.include?(c) and c != :EOF
char_stack.push(c)
c = @stream.char
end
# Convert the set of characters consumed to an int.
charAsInt = char_stack.join('').to_i(radix)
if charAsInt == 13
@token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
charAsInt = 10
elsif (128..159).include? charAsInt
# If the integer is between 128 and 159 inclusive we need to do the
# "windows trick".
@token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
if String.method_defined? :force_encoding
char = charAsInt.chr('utf-8')
else
char = [charAsInt].pack('U')
end
else
char = [0xFFFD].pack('U')
@token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
end
# Discard the ";" if present. Otherwise, put the character back on the
# stream and report a parse error.
if c != ";"
@token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
@stream.unget(c)
end
return char
end
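# Consume an entity reference starting at the current stream position (the
# caller has already consumed the "&"). Returns the replacement text (or a
# literal "&"), or nil if nothing resembling an entity is found. When
# from_attribute is true, a named entity without a trailing semicolon that
# is followed by an alphanumeric character is left as raw text, per the
# attribute-value rules.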
def consume_entity(from_attribute=false)
char = nil
char_stack = [@stream.char]
if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
@stream.unget(char_stack)
elsif char_stack[0] == '#'
# We might have a number entity here.
char_stack += [@stream.char, @stream.char]
if char_stack[0 .. 1].include? :EOF
# If we reach the end of the file put everything up to the :EOF
# back on the stream
char_stack = char_stack[0...char_stack.index(:EOF)]
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
else
if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
# Hexadecimal entity detected.
@stream.unget(char_stack[2])
char = consume_number_entity(true)
elsif DIGITS.include? char_stack[1]
# Decimal entity detected.
@stream.unget(char_stack[1..-1])
char = consume_number_entity(false)
else
# No number entity detected.
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
end
end
else
# At this point in the process we might have a named entity. Entities
# are stored in the ENTITIES constant.
#
# Consume characters and compare them to a substring of the entity
# names in the list until the substring no longer matches.
filteredEntityList = ENTITIES.keys
filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
entityName = nil
# Try to find the longest entity the string will match to take care
# of &noti for instance.
while char_stack.last != :EOF
name = char_stack.join('')
if filteredEntityList.any? {|e| e[0...name.length] == name}
filteredEntityList.reject! {|e| e[0...name.length] != name}
char_stack.push(@stream.char)
else
break
end
if ENTITIES.include? name
entityName = name
break if entityName[-1] == ';'
end
end
if entityName != nil
char = ENTITIES[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if entityName[-1] != ?;
@token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
end
if entityName[-1] != ";" and from_attribute and
(ASCII_LETTERS.include?(char_stack[entityName.length]) or
DIGITS.include?(char_stack[entityName.length]))
@stream.unget(char_stack)
char = '&'
else
@stream.unget(char_stack[entityName.length..-1])
end
else
@token_queue << {:type => :ParseError, :data => "expected-named-entity"}
@stream.unget(char_stack)
end
end
return char
end
# This method replaces the need for "entityInAttributeValueState".
def process_entity_in_attribute
entity = consume_entity()
if entity
@current_token[:data][-1][1] += entity
else
@current_token[:data][-1][1] += "&"
end
end
# This method is a generic handler for emitting the tags. It also sets
# the state to "data" because that's what's needed after a token has been
# emitted.
def emit_current_token
# Add token to the queue to be yielded
token = @current_token
if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
if @lowercase_element_name
token[:name] = token[:name].downcase
end
@token_queue << token
@state = :data_state
end
end
# Below are the various tokenizer states.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
def data_state
data = @stream.char
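# Keep a window of the last four characters seen so the CDATA/RCDATA
# escape sequences "<!--" and "-->" can be recognized below.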
if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
@lastFourChars << data
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
@state = :entity_data_state
elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
@escapeFlag = true
@token_queue << {:type => :Characters, :data => data}
elsif data == "<" and !@escapeFlag and
[:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
@state = :tag_open_state
elsif data == ">" and @escapeFlag and
[:CDATA,:RCDATA].include?(@content_model_flag) and
@lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false
@token_queue << {:type => :Characters, :data => data}
elsif data == :EOF
# Tokenization ends.
return false
elsif SPACE_CHARACTERS.include? data
# Directly after emitting a token you switch back to the "data
# state". At that point SPACE_CHARACTERS are important so they are
# emitted separately.
# XXX need to check if we don't need a special "spaces" flag on
# characters.
@token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
else
@token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
end
return true
end
def entity_data_state
entity = consume_entity
if entity
@token_queue << {:type => :Characters, :data => entity}
else
@token_queue << {:type => :Characters, :data => "&"}
end
@state = :data_state
return true
end
def tag_open_state
data = @stream.char
if @content_model_flag == :PCDATA
if data == "!"
@state = :markup_declaration_open_state
elsif data == "/"
@state = :close_tag_open_state
elsif data != :EOF and ASCII_LETTERS.include? data
@current_token = {:type => :StartTag, :name => data, :data => []}
@state = :tag_name_state
elsif data == ">"
# XXX In theory it could be something besides a tag name. But
# do we really care?
@token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
@token_queue << {:type => :Characters, :data => "<>"}
@state = :data_state
elsif data == "?"
# XXX In theory it could be something besides a tag name. But
# do we really care?
@token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
@stream.unget(data)
@state = :bogus_comment_state
else
# XXX
@token_queue << {:type => :ParseError, :data => "expected-tag-name"}
@token_queue << {:type => :Characters, :data => "<"}
@stream.unget(data)
@state = :data_state
end
else
# We know the content model flag is set to either RCDATA or CDATA
# now because this state can never be entered with the PLAINTEXT
# flag.
if data == "/"
@state = :close_tag_open_state
else
@token_queue << {:type => :Characters, :data => "<"}
@stream.unget(data)
@state = :data_state
end
end
return true
end
def close_tag_open_state
if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
if @current_token
char_stack = []
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of the last emitted
# start tag, which also happens to be the current_token. We also need
# the character directly after the characters that could match the
# start tag name.
(@current_token[:name].length + 1).times do
char_stack.push(@stream.char)
# Make sure we don't get hit by :EOF
break if char_stack[-1] == :EOF
end
# Since this is just a check, we put the characters back on
# the stack.
@stream.unget(char_stack)
end
if @current_token and
@current_token[:name].downcase ==
char_stack[0...-1].join('').downcase and
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
# Because the characters are correct we can safely switch to
# PCDATA mode now. This also means we don't have to do it when
# emitting the end tag token.
@content_model_flag = :PCDATA
else
@token_queue << {:type => :Characters, :data => "</"}
@state = :data_state
# Need to return here since we don't want the rest of the
# method to be walked through.
return true
end
end
data = @stream.char
if data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
@token_queue << {:type => :Characters, :data => "</"}
@state = :data_state
elsif ASCII_LETTERS.include? data
@current_token = {:type => :EndTag, :name => data, :data => []}
@state = :tag_name_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
@state = :data_state
else
# XXX data can be _'_...
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
@stream.unget(data)
@state = :bogus_comment_state
end
return true
end
def tag_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_attribute_name_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
elsif data == ">"
emit_current_token
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:name] += data
end
return true
end
def before_attribute_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:data].push([data, ""])
@state = :attribute_name_state
elsif data == ">"
emit_current_token
elsif data == "/"
process_solidus_in_tag
else
@current_token[:data].push([data, ""])
@state = :attribute_name_state
end
return true
end
def attribute_name_state
data = @stream.char
leavingThisState = true
emitToken = false
if data == "="
@state = :before_attribute_value_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
@state = :data_state
emitToken = true
elsif ASCII_LETTERS.include? data
@current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
leavingThisState = false
elsif data == ">"
# XXX If we emit here the attributes are converted to a dict
# without being checked and when the code below runs we error
# because data is a dict not a list
emitToken = true
elsif SPACE_CHARACTERS.include? data
@state = :after_attribute_name_state
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:data][-1][0] += data
leavingThisState = false
end
if leavingThisState
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
if @lowercase_attr_name
@current_token[:data][-1][0] = @current_token[:data].last.first.downcase
end
@current_token[:data][0...-1].each {|name,value|
if @current_token[:data].last.first == name
@token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
break # don't report an error more than once
end
}
# XXX Fix for above XXX
emit_current_token if emitToken
end
return true
end
def after_attribute_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "="
@state = :before_attribute_value_state
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:data].push([data, ""])
@state = :attribute_name_state
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:data].push([data, ""])
@state = :attribute_name_state
end
return true
end
def before_attribute_value_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "\""
@state = :attribute_value_double_quoted_state
elsif data == "&"
@state = :attribute_value_unquoted_state
@stream.unget(data);
elsif data == "'"
@state = :attribute_value_single_quoted_state
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
emit_current_token
else
@current_token[:data][-1][1] += data
@state = :attribute_value_unquoted_state
end
return true
end
def attribute_value_double_quoted_state
data = @stream.char
if data == "\""
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
emit_current_token
else
@current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
end
return true
end
def attribute_value_single_quoted_state
data = @stream.char
if data == "'"
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
emit_current_token
else
@current_token[:data][-1][1] += data +\
@stream.chars_until(["'", "&"])
end
return true
end
def attribute_value_unquoted_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
emit_current_token
else
@current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
end
return true
end
def bogus_comment_state
# Make a new comment token and give it as value all the characters
# until the first > or :EOF (chars_until checks for :EOF automatically)
# and emit it.
@token_queue << {:type => :Comment, :data => @stream.chars_until(">")}
# Eat the character directly after the bogus comment which is either a
# ">" or an :EOF.
@stream.char
@state = :data_state
return true
end
def markup_declaration_open_state
char_stack = [@stream.char, @stream.char]
if char_stack == ["-", "-"]
@current_token = {:type => :Comment, :data => ""}
@state = :comment_start_state
else
5.times { char_stack.push(@stream.char) }
# Put in explicit :EOF check
if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
@current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
@state = :doctype_state
else
@token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
@stream.unget(char_stack)
@state = :bogus_comment_state
end
end
return true
end
def comment_start_state
data = @stream.char
if data == "-"
@state = :comment_start_dash_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += data + @stream.chars_until("-")
@state = :comment_state
end
return true
end
def comment_start_dash_state
data = @stream.char
if data == "-"
@state = :comment_end_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += '-' + data + @stream.chars_until("-")
@state = :comment_state
end
return true
end
def comment_state
data = @stream.char
if data == "-"
@state = :comment_end_dash_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += data + @stream.chars_until("-")
end
return true
end
def comment_end_dash_state
data = @stream.char
if data == "-"
@state = :comment_end_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += "-" + data +\
@stream.chars_until("-")
# Consume the next character which is either a "-" or an :EOF as
# well so if there's a "-" directly after the "-" we go nicely to
# the "comment end state" without emitting a ParseError there.
@stream.char
end
return true
end
def comment_end_state
data = @stream.char
if data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == "-"
@token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
@current_token[:data] += data
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
@token_queue << @current_token
@state = :data_state
else
# XXX
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
@current_token[:data] += "--" + data
@state = :comment_state
end
return true
end
def doctype_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_doctype_name_state
else
@token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
@stream.unget(data)
@state = :before_doctype_name_state
end
return true
end
def before_doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:name] = data
@state = :doctype_name_state
end
return true
end
def doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :after_doctype_name_state
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:name] += data
end
return true
end
def after_doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@current_token[:correct] = false
@stream.unget(data)
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@token_queue << @current_token
@state = :data_state
else
char_stack = [data]
5.times { char_stack << stream.char }
token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
if token == "public" and !char_stack.include?(:EOF)
@state = :before_doctype_public_identifier_state
elsif token == "system" and !char_stack.include?(:EOF)
@state = :before_doctype_system_identifier_state
else
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
@state = :bogus_doctype_state
end
end
return true
end
def before_doctype_public_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:publicId] = ""
@state = :doctype_public_identifier_double_quoted_state
elsif data == "'"
@current_token[:publicId] = ""
@state = :doctype_public_identifier_single_quoted_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def doctype_public_identifier_double_quoted_state
data = @stream.char
if data == "\""
@state = :after_doctype_public_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:publicId] += data
end
return true
end
def doctype_public_identifier_single_quoted_state
data = @stream.char
if data == "'"
@state = :after_doctype_public_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:publicId] += data
end
return true
end
def after_doctype_public_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:systemId] = ""
@state = :doctype_system_identifier_double_quoted_state
elsif data == "'"
@current_token[:systemId] = ""
@state = :doctype_system_identifier_single_quoted_state
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def before_doctype_system_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:systemId] = ""
@state = :doctype_system_identifier_double_quoted_state
elsif data == "'"
@current_token[:systemId] = ""
@state = :doctype_system_identifier_single_quoted_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def doctype_system_identifier_double_quoted_state
data = @stream.char
if data == "\""
@state = :after_doctype_system_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:systemId] += data
end
return true
end
def doctype_system_identifier_single_quoted_state
data = @stream.char
if data == "'"
@state = :after_doctype_system_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:systemId] += data
end
return true
end
def after_doctype_system_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def bogus_doctype_state
data = @stream.char
@current_token[:correct] = false
if data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
# XXX EMIT
@stream.unget(data)
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
end
return true
end
end
end
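
For context, a minimal sketch of driving this tokenizer (assuming the file is
reachable on the load path as html5/tokenizer, per the html5lib gem layout,
and that HTMLInputStream accepts a plain String as its input):

    require 'html5/tokenizer'

    # Each yielded token is a Hash with a :type key such as :StartTag,
    # :Characters, :EndTag or :ParseError.
    HTML5::HTMLTokenizer.new("<p class='intro'>Hi &amp; bye</p>").each do |token|
      p token
    end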