ed68d975df
Fix that Tokenizer bug for real this time.
967 lines
32 KiB
Ruby
967 lines
32 KiB
Ruby
require 'html5/constants'
|
|
require 'html5/inputstream'
|
|
|
|
module HTML5
|
|
|
|
# This class takes care of tokenizing HTML.
|
|
#
|
|
# * @current_token
|
|
# Holds the token that is currently being processed.
|
|
#
|
|
# * @state
|
|
# Holds a reference to the method to be invoked... XXX
|
|
#
|
|
# * @states
|
|
# Holds a mapping between states and methods that implement the state.
|
|
#
|
|
# * @stream
|
|
# Points to HTMLInputStream object.
|
|
|
|
class HTMLTokenizer
|
|
attr_accessor :content_model_flag, :current_token
|
|
attr_reader :stream
|
|
|
|
# XXX need to fix documentation
|
|
|
|
def initialize(stream, options = {})
|
|
@stream = HTMLInputStream.new(stream, options)
|
|
|
|
# Setup the initial tokenizer state
|
|
@content_model_flag = :PCDATA
|
|
@state = :data_state
|
|
@escapeFlag = false
|
|
@lastFourChars = []
|
|
|
|
# The current token being created
|
|
@current_token = nil
|
|
|
|
# Tokens to be processed.
|
|
@token_queue = []
|
|
@lowercase_element_name = options[:lowercase_element_name] != false
|
|
@lowercase_attr_name = options[:lowercase_attr_name] != false
|
|
end
|
|
|
|
# This is where the magic happens.
|
|
#
|
|
# We do our usually processing through the states and when we have a token
|
|
# to return we yield the token which pauses processing until the next token
|
|
# is requested.
|
|
def each
|
|
@token_queue = []
|
|
# Start processing. When EOF is reached @state will return false
|
|
# instead of true and the loop will terminate.
|
|
while send @state
|
|
yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
|
|
yield @token_queue.shift until @token_queue.empty?
|
|
end
|
|
end
|
|
|
|
# Below are various helper functions the tokenizer states use worked out.
|
|
|
|
# If the next character is a '>', convert the current_token into
|
|
# an EmptyTag
|
|
|
|
def process_solidus_in_tag
|
|
|
|
# We need to consume another character to make sure it's a ">"
|
|
data = @stream.char
|
|
|
|
if @current_token[:type] == :StartTag and data == ">"
|
|
@current_token[:type] = :EmptyTag
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
|
|
end
|
|
|
|
# The character we just consumed need to be put back on the stack so it
|
|
# doesn't get lost...
|
|
@stream.unget(data)
|
|
end
|
|
|
|
# This function returns either U+FFFD or the character based on the
|
|
# decimal or hexadecimal representation. It also discards ";" if present.
|
|
# If not present @token_queue << {:type => :ParseError}" is invoked.
|
|
|
|
def consume_number_entity(isHex)
|
|
|
|
# XXX More need to be done here. For instance, #13 should prolly be
|
|
# converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
|
|
# such. Thoughts on this appreciated.
|
|
allowed = DIGITS
|
|
radix = 10
|
|
if isHex
|
|
allowed = HEX_DIGITS
|
|
radix = 16
|
|
end
|
|
|
|
char_stack = []
|
|
|
|
# Consume all the characters that are in range while making sure we
|
|
# don't hit an EOF.
|
|
c = @stream.char
|
|
while allowed.include?(c) and c != :EOF
|
|
char_stack.push(c)
|
|
c = @stream.char
|
|
end
|
|
|
|
# Convert the set of characters consumed to an int.
|
|
charAsInt = char_stack.join('').to_i(radix)
|
|
|
|
if charAsInt == 13
|
|
@token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
|
|
charAsInt = 10
|
|
elsif (128..159).include? charAsInt
|
|
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
|
# and smaller) we need to do the "windows trick".
|
|
@token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
|
|
|
|
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
|
end
|
|
|
|
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
|
|
char = [charAsInt].pack('U')
|
|
else
|
|
char = [0xFFFD].pack('U')
|
|
@token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
|
|
end
|
|
|
|
# Discard the ; if present. Otherwise, put it back on the queue and
|
|
# invoke parse_error on parser.
|
|
if c != ";"
|
|
@token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
|
|
@stream.unget(c)
|
|
end
|
|
|
|
return char
|
|
end
|
|
|
|
def consume_entity(from_attribute=false)
|
|
char = nil
|
|
char_stack = [@stream.char]
|
|
if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
|
|
@stream.unget(char_stack)
|
|
elsif char_stack[0] == '#'
|
|
# We might have a number entity here.
|
|
char_stack += [@stream.char, @stream.char]
|
|
if char_stack[0 .. 1].include? :EOF
|
|
# If we reach the end of the file put everything up to :EOF
|
|
# back in the queue
|
|
char_stack = char_stack[0...char_stack.index(:EOF)]
|
|
@stream.unget(char_stack)
|
|
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
|
|
else
|
|
if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
|
|
# Hexadecimal entity detected.
|
|
@stream.unget(char_stack[2])
|
|
char = consume_number_entity(true)
|
|
elsif DIGITS.include? char_stack[1]
|
|
# Decimal entity detected.
|
|
@stream.unget(char_stack[1..-1])
|
|
char = consume_number_entity(false)
|
|
else
|
|
# No number entity detected.
|
|
@stream.unget(char_stack)
|
|
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
|
|
end
|
|
end
|
|
else
|
|
# At this point in the process might have named entity. Entities
|
|
# are stored in the global variable "entities".
|
|
#
|
|
# Consume characters and compare to these to a substring of the
|
|
# entity names in the list until the substring no longer matches.
|
|
filteredEntityList = ENTITIES.keys
|
|
filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
|
|
entityName = nil
|
|
|
|
# Try to find the longest entity the string will match to take care
|
|
# of ¬i for instance.
|
|
while char_stack.last != :EOF
|
|
name = char_stack.join('')
|
|
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
|
filteredEntityList.reject! {|e| e[0...name.length] != name}
|
|
char_stack.push(@stream.char)
|
|
else
|
|
break
|
|
end
|
|
|
|
if ENTITIES.include? name
|
|
entityName = name
|
|
break if entityName[-1] == ';'
|
|
end
|
|
end
|
|
|
|
if entityName != nil
|
|
char = ENTITIES[entityName]
|
|
|
|
# Check whether or not the last character returned can be
|
|
# discarded or needs to be put back.
|
|
if entityName[-1] != ?;
|
|
@token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
|
|
end
|
|
|
|
if entityName[-1] != ";" and from_attribute and
|
|
(ASCII_LETTERS.include?(char_stack[entityName.length]) or
|
|
DIGITS.include?(char_stack[entityName.length]))
|
|
@stream.unget(char_stack)
|
|
char = '&'
|
|
else
|
|
@stream.unget(char_stack[entityName.length..-1])
|
|
end
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "expected-named-entity"}
|
|
@stream.unget(char_stack)
|
|
end
|
|
end
|
|
return char
|
|
end
|
|
|
|
# This method replaces the need for "entityInAttributeValueState".
|
|
def process_entity_in_attribute
|
|
entity = consume_entity()
|
|
if entity
|
|
@current_token[:data][-1][1] += entity
|
|
else
|
|
@current_token[:data][-1][1] += "&"
|
|
end
|
|
end
|
|
|
|
# This method is a generic handler for emitting the tags. It also sets
|
|
# the state to "data" because that's what's needed after a token has been
|
|
# emitted.
|
|
def emit_current_token
|
|
# Add token to the queue to be yielded
|
|
token = @current_token
|
|
if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
|
|
if @lowercase_element_name
|
|
token[:name] = token[:name].downcase
|
|
end
|
|
@token_queue << token
|
|
@state = :data_state
|
|
end
|
|
|
|
end
|
|
|
|
# Below are the various tokenizer states worked out.
|
|
|
|
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
|
# documents to figure out what the order of the various if and elsif
|
|
# statements should be.
|
|
def data_state
|
|
data = @stream.char
|
|
|
|
if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
|
|
@lastFourChars << data
|
|
@lastFourChars.shift if @lastFourChars.length > 4
|
|
end
|
|
|
|
if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
|
|
@state = :entity_data_state
|
|
elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
|
|
@escapeFlag = true
|
|
@token_queue << {:type => :Characters, :data => data}
|
|
elsif data == "<" and !@escapeFlag and
|
|
[:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
|
|
@state = :tag_open_state
|
|
elsif data == ">" and @escapeFlag and
|
|
[:CDATA,:RCDATA].include?(@content_model_flag) and
|
|
@lastFourChars[1..-1].join('') == "-->"
|
|
@escapeFlag = false
|
|
@token_queue << {:type => :Characters, :data => data}
|
|
|
|
elsif data == :EOF
|
|
# Tokenization ends.
|
|
return false
|
|
|
|
elsif SPACE_CHARACTERS.include? data
|
|
# Directly after emitting a token you switch back to the "data
|
|
# state". At that point SPACE_CHARACTERS are important so they are
|
|
# emitted separately.
|
|
# XXX need to check if we don't need a special "spaces" flag on
|
|
# characters.
|
|
@token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
|
|
else
|
|
@token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
|
|
end
|
|
return true
|
|
end
|
|
|
|
def entity_data_state
|
|
entity = consume_entity
|
|
if entity
|
|
@token_queue << {:type => :Characters, :data => entity}
|
|
else
|
|
@token_queue << {:type => :Characters, :data => "&"}
|
|
end
|
|
@state = :data_state
|
|
return true
|
|
end
|
|
|
|
def tag_open_state
|
|
data = @stream.char
|
|
if @content_model_flag == :PCDATA
|
|
if data == "!"
|
|
@state = :markup_declaration_open_state
|
|
elsif data == "/"
|
|
@state = :close_tag_open_state
|
|
elsif data != :EOF and ASCII_LETTERS.include? data
|
|
@current_token = {:type => :StartTag, :name => data, :data => []}
|
|
@state = :tag_name_state
|
|
elsif data == ">"
|
|
# XXX In theory it could be something besides a tag name. But
|
|
# do we really care?
|
|
@token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
|
|
@token_queue << {:type => :Characters, :data => "<>"}
|
|
@state = :data_state
|
|
elsif data == "?"
|
|
# XXX In theory it could be something besides a tag name. But
|
|
# do we really care?
|
|
@token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
|
|
@stream.unget(data)
|
|
@state = :bogus_comment_state
|
|
else
|
|
# XXX
|
|
@token_queue << {:type => :ParseError, :data => "expected-tag-name"}
|
|
@token_queue << {:type => :Characters, :data => "<"}
|
|
@stream.unget(data)
|
|
@state = :data_state
|
|
end
|
|
else
|
|
# We know the content model flag is set to either RCDATA or CDATA
|
|
# now because this state can never be entered with the PLAINTEXT
|
|
# flag.
|
|
if data == "/"
|
|
@state = :close_tag_open_state
|
|
else
|
|
@token_queue << {:type => :Characters, :data => "<"}
|
|
@stream.unget(data)
|
|
@state = :data_state
|
|
end
|
|
end
|
|
return true
|
|
end
|
|
|
|
def close_tag_open_state
|
|
if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
|
|
if @current_token
|
|
char_stack = []
|
|
|
|
# So far we know that "</" has been consumed. We now need to know
|
|
# whether the next few characters match the name of last emitted
|
|
# start tag which also happens to be the current_token. We also need
|
|
# to have the character directly after the characters that could
|
|
# match the start tag name.
|
|
(@current_token[:name].length + 1).times do
|
|
char_stack.push(@stream.char)
|
|
# Make sure we don't get hit by :EOF
|
|
break if char_stack[-1] == :EOF
|
|
end
|
|
|
|
# Since this is just for checking. We put the characters back on
|
|
# the stack.
|
|
@stream.unget(char_stack)
|
|
end
|
|
|
|
if @current_token and
|
|
@current_token[:name].downcase ==
|
|
char_stack[0...-1].join('').downcase and
|
|
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
|
|
# Because the characters are correct we can safely switch to
|
|
# PCDATA mode now. This also means we don't have to do it when
|
|
# emitting the end tag token.
|
|
@content_model_flag = :PCDATA
|
|
else
|
|
@token_queue << {:type => :Characters, :data => "</"}
|
|
@state = :data_state
|
|
|
|
# Need to return here since we don't want the rest of the
|
|
# method to be walked through.
|
|
return true
|
|
end
|
|
end
|
|
|
|
data = @stream.char
|
|
if data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
|
|
@token_queue << {:type => :Characters, :data => "</"}
|
|
@state = :data_state
|
|
elsif ASCII_LETTERS.include? data
|
|
@current_token = {:type => :EndTag, :name => data, :data => []}
|
|
@state = :tag_name_state
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
|
|
@state = :data_state
|
|
else
|
|
# XXX data can be _'_...
|
|
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
|
|
@stream.unget(data)
|
|
@state = :bogus_comment_state
|
|
end
|
|
|
|
return true
|
|
end
|
|
|
|
def tag_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@state = :before_attribute_name_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
|
|
emit_current_token
|
|
elsif ASCII_LETTERS.include? data
|
|
@current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
|
|
elsif data == ">"
|
|
emit_current_token
|
|
elsif data == "/"
|
|
process_solidus_in_tag
|
|
@state = :before_attribute_name_state
|
|
else
|
|
@current_token[:name] += data
|
|
end
|
|
return true
|
|
end
|
|
|
|
def before_attribute_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@stream.chars_until(SPACE_CHARACTERS, true)
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
|
|
emit_current_token
|
|
elsif ASCII_LETTERS.include? data
|
|
@current_token[:data].push([data, ""])
|
|
@state = :attribute_name_state
|
|
elsif data == ">"
|
|
emit_current_token
|
|
elsif data == "/"
|
|
process_solidus_in_tag
|
|
else
|
|
@current_token[:data].push([data, ""])
|
|
@state = :attribute_name_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def attribute_name_state
|
|
data = @stream.char
|
|
leavingThisState = true
|
|
emitToken = false
|
|
if data == "="
|
|
@state = :before_attribute_value_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
|
|
@state = :data_state
|
|
emitToken = true
|
|
elsif ASCII_LETTERS.include? data
|
|
@current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
|
|
leavingThisState = false
|
|
elsif data == ">"
|
|
# XXX If we emit here the attributes are converted to a dict
|
|
# without being checked and when the code below runs we error
|
|
# because data is a dict not a list
|
|
emitToken = true
|
|
elsif SPACE_CHARACTERS.include? data
|
|
@state = :after_attribute_name_state
|
|
elsif data == "/"
|
|
process_solidus_in_tag
|
|
@state = :before_attribute_name_state
|
|
else
|
|
@current_token[:data][-1][0] += data
|
|
leavingThisState = false
|
|
end
|
|
|
|
if leavingThisState
|
|
# Attributes are not dropped at this stage. That happens when the
|
|
# start tag token is emitted so values can still be safely appended
|
|
# to attributes, but we do want to report the parse error in time.
|
|
if @lowercase_attr_name
|
|
@current_token[:data][-1][0] = @current_token[:data].last.first.downcase
|
|
end
|
|
@current_token[:data][0...-1].each {|name,value|
|
|
if @current_token[:data].last.first == name
|
|
@token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
|
|
break # don't report an error more than once
|
|
end
|
|
}
|
|
# XXX Fix for above XXX
|
|
emit_current_token if emitToken
|
|
end
|
|
return true
|
|
end
|
|
|
|
def after_attribute_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@stream.chars_until(SPACE_CHARACTERS, true)
|
|
elsif data == "="
|
|
@state = :before_attribute_value_state
|
|
elsif data == ">"
|
|
emit_current_token
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
|
|
emit_current_token
|
|
elsif ASCII_LETTERS.include? data
|
|
@current_token[:data].push([data, ""])
|
|
@state = :attribute_name_state
|
|
elsif data == "/"
|
|
process_solidus_in_tag
|
|
@state = :before_attribute_name_state
|
|
else
|
|
@current_token[:data].push([data, ""])
|
|
@state = :attribute_name_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def before_attribute_value_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@stream.chars_until(SPACE_CHARACTERS, true)
|
|
elsif data == "\""
|
|
@state = :attribute_value_double_quoted_state
|
|
elsif data == "&"
|
|
@state = :attribute_value_unquoted_state
|
|
@stream.unget(data);
|
|
elsif data == "'"
|
|
@state = :attribute_value_single_quoted_state
|
|
elsif data == ">"
|
|
emit_current_token
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
|
|
emit_current_token
|
|
else
|
|
@current_token[:data][-1][1] += data
|
|
@state = :attribute_value_unquoted_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def attribute_value_double_quoted_state
|
|
data = @stream.char
|
|
if data == "\""
|
|
@state = :before_attribute_name_state
|
|
elsif data == "&"
|
|
process_entity_in_attribute
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
|
|
emit_current_token
|
|
else
|
|
@current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
|
|
end
|
|
return true
|
|
end
|
|
|
|
def attribute_value_single_quoted_state
|
|
data = @stream.char
|
|
if data == "'"
|
|
@state = :before_attribute_name_state
|
|
elsif data == "&"
|
|
process_entity_in_attribute
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
|
|
emit_current_token
|
|
else
|
|
@current_token[:data][-1][1] += data +\
|
|
@stream.chars_until(["'", "&"])
|
|
end
|
|
return true
|
|
end
|
|
|
|
def attribute_value_unquoted_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@state = :before_attribute_name_state
|
|
elsif data == "&"
|
|
process_entity_in_attribute
|
|
elsif data == ">"
|
|
emit_current_token
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
|
|
emit_current_token
|
|
else
|
|
@current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
|
|
end
|
|
return true
|
|
end
|
|
|
|
def bogus_comment_state
|
|
# Make a new comment token and give it as value all the characters
|
|
# until the first > or :EOF (chars_until checks for :EOF automatically)
|
|
# and emit it.
|
|
@token_queue << {:type => :Comment, :data => @stream.chars_until((">"))}
|
|
|
|
# Eat the character directly after the bogus comment which is either a
|
|
# ">" or an :EOF.
|
|
@stream.char
|
|
@state = :data_state
|
|
return true
|
|
end
|
|
|
|
def markup_declaration_open_state
|
|
char_stack = [@stream.char, @stream.char]
|
|
if char_stack == ["-", "-"]
|
|
@current_token = {:type => :Comment, :data => ""}
|
|
@state = :comment_start_state
|
|
else
|
|
5.times { char_stack.push(@stream.char) }
|
|
# Put in explicit :EOF check
|
|
if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
|
|
@current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
|
|
@state = :doctype_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
|
|
@stream.unget(char_stack)
|
|
@state = :bogus_comment_state
|
|
end
|
|
end
|
|
return true
|
|
end
|
|
|
|
def comment_start_state
|
|
data = @stream.char
|
|
if data == "-"
|
|
@state = :comment_start_dash_state
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:data] += data + @stream.chars_until("-")
|
|
@state = :comment_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def comment_start_dash_state
|
|
data = @stream.char
|
|
if data == "-"
|
|
@state = :comment_end_state
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:data] += '-' + data + @stream.chars_until("-")
|
|
@state = :comment_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def comment_state
|
|
data = @stream.char
|
|
if data == "-"
|
|
@state = :comment_end_dash_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:data] += data + @stream.chars_until("-")
|
|
end
|
|
return true
|
|
end
|
|
|
|
def comment_end_dash_state
|
|
data = @stream.char
|
|
if data == "-"
|
|
@state = :comment_end_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:data] += "-" + data +\
|
|
@stream.chars_until("-")
|
|
# Consume the next character which is either a "-" or an :EOF as
|
|
# well so if there's a "-" directly after the "-" we go nicely to
|
|
# the "comment end state" without emitting a ParseError there.
|
|
@stream.char
|
|
end
|
|
return true
|
|
end
|
|
|
|
def comment_end_state
|
|
data = @stream.char
|
|
if data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == "-"
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
|
|
@current_token[:data] += data
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
# XXX
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
|
|
@current_token[:data] += "--" + data
|
|
@state = :comment_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def doctype_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@state = :before_doctype_name_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
|
|
@stream.unget(data)
|
|
@state = :before_doctype_name_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def before_doctype_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:name] = data
|
|
@state = :doctype_name_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def doctype_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
@state = :after_doctype_name_state
|
|
elsif data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:name] += data
|
|
end
|
|
|
|
return true
|
|
end
|
|
|
|
def after_doctype_name_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include? data
|
|
elsif data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@current_token[:correct] = false
|
|
@stream.unget(data)
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
char_stack = [data]
|
|
5.times { char_stack << stream.char }
|
|
token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
|
|
if token == "public" and !char_stack.include?(:EOF)
|
|
@state = :before_doctype_public_identifier_state
|
|
elsif token == "system" and !char_stack.include?(:EOF)
|
|
@state = :before_doctype_system_identifier_state
|
|
else
|
|
@stream.unget(char_stack)
|
|
@token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
|
|
@state = :bogus_doctype_state
|
|
end
|
|
end
|
|
return true
|
|
end
|
|
|
|
def before_doctype_public_identifier_state
|
|
data = @stream.char
|
|
|
|
if SPACE_CHARACTERS.include?(data)
|
|
elsif data == "\""
|
|
@current_token[:publicId] = ""
|
|
@state = :doctype_public_identifier_double_quoted_state
|
|
elsif data == "'"
|
|
@current_token[:publicId] = ""
|
|
@state = :doctype_public_identifier_single_quoted_state
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
|
@state = :bogus_doctype_state
|
|
end
|
|
|
|
return true
|
|
end
|
|
|
|
def doctype_public_identifier_double_quoted_state
|
|
data = @stream.char
|
|
if data == "\""
|
|
@state = :after_doctype_public_identifier_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:publicId] += data
|
|
end
|
|
return true
|
|
end
|
|
|
|
def doctype_public_identifier_single_quoted_state
|
|
data = @stream.char
|
|
if data == "'"
|
|
@state = :after_doctype_public_identifier_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:publicId] += data
|
|
end
|
|
return true
|
|
end
|
|
|
|
def after_doctype_public_identifier_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include?(data)
|
|
elsif data == "\""
|
|
@current_token[:systemId] = ""
|
|
@state = :doctype_system_identifier_double_quoted_state
|
|
elsif data == "'"
|
|
@current_token[:systemId] = ""
|
|
@state = :doctype_system_identifier_single_quoted_state
|
|
elsif data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@state = :bogus_doctype_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def before_doctype_system_identifier_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include?(data)
|
|
elsif data == "\""
|
|
@current_token[:systemId] = ""
|
|
@state = :doctype_system_identifier_double_quoted_state
|
|
elsif data == "'"
|
|
@current_token[:systemId] = ""
|
|
@state = :doctype_system_identifier_single_quoted_state
|
|
elsif data == ">"
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
|
@state = :bogus_doctype_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def doctype_system_identifier_double_quoted_state
|
|
data = @stream.char
|
|
if data == "\""
|
|
@state = :after_doctype_system_identifier_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:systemId] += data
|
|
end
|
|
return true
|
|
end
|
|
|
|
def doctype_system_identifier_single_quoted_state
|
|
data = @stream.char
|
|
if data == "'"
|
|
@state = :after_doctype_system_identifier_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@current_token[:systemId] += data
|
|
end
|
|
return true
|
|
end
|
|
|
|
def after_doctype_system_identifier_state
|
|
data = @stream.char
|
|
if SPACE_CHARACTERS.include?(data)
|
|
elsif data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
else
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@state = :bogus_doctype_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
def bogus_doctype_state
|
|
data = @stream.char
|
|
@current_token[:correct] = false
|
|
if data == ">"
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
elsif data == :EOF
|
|
# XXX EMIT
|
|
@stream.unget(data)
|
|
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
|
@current_token[:correct] = false
|
|
@token_queue << @current_token
|
|
@state = :data_state
|
|
end
|
|
return true
|
|
end
|
|
|
|
end
|
|
|
|
end
|