require 'html5lib/constants'
require 'html5lib/inputstream'

module HTML5lib

  # This class takes care of tokenizing HTML.
  #
  # * @currentToken
  #   Holds the token that is currently being processed.
  #
  # * @state
  #   Holds a reference to the method that implements the current state.
  #
  # * @states
  #   Holds a mapping between state names and the methods that implement them.
  #
  # * @stream
  #   Points to the HTMLInputStream object.
  class HTMLTokenizer

    attr_accessor :contentModelFlag, :currentToken

    attr_reader :stream

    # XXX need to fix documentation

    def initialize(stream, options = {})
      @stream = HTMLInputStream.new(stream, options)

      @states = {
        :data => :dataState,
        :entityData => :entityDataState,
        :tagOpen => :tagOpenState,
        :closeTagOpen => :closeTagOpenState,
        :tagName => :tagNameState,
        :beforeAttributeName => :beforeAttributeNameState,
        :attributeName => :attributeNameState,
        :afterAttributeName => :afterAttributeNameState,
        :beforeAttributeValue => :beforeAttributeValueState,
        :attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
        :attributeValueSingleQuoted => :attributeValueSingleQuotedState,
        :attributeValueUnQuoted => :attributeValueUnQuotedState,
        :bogusComment => :bogusCommentState,
        :markupDeclarationOpen => :markupDeclarationOpenState,
        :comment => :commentState,
        :commentDash => :commentDashState,
        :commentEnd => :commentEndState,
        :doctype => :doctypeState,
        :beforeDoctypeName => :beforeDoctypeNameState,
        :doctypeName => :doctypeNameState,
        :afterDoctypeName => :afterDoctypeNameState,
        :bogusDoctype => :bogusDoctypeState
      }

      # Set up the initial tokenizer state.
      @contentModelFlag = :PCDATA
      @state = @states[:data]

      # The current token being created.
      @currentToken = nil

      # Tokens to be processed.
      @tokenQueue = []
    end

    # This is where the magic happens.
    #
    # We do our usual processing through the states and when we have a token
    # to return we yield the token, which pauses processing until the next
    # token is requested.
    def each
      @tokenQueue = []
      # Start processing. When EOF is reached @state will return false
      # instead of true and the loop will terminate.
      while send @state
        until @tokenQueue.empty?
          yield @tokenQueue.shift
        end
      end
    end

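    # A minimal usage sketch, assuming HTMLInputStream accepts a plain String
    # (as the html5lib parser front end does); the token hashes follow the
    # shapes pushed onto @tokenQueue below:
    #
    #   tokenizer = HTML5lib::HTMLTokenizer.new("<p id='x'>Hello</p>")
    #   tokenizer.each { |token| p token }
    #
    # which should yield, roughly:
    #
    #   {:type => :StartTag, :name => "p", :data => [["id", "x"]]}
    #   {:type => :Characters, :data => "Hello"}
    #   {:type => :EndTag, :name => "p", :data => []}
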
    # Below are various helper methods the tokenizer states use.

    # If the next character is a '>', convert the currentToken into
    # an EmptyTag.
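    #
    # For example, with "<br/>" the '/' is seen in the tag and the following
    # '>' turns the :StartTag token for "br" into an :EmptyTag token, while
    # with "<a/ href='x'>" the '/' only produces a :ParseError token and is
    # otherwise ignored.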
    def processSolidusInTag
      # We need to consume another character to make sure it's a ">".
      data = @stream.char

      if @currentToken[:type] == :StartTag and data == ">"
        @currentToken[:type] = :EmptyTag
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Solidus (/) incorrectly placed in tag.")})
      end

      # The character we just consumed needs to be put back on the stream's
      # queue so it doesn't get lost.
      @stream.queue.push(data)
    end

    # This method returns either U+FFFD or the character based on the
    # decimal or hexadecimal representation. It also discards ";" if present.
    # If it is not present, a :ParseError token is pushed onto @tokenQueue.
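    #
    # For example, "&#65;" and "&#x41;" both come out as "A", and a missing
    # semicolon ("&#65") additionally produces a :ParseError token. Assuming
    # ENTITIES_WINDOWS1252 mirrors the Windows-1252 code page, "&#150;" is
    # remapped through that table (to U+2013, an en dash) instead of being
    # treated as a C1 control character.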
    def consumeNumberEntity(isHex)
      # XXX More needs to be done here. For instance, #13 should probably be
      # converted to #10 so we don't get \r (#13 is \r, right?) in the DOM
      # and such. Thoughts on this appreciated.
      allowed = DIGITS
      radix = 10
      if isHex
        allowed = HEX_DIGITS
        radix = 16
      end

      char = [0xFFFD].pack('U')
      charStack = []

      # Consume all the characters that are in range while making sure we
      # don't hit an EOF.
      c = @stream.char
      while allowed.include?(c) and c != :EOF
        charStack.push(c)
        c = @stream.char
      end

      # Convert the set of characters consumed to an int.
      charAsInt = charStack.join('').to_i(radix)

      # If the integer is between 128 and 159 inclusive we need to do the
      # "windows trick".
      if (128...160).include? charAsInt
        # XXX - removed parse error from windows-1252 entity for now;
        # we may want to re-enable this later.
        #@tokenQueue.push({:type => :ParseError, :data =>
        #  _("Entity used with illegal number (windows-1252 reference).")})

        charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
      end

      # 0 is not a good number.
      if charAsInt == 0
        charAsInt = 65533
      end

      if charAsInt <= 0x10FFFF
        char = [charAsInt].pack('U')
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Numeric entity couldn't be converted to character.")})
      end

      # Discard the ";" if present. Otherwise, put it back on the queue and
      # report a parse error.
      if c != ";"
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Numeric entity didn't end with ';'.")})
        @stream.queue.push(c)
      end

      return char
    end

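    # Consume an entity reference after an "&" has been seen and return its
    # replacement text, or nil when no entity could be consumed (the caller
    # then emits a literal "&"). A sketch of the behaviour, assuming "amp" is
    # among the ENTITIES keys and maps to "&":
    #
    #   "&amp;"   -> "&"
    #   "&amp "   -> "&" plus a :ParseError (missing ";"); " " stays in the stream
    #   "&#x26;"  -> "&" via consumeNumberEntity
    #   "&nosuch" -> nil plus a :ParseError; the characters are put back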
    def consumeEntity
      char = nil
      charStack = [@stream.char]
      if charStack[0] == "#"
        # We might have a number entity here.
        charStack += [@stream.char, @stream.char]
        if charStack.include? :EOF
          # If we reach the end of the file, put everything up to :EOF
          # back in the queue.
          charStack = charStack[0...charStack.index(:EOF)]
          @stream.queue += charStack
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Numeric entity expected. Got end of file instead.")})
        else
          if charStack[1].downcase == "x" and HEX_DIGITS.include? charStack[2]
            # Hexadecimal entity detected.
            @stream.queue.push(charStack[2])
            char = consumeNumberEntity(true)
          elsif DIGITS.include? charStack[1]
            # Decimal entity detected.
            @stream.queue += charStack[1..-1]
            char = consumeNumberEntity(false)
          else
            # No number entity detected.
            @stream.queue += charStack
            @tokenQueue.push({:type => :ParseError, :data =>
              _("Numeric entity expected but none found.")})
          end
        end
      # Break out if we reach the end of the file.
      elsif charStack[0] == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Entity expected. Got end of file instead.")})
      else
        # At this point we might have a named entity. Entities are stored
        # in the ENTITIES constant.
        #
        # Consume characters and compare them to a substring of the entity
        # names in the list until the substring no longer matches.
        filteredEntityList = ENTITIES.keys
        filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
        entityName = nil

        while charStack[-1] != :EOF
          name = charStack.join('')
          if filteredEntityList.any? {|e| e[0...name.length] == name}
            filteredEntityList.reject! {|e| e[0...name.length] != name}
            charStack.push(@stream.char)
          else
            break
          end

          if ENTITIES.include? name
            entityName = name
          end
        end

        if entityName != nil
          char = ENTITIES[entityName]

          # Check whether or not the last character returned can be
          # discarded or needs to be put back.
          if charStack[-1] != ";"
            @tokenQueue.push({:type => :ParseError, :data =>
              _("Named entity didn't end with ';'.")})
            @stream.queue += charStack[entityName.length..-1]
          end
        else
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Named entity expected. Got none.")})
          @stream.queue += charStack
        end
      end
      return char
    end

    # This method replaces the need for "entityInAttributeValueState".
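    #
    # For example, given '<a title="x&amp;y">' and assuming ENTITIES maps
    # "amp" to "&", the title attribute's value ends up as "x&y"; when no
    # entity can be consumed, the literal "&" is appended instead.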
    def processEntityInAttribute
      entity = consumeEntity
      if entity
        @currentToken[:data][-1][1] += entity
      else
        @currentToken[:data][-1][1] += "&"
      end
    end

    # This method is a generic handler for emitting the tags. It also sets
    # the state to "data" because that's what's needed after a token has
    # been emitted.
    def emitCurrentToken
      # Add the token to the queue to be yielded.
      @tokenQueue.push(@currentToken)
      @state = @states[:data]
    end

    # Below are the various tokenizer states.

    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
    # documents to figure out what the order of the various if and elsif
    # statements should be.
    def dataState
      data = @stream.char
      if data == "&" and (@contentModelFlag == :PCDATA or
         @contentModelFlag == :RCDATA)
        @state = @states[:entityData]
      elsif data == "<" and @contentModelFlag != :PLAINTEXT
        @state = @states[:tagOpen]
      elsif data == :EOF
        # Tokenization ends.
        return false
      elsif SPACE_CHARACTERS.include? data
        # Directly after emitting a token you switch back to the "data
        # state". At that point SPACE_CHARACTERS are important so they are
        # emitted separately.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        @tokenQueue.push({:type => :SpaceCharacters, :data =>
          data + @stream.chars_until(SPACE_CHARACTERS, true)})
      else
        @tokenQueue.push({:type => :Characters, :data =>
          data + @stream.chars_until(["&", "<"])})
      end
      return true
    end

    def entityDataState
      entity = consumeEntity
      if entity
        @tokenQueue.push({:type => :Characters, :data => entity})
      else
        @tokenQueue.push({:type => :Characters, :data => "&"})
      end
      @state = @states[:data]
      return true
    end

    def tagOpenState
      data = @stream.char
      if @contentModelFlag == :PCDATA
        if data == "!"
          @state = @states[:markupDeclarationOpen]
        elsif data == "/"
          @state = @states[:closeTagOpen]
        elsif data != :EOF and ASCII_LETTERS.include? data
          @currentToken = {:type => :StartTag, :name => data, :data => []}
          @state = @states[:tagName]
        elsif data == ">"
          # XXX In theory it could be something besides a tag name. But
          # do we really care?
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected tag name. Got '>' instead.")})
          @tokenQueue.push({:type => :Characters, :data => "<>"})
          @state = @states[:data]
        elsif data == "?"
          # XXX In theory it could be something besides a tag name. But
          # do we really care?
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected tag name. Got '?' instead (HTML doesn't " +
            "support processing instructions).")})
          @stream.queue.push(data)
          @state = @states[:bogusComment]
        else
          # XXX
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected tag name. Got something else instead")})
          @tokenQueue.push({:type => :Characters, :data => "<"})
          @stream.queue.push(data)
          @state = @states[:data]
        end
      else
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == "/"
          @state = @states[:closeTagOpen]
        else
          @tokenQueue.push({:type => :Characters, :data => "<"})
          @stream.queue.insert(0, data)
          @state = @states[:data]
        end
      end
      return true
    end

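    # When the content model flag is RCDATA or CDATA, "</" only closes the
    # element if it is followed by the name of the last emitted start tag
    # (plus a space, ">", "/", "<" or EOF); otherwise it is emitted as
    # literal characters. For example, assuming the tree builder switches the
    # flag to RCDATA for textarea, "</div>" inside a textarea comes out as a
    # :Characters token while "</textarea>" switches the tokenizer back to
    # PCDATA.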
    def closeTagOpenState
      if (@contentModelFlag == :RCDATA or @contentModelFlag == :CDATA)
        if @currentToken
          charStack = []

          # So far we know that "</" has been consumed. We now need to know
          # whether the next few characters match the name of the last
          # emitted start tag, which also happens to be the currentToken. We
          # also need to have the character directly after the characters
          # that could match the start tag name.
          (@currentToken[:name].length + 1).times do
            charStack.push(@stream.char)
            # Make sure we don't get hit by :EOF.
            break if charStack[-1] == :EOF
          end

          # Since this is just a check, we put the characters back on the
          # queue.
          @stream.queue += charStack
        end

        if @currentToken and
           @currentToken[:name].downcase ==
           charStack[0...-1].join('').downcase and
           (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? charStack[-1]
          # Because the characters are correct we can safely switch to
          # PCDATA mode now. This also means we don't have to do it when
          # emitting the end tag token.
          @contentModelFlag = :PCDATA
        else
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected closing tag after seeing '</'. None found.")})
          @tokenQueue.push({:type => :Characters, :data => "</"})
          @state = @states[:data]

          # Need to return here since we don't want the rest of the
          # method to be walked through.
          return true
        end
      end

      if @contentModelFlag == :PCDATA
        data = @stream.char
        if data == :EOF
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected closing tag. Unexpected end of file.")})
          @tokenQueue.push({:type => :Characters, :data => "</"})
          @state = @states[:data]
        elsif ASCII_LETTERS.include? data
          @currentToken = {:type => :EndTag, :name => data, :data => []}
          @state = @states[:tagName]
        elsif data == ">"
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
          @state = @states[:data]
        else
          # XXX data can be _'_...
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected closing tag. Unexpected character '" + data + "' found.")})
          @stream.queue.push(data)
          @state = @states[:bogusComment]
        end
      end
      return true
    end

    def tagNameState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @state = @states[:beforeAttributeName]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in the tag name.")})
        emitCurrentToken
      elsif ASCII_LETTERS.include? data
        @currentToken[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
      elsif data == ">"
        emitCurrentToken
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character when getting the tag name.")})
        emitCurrentToken
      elsif data == "/"
        processSolidusInTag
        @state = @states[:beforeAttributeName]
      else
        @currentToken[:name] += data
      end
      return true
    end

    def beforeAttributeNameState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @stream.chars_until(SPACE_CHARACTERS, true)
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file. Expected attribute name instead.")})
        emitCurrentToken
      elsif ASCII_LETTERS.include? data
        @currentToken[:data].push([data, ""])
        @state = @states[:attributeName]
      elsif data == ">"
        emitCurrentToken
      elsif data == "/"
        processSolidusInTag
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character. Expected attribute name instead.")})
        emitCurrentToken
      else
        @currentToken[:data].push([data, ""])
        @state = @states[:attributeName]
      end
      return true
    end

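    # Attribute names are accumulated on the last [name, value] pair in
    # @currentToken[:data]. When the name is complete it is compared against
    # the earlier attributes, so that, for example, '<p id="a" id="b">'
    # produces a "Dropped duplicate attribute on tag." parse error.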
    def attributeNameState
      data = @stream.char
      leavingThisState = true
      if data == "="
        @state = @states[:beforeAttributeValue]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in attribute name.")})
        emitCurrentToken
        leavingThisState = false
      elsif ASCII_LETTERS.include? data
        @currentToken[:data][-1][0] += data +
          @stream.chars_until(ASCII_LETTERS, true)
        leavingThisState = false
      elsif data == ">"
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
      elsif SPACE_CHARACTERS.include? data
        @state = @states[:afterAttributeName]
      elsif data == "/"
        processSolidusInTag
        @state = @states[:beforeAttributeName]
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character in attribute name.")})
        emitCurrentToken
        leavingThisState = false
      else
        @currentToken[:data][-1][0] += data
        leavingThisState = false
      end

      if leavingThisState
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        @currentToken[:data][0...-1].each {|name, value|
          if @currentToken[:data][-1][0] == name
            @tokenQueue.push({:type => :ParseError, :data =>
              _("Dropped duplicate attribute on tag.")})
          end
        }
        # XXX Fix for above XXX
        if data == ">"
          emitCurrentToken
        end
      end
      return true
    end

    def afterAttributeNameState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @stream.chars_until(SPACE_CHARACTERS, true)
      elsif data == "="
        @state = @states[:beforeAttributeValue]
      elsif data == ">"
        emitCurrentToken
      elsif ASCII_LETTERS.include? data
        @currentToken[:data].push([data, ""])
        @state = @states[:attributeName]
      elsif data == "/"
        processSolidusInTag
        @state = @states[:beforeAttributeName]
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character. Expected = or end of tag.")})
        emitCurrentToken
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file. Expected = or end of tag.")})
        emitCurrentToken
      else
        @currentToken[:data].push([data, ""])
        @state = @states[:attributeName]
      end
      return true
    end

    def beforeAttributeValueState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @stream.chars_until(SPACE_CHARACTERS, true)
      elsif data == "\""
        @state = @states[:attributeValueDoubleQuoted]
      elsif data == "&"
        @state = @states[:attributeValueUnQuoted]
        @stream.queue.push(data)
      elsif data == "'"
        @state = @states[:attributeValueSingleQuoted]
      elsif data == ">"
        emitCurrentToken
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character. Expected attribute value.")})
        emitCurrentToken
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file. Expected attribute value.")})
        emitCurrentToken
      else
        @currentToken[:data][-1][1] += data
        @state = @states[:attributeValueUnQuoted]
      end
      return true
    end

    def attributeValueDoubleQuotedState
      data = @stream.char
      if data == "\""
        @state = @states[:beforeAttributeName]
      elsif data == "&"
        processEntityInAttribute
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in attribute value (\").")})
        emitCurrentToken
      else
        @currentToken[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
      end
      return true
    end

    def attributeValueSingleQuotedState
      data = @stream.char
      if data == "'"
        @state = @states[:beforeAttributeName]
      elsif data == "&"
        processEntityInAttribute
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in attribute value (').")})
        emitCurrentToken
      else
        @currentToken[:data][-1][1] += data + @stream.chars_until(["'", "&"])
      end
      return true
    end

    def attributeValueUnQuotedState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @state = @states[:beforeAttributeName]
      elsif data == "&"
        processEntityInAttribute
      elsif data == ">"
        emitCurrentToken
      elsif data == "<"
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected < character in attribute value.")})
        emitCurrentToken
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in attribute value.")})
        emitCurrentToken
      else
        @currentToken[:data][-1][1] += data +
          @stream.chars_until(["&", ">", "<"] + SPACE_CHARACTERS)
      end
      return true
    end

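    # Anything that reaches this state is swallowed up to the next ">" and
    # re-emitted as a comment. For example, the processing instruction
    # "<?php echo 1 ?>" comes out as a :Comment token whose data is
    # "?php echo 1 ?" (plus the parse error already reported in
    # tagOpenState).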
    def bogusCommentState
      # Make a new comment token and give it as value all the characters
      # until the first > or :EOF (chars_until checks for :EOF
      # automatically) and emit it.
      @tokenQueue.push(
        {:type => :Comment, :data => @stream.chars_until(">")})

      # Eat the character directly after the bogus comment, which is either
      # a ">" or an :EOF.
      @stream.char
      @state = @states[:data]
      return true
    end

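    # "<!" has been consumed when we get here. "<!--" starts a comment,
    # "<!DOCTYPE" (case-insensitively) starts a doctype, and anything else,
    # e.g. "<!ELEMENT foo ANY>", is reported as a parse error and handed to
    # the bogus comment state, which turns it into a :Comment token.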
    def markupDeclarationOpenState
      charStack = [@stream.char, @stream.char]
      if charStack == ["-", "-"]
        @currentToken = {:type => :Comment, :data => ""}
        @state = @states[:comment]
      else
        5.times { charStack.push(@stream.char) }
        # Put in explicit :EOF check.
        if ((not charStack.include? :EOF) and
          charStack.join("").upcase == "DOCTYPE")
          @currentToken = {:type => :Doctype, :name => "", :data => true}
          @state = @states[:doctype]
        else
          @tokenQueue.push({:type => :ParseError, :data =>
            _("Expected '--' or 'DOCTYPE'. Not found.")})
          @stream.queue += charStack
          @state = @states[:bogusComment]
        end
      end
      return true
    end

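    # The comment states cooperate as follows: commentState collects text up
    # to a "-", commentDashState checks for a second "-", and commentEndState
    # checks for the closing ">". For example, "<!--hello-->" produces a
    # :Comment token with data "hello".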
    def commentState
      data = @stream.char
      if data == "-"
        @state = @states[:commentDash]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in comment.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        @currentToken[:data] += data + @stream.chars_until("-")
      end
      return true
    end

    def commentDashState
      data = @stream.char
      if data == "-"
        @state = @states[:commentEnd]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in comment (-)")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        @currentToken[:data] += "-" + data + @stream.chars_until("-")
        # Consume the next character, which is either a "-" or an :EOF, as
        # well, so if there's a "-" directly after the "-" we go nicely to
        # the "comment end state" without emitting a ParseError there.
        @stream.char
      end
      return true
    end

    def commentEndState
      data = @stream.char
      if data == ">"
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      elsif data == "-"
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected '-' after '--' found in comment.")})
        @currentToken[:data] += data
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in comment (--).")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        # XXX
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected character in comment found.")})
        @currentToken[:data] += "--" + data
        @state = @states[:comment]
      end
      return true
    end

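    # The DOCTYPE states build a :Doctype token whose :name is uppercased and
    # whose :data flag starts out true and is cleared only when the name ends
    # up exactly "HTML". For example, "<!DOCTYPE html>" yields
    # {:type => :Doctype, :name => "HTML", :data => false}, while
    # "<!DOCTYPE foo>" keeps :data set to true.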
    def doctypeState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
        @state = @states[:beforeDoctypeName]
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("No space after literal string 'DOCTYPE'.")})
        @stream.queue.push(data)
        @state = @states[:beforeDoctypeName]
      end
      return true
    end

    def beforeDoctypeNameState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
      elsif ASCII_LOWERCASE.include? data
        @currentToken[:name] = data.upcase
        @state = @states[:doctypeName]
      elsif data == ">"
        # The character needs to be consumed per the specification, so don't
        # invoke emitCurrentTokenWithParseError with :data as argument.
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected > character. Expected DOCTYPE name.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file. Expected DOCTYPE name.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        @currentToken[:name] = data
        @state = @states[:doctypeName]
      end
      return true
    end

    def doctypeNameState
      data = @stream.char
      needsDoctypeCheck = false
      if SPACE_CHARACTERS.include? data
        @state = @states[:afterDoctypeName]
        needsDoctypeCheck = true
      elsif data == ">"
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      elsif data == :EOF
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in DOCTYPE name.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if ASCII_LOWERCASE.include? data
          data = data.upcase
        end
        @currentToken[:name] += data
        needsDoctypeCheck = true
      end

      # After some iterations through this state the name should eventually
      # say "HTML". Otherwise there's an error.
      if needsDoctypeCheck and @currentToken[:name] == "HTML"
        @currentToken[:data] = false
      end
      return true
    end

    def afterDoctypeNameState
      data = @stream.char
      if SPACE_CHARACTERS.include? data
      elsif data == ">"
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      elsif data == :EOF
        @currentToken[:data] = true
        # XXX EMIT
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in DOCTYPE.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Expected space or '>'. Got '" + data + "'")})
        @currentToken[:data] = true
        @state = @states[:bogusDoctype]
      end
      return true
    end

    def bogusDoctypeState
      data = @stream.char
      if data == ">"
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      elsif data == :EOF
        # XXX EMIT
        @stream.queue.push(data)
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Unexpected end of file in bogus doctype.")})
        @tokenQueue.push(@currentToken)
        @state = @states[:data]
      end
      return true
    end

    def _(string); string; end

  end

end