Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer.
Fixed the string-handling to work in both
Ruby 1.8.x and 1.9.2. There are still,
inexplicably, two functional tests that
fail. But the rest seems to work quite well.
Jacques Distler 2009-11-30 16:28:18 -06:00
parent 79c8572053
commit a6429f8c22
142 changed files with 519 additions and 843 deletions
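
The string-handling fix mentioned above mostly comes down to branching on
whether String supports encodings, as in consume_number_entity in the file
below. A minimal sketch of that pattern (charAsInt stands for the numeric
character reference being converted, matching the variable name in the file):

    # Works on both Ruby 1.8.x (byte-oriented strings) and 1.9.x (encoded strings).
    if String.method_defined? :force_encoding
      char = charAsInt.chr('utf-8')   # Ruby 1.9: Integer#chr accepts an encoding
    else
      char = [charAsInt].pack('U')    # Ruby 1.8: pack the code point as UTF-8
    end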


@@ -0,0 +1,970 @@
require 'html5/constants'
require 'html5/inputstream'
module HTML5
# This class takes care of tokenizing HTML.
#
# * @current_token
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
attr_accessor :content_model_flag, :current_token
attr_reader :stream
# XXX need to fix documentation
def initialize(stream, options = {})
@stream = HTMLInputStream.new(stream, options)
# Setup the initial tokenizer state
@content_model_flag = :PCDATA
@state = :data_state
@escapeFlag = false
@lastFourChars = []
# The current token being created
@current_token = nil
# Tokens to be processed.
@token_queue = []
@lowercase_element_name = options[:lowercase_element_name] != false
@lowercase_attr_name = options[:lowercase_attr_name] != false
end
# This is where the magic happens.
#
# We do our usual processing through the states and when we have a token
# to return we yield the token which pauses processing until the next token
# is requested.
def each
@token_queue = []
# Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate.
while send @state
yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
yield @token_queue.shift until @token_queue.empty?
end
end
# Below are the various helper functions used by the tokenizer states.
# If the next character is a '>', convert the current_token into
# an EmptyTag
def process_solidus_in_tag
# We need to consume another character to make sure it's a ">"
data = @stream.char
if @current_token[:type] == :StartTag and data == ">"
@current_token[:type] = :EmptyTag
else
@token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
end
# The character we just consumed needs to be put back on the stack so it
# doesn't get lost...
@stream.unget(data)
end
# This function returns either U+FFFD or the character corresponding to the
# decimal or hexadecimal representation. It also discards the ";" if present.
# If the ";" is missing, a :ParseError token is pushed onto @token_queue.
def consume_number_entity(isHex)
# XXX More needs to be done here. For instance, #13 should probably be
# converted to #10 so we don't get \r (#13 is \r, right?) in the DOM and
# such. Thoughts on this appreciated.
allowed = DIGITS
radix = 10
if isHex
allowed = HEX_DIGITS
radix = 16
end
char_stack = []
# Consume all the characters that are in range while making sure we
# don't hit an EOF.
c = @stream.char
while allowed.include?(c) and c != :EOF
char_stack.push(c)
c = @stream.char
end
# Convert the set of characters consumed to an int.
charAsInt = char_stack.join('').to_i(radix)
if charAsInt == 13
@token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
charAsInt = 10
elsif (128..159).include? charAsInt
# If the integer is between 128 and 159 inclusive we need to do the
# "windows trick".
@token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
if String.method_defined? :force_encoding
char = charAsInt.chr('utf-8')
else
char = [charAsInt].pack('U')
end
else
char = [0xFFFD].pack('U')
@token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
end
# Discard the ";" if present. Otherwise, put the character back on the
# stream and report a parse error.
if c != ";"
@token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
@stream.unget(c)
end
return char
end
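# Consume an entity reference starting at the current stream position (the
# caller has already consumed the "&"). Returns the replacement text (or a
# literal "&"), or nil if nothing resembling an entity is found. When
# from_attribute is true, a named entity without a trailing semicolon that
# is followed by an alphanumeric character is left as raw text, per the
# attribute-value rules.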
def consume_entity(from_attribute=false)
char = nil
char_stack = [@stream.char]
if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
@stream.unget(char_stack)
elsif char_stack[0] == '#'
# We might have a number entity here.
char_stack += [@stream.char, @stream.char]
if char_stack[0 .. 1].include? :EOF
# If we reach the end of the file put everything up to the :EOF
# back on the stream
char_stack = char_stack[0...char_stack.index(:EOF)]
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
else
if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
# Hexadecimal entity detected.
@stream.unget(char_stack[2])
char = consume_number_entity(true)
elsif DIGITS.include? char_stack[1]
# Decimal entity detected.
@stream.unget(char_stack[1..-1])
char = consume_number_entity(false)
else
# No number entity detected.
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
end
end
else
# At this point in the process we might have a named entity. Entities
# are stored in the ENTITIES constant.
#
# Consume characters and compare them to a substring of the entity
# names in the list until the substring no longer matches.
filteredEntityList = ENTITIES.keys
filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
entityName = nil
# Try to find the longest entity the string will match to take care
# of &noti for instance.
while char_stack.last != :EOF
name = char_stack.join('')
if filteredEntityList.any? {|e| e[0...name.length] == name}
filteredEntityList.reject! {|e| e[0...name.length] != name}
char_stack.push(@stream.char)
else
break
end
if ENTITIES.include? name
entityName = name
break if entityName[-1] == ';'
end
end
if entityName != nil
char = ENTITIES[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if entityName[-1] != ?;
@token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
end
if entityName[-1] != ";" and from_attribute and
(ASCII_LETTERS.include?(char_stack[entityName.length]) or
DIGITS.include?(char_stack[entityName.length]))
@stream.unget(char_stack)
char = '&'
else
@stream.unget(char_stack[entityName.length..-1])
end
else
@token_queue << {:type => :ParseError, :data => "expected-named-entity"}
@stream.unget(char_stack)
end
end
return char
end
# This method replaces the need for "entityInAttributeValueState".
def process_entity_in_attribute
entity = consume_entity()
if entity
@current_token[:data][-1][1] += entity
else
@current_token[:data][-1][1] += "&"
end
end
# This method is a generic handler for emitting the tags. It also sets
# the state to "data" because that's what's needed after a token has been
# emitted.
def emit_current_token
# Add token to the queue to be yielded
token = @current_token
if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
if @lowercase_element_name
token[:name] = token[:name].downcase
end
@token_queue << token
@state = :data_state
end
end
# Below are the various tokenizer states.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
def data_state
data = @stream.char
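# Keep a window of the last four characters seen so the CDATA/RCDATA
# escape sequences "<!--" and "-->" can be recognized below.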
if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
@lastFourChars << data
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
@state = :entity_data_state
elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
@escapeFlag = true
@token_queue << {:type => :Characters, :data => data}
elsif data == "<" and !@escapeFlag and
[:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
@state = :tag_open_state
elsif data == ">" and @escapeFlag and
[:CDATA,:RCDATA].include?(@content_model_flag) and
@lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false
@token_queue << {:type => :Characters, :data => data}
elsif data == :EOF
# Tokenization ends.
return false
elsif SPACE_CHARACTERS.include? data
# Directly after emitting a token you switch back to the "data
# state". At that point SPACE_CHARACTERS are important so they are
# emitted separately.
# XXX need to check if we don't need a special "spaces" flag on
# characters.
@token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
else
@token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
end
return true
end
def entity_data_state
entity = consume_entity
if entity
@token_queue << {:type => :Characters, :data => entity}
else
@token_queue << {:type => :Characters, :data => "&"}
end
@state = :data_state
return true
end
def tag_open_state
data = @stream.char
if @content_model_flag == :PCDATA
if data == "!"
@state = :markup_declaration_open_state
elsif data == "/"
@state = :close_tag_open_state
elsif data != :EOF and ASCII_LETTERS.include? data
@current_token = {:type => :StartTag, :name => data, :data => []}
@state = :tag_name_state
elsif data == ">"
# XXX In theory it could be something besides a tag name. But
# do we really care?
@token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
@token_queue << {:type => :Characters, :data => "<>"}
@state = :data_state
elsif data == "?"
# XXX In theory it could be something besides a tag name. But
# do we really care?
@token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
@stream.unget(data)
@state = :bogus_comment_state
else
# XXX
@token_queue << {:type => :ParseError, :data => "expected-tag-name"}
@token_queue << {:type => :Characters, :data => "<"}
@stream.unget(data)
@state = :data_state
end
else
# We know the content model flag is set to either RCDATA or CDATA
# now because this state can never be entered with the PLAINTEXT
# flag.
if data == "/"
@state = :close_tag_open_state
else
@token_queue << {:type => :Characters, :data => "<"}
@stream.unget(data)
@state = :data_state
end
end
return true
end
def close_tag_open_state
if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
if @current_token
char_stack = []
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of the last emitted
# start tag, which also happens to be the current_token. We also need
# the character directly after the characters that could match the
# start tag name.
(@current_token[:name].length + 1).times do
char_stack.push(@stream.char)
# Make sure we don't get hit by :EOF
break if char_stack[-1] == :EOF
end
# Since this is just a check, we put the characters back on
# the stack.
@stream.unget(char_stack)
end
if @current_token and
@current_token[:name].downcase ==
char_stack[0...-1].join('').downcase and
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
# Because the characters are correct we can safely switch to
# PCDATA mode now. This also means we don't have to do it when
# emitting the end tag token.
@content_model_flag = :PCDATA
else
@token_queue << {:type => :Characters, :data => "</"}
@state = :data_state
# Need to return here since we don't want the rest of the
# method to be walked through.
return true
end
end
data = @stream.char
if data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
@token_queue << {:type => :Characters, :data => "</"}
@state = :data_state
elsif ASCII_LETTERS.include? data
@current_token = {:type => :EndTag, :name => data, :data => []}
@state = :tag_name_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
@state = :data_state
else
# XXX data can be _'_...
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
@stream.unget(data)
@state = :bogus_comment_state
end
return true
end
def tag_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_attribute_name_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
elsif data == ">"
emit_current_token
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:name] += data
end
return true
end
def before_attribute_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:data].push([data, ""])
@state = :attribute_name_state
elsif data == ">"
emit_current_token
elsif data == "/"
process_solidus_in_tag
else
@current_token[:data].push([data, ""])
@state = :attribute_name_state
end
return true
end
def attribute_name_state
data = @stream.char
leavingThisState = true
emitToken = false
if data == "="
@state = :before_attribute_value_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
@state = :data_state
emitToken = true
elsif ASCII_LETTERS.include? data
@current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
leavingThisState = false
elsif data == ">"
# XXX If we emit here the attributes are converted to a dict
# without being checked and when the code below runs we error
# because data is a dict not a list
emitToken = true
elsif SPACE_CHARACTERS.include? data
@state = :after_attribute_name_state
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:data][-1][0] += data
leavingThisState = false
end
if leavingThisState
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
if @lowercase_attr_name
@current_token[:data][-1][0] = @current_token[:data].last.first.downcase
end
@current_token[:data][0...-1].each {|name,value|
if @current_token[:data].last.first == name
@token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
break # don't report an error more than once
end
}
# XXX Fix for above XXX
emit_current_token if emitToken
end
return true
end
def after_attribute_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "="
@state = :before_attribute_value_state
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
emit_current_token
elsif ASCII_LETTERS.include? data
@current_token[:data].push([data, ""])
@state = :attribute_name_state
elsif data == "/"
process_solidus_in_tag
@state = :before_attribute_name_state
else
@current_token[:data].push([data, ""])
@state = :attribute_name_state
end
return true
end
def before_attribute_value_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@stream.chars_until(SPACE_CHARACTERS, true)
elsif data == "\""
@state = :attribute_value_double_quoted_state
elsif data == "&"
@state = :attribute_value_unquoted_state
@stream.unget(data);
elsif data == "'"
@state = :attribute_value_single_quoted_state
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
emit_current_token
else
@current_token[:data][-1][1] += data
@state = :attribute_value_unquoted_state
end
return true
end
def attribute_value_double_quoted_state
data = @stream.char
if data == "\""
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
emit_current_token
else
@current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
end
return true
end
def attribute_value_single_quoted_state
data = @stream.char
if data == "'"
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
emit_current_token
else
@current_token[:data][-1][1] += data +\
@stream.chars_until(["'", "&"])
end
return true
end
def attribute_value_unquoted_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_attribute_name_state
elsif data == "&"
process_entity_in_attribute
elsif data == ">"
emit_current_token
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
emit_current_token
else
@current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
end
return true
end
def bogus_comment_state
# Make a new comment token and give it as value all the characters
# until the first > or :EOF (chars_until checks for :EOF automatically)
# and emit it.
@token_queue << {:type => :Comment, :data => @stream.chars_until(">")}
# Eat the character directly after the bogus comment which is either a
# ">" or an :EOF.
@stream.char
@state = :data_state
return true
end
def markup_declaration_open_state
char_stack = [@stream.char, @stream.char]
if char_stack == ["-", "-"]
@current_token = {:type => :Comment, :data => ""}
@state = :comment_start_state
else
5.times { char_stack.push(@stream.char) }
# Put in explicit :EOF check
if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
@current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
@state = :doctype_state
else
@token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
@stream.unget(char_stack)
@state = :bogus_comment_state
end
end
return true
end
def comment_start_state
data = @stream.char
if data == "-"
@state = :comment_start_dash_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += data + @stream.chars_until("-")
@state = :comment_state
end
return true
end
def comment_start_dash_state
data = @stream.char
if data == "-"
@state = :comment_end_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += '-' + data + @stream.chars_until("-")
@state = :comment_state
end
return true
end
def comment_state
data = @stream.char
if data == "-"
@state = :comment_end_dash_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += data + @stream.chars_until("-")
end
return true
end
def comment_end_dash_state
data = @stream.char
if data == "-"
@state = :comment_end_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
@token_queue << @current_token
@state = :data_state
else
@current_token[:data] += "-" + data +\
@stream.chars_until("-")
# Consume the next character which is either a "-" or an :EOF as
# well so if there's a "-" directly after the "-" we go nicely to
# the "comment end state" without emitting a ParseError there.
@stream.char
end
return true
end
def comment_end_state
data = @stream.char
if data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == "-"
@token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
@current_token[:data] += data
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
@token_queue << @current_token
@state = :data_state
else
# XXX
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
@current_token[:data] += "--" + data
@state = :comment_state
end
return true
end
def doctype_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :before_doctype_name_state
else
@token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
@stream.unget(data)
@state = :before_doctype_name_state
end
return true
end
def before_doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:name] = data
@state = :doctype_name_state
end
return true
end
def doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
@state = :after_doctype_name_state
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:name] += data
end
return true
end
def after_doctype_name_state
data = @stream.char
if SPACE_CHARACTERS.include? data
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@current_token[:correct] = false
@stream.unget(data)
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@token_queue << @current_token
@state = :data_state
else
char_stack = [data]
5.times { char_stack << stream.char }
token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
if token == "public" and !char_stack.include?(:EOF)
@state = :before_doctype_public_identifier_state
elsif token == "system" and !char_stack.include?(:EOF)
@state = :before_doctype_system_identifier_state
else
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
@state = :bogus_doctype_state
end
end
return true
end
def before_doctype_public_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:publicId] = ""
@state = :doctype_public_identifier_double_quoted_state
elsif data == "'"
@current_token[:publicId] = ""
@state = :doctype_public_identifier_single_quoted_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def doctype_public_identifier_double_quoted_state
data = @stream.char
if data == "\""
@state = :after_doctype_public_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:publicId] += data
end
return true
end
def doctype_public_identifier_single_quoted_state
data = @stream.char
if data == "'"
@state = :after_doctype_public_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:publicId] += data
end
return true
end
def after_doctype_public_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:systemId] = ""
@state = :doctype_system_identifier_double_quoted_state
elsif data == "'"
@current_token[:systemId] = ""
@state = :doctype_system_identifier_single_quoted_state
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def before_doctype_system_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@current_token[:systemId] = ""
@state = :doctype_system_identifier_double_quoted_state
elsif data == "'"
@current_token[:systemId] = ""
@state = :doctype_system_identifier_single_quoted_state
elsif data == ">"
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def doctype_system_identifier_double_quoted_state
data = @stream.char
if data == "\""
@state = :after_doctype_system_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:systemId] += data
end
return true
end
def doctype_system_identifier_single_quoted_state
data = @stream.char
if data == "'"
@state = :after_doctype_system_identifier_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@current_token[:systemId] += data
end
return true
end
def after_doctype_system_identifier_state
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
else
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@state = :bogus_doctype_state
end
return true
end
def bogus_doctype_state
data = @stream.char
@current_token[:correct] = false
if data == ">"
@token_queue << @current_token
@state = :data_state
elsif data == :EOF
# XXX EMIT
@stream.unget(data)
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
@current_token[:correct] = false
@token_queue << @current_token
@state = :data_state
end
return true
end
end
end
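
For context, a minimal sketch of driving this tokenizer (assuming the file is
reachable on the load path as html5/tokenizer, per the html5lib gem layout,
and that HTMLInputStream accepts a plain String as its input):

    require 'html5/tokenizer'

    # Each yielded token is a Hash with a :type key such as :StartTag,
    # :Characters, :EndTag or :ParseError.
    HTML5::HTMLTokenizer.new("<p class='intro'>Hi &amp; bye</p>").each do |token|
      p token
    end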