HTML5lib is Back.

Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer.
2007-05-30 10:45:52 -05:00 · 2007-05-30 10:45:52 -05:00 · 4dd70af5ae
commit 4dd70af5ae
parent e1a6827f1f
39 changed files with 4843 additions and 5576 deletions
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -3,14 +3,14 @@ require 'html5lib/constants'

 module HTML5lib

-# Provides a unicode stream of characters to the HTMLTokenizer.
+  # Provides a unicode stream of characters to the HTMLTokenizer.

-# This class takes care of character encoding and removing or replacing
-# incorrect byte-sequences and also provides column and line tracking.
+  # This class takes care of character encoding and removing or replacing
+  # incorrect byte-sequences and also provides column and line tracking.

-class HTMLInputStream
+  class HTMLInputStream

-    attr_accessor :queue, :charEncoding
+    attr_accessor :queue, :char_encoding

    # Initialises the HTMLInputStream.
    # 
@ -27,523 +27,524 @@ class HTMLInputStream
    # parseMeta - Look for a <meta> element containing encoding information

    def initialize(source, options = {})
-        @encoding = nil
-        @parseMeta = true
-        @chardet = true
+      @encoding = nil
+      @parse_meta = true
+      @chardet = true

-        options.each { |name, value| instance_variable_set("@#{name}", value) }
+      options.each { |name, value| instance_variable_set("@#{name}", value) }

-        # List of where new lines occur
-        @newLines = []
+      # List of where new lines occur
+      @new_lines = []

-        # Raw Stream
-        @rawStream = openStream(source)
+      # Raw Stream
+      @raw_stream = open_stream(source)

-        # Encoding Information
-        #Number of bytes to use when looking for a meta element with
-        #encoding information
-        @NUM_BYTES_META = 512
-        #Encoding to use if no other information can be found
-        @DEFAULT_ENCODING = 'windows-1252'
-        
-        #Detect encoding iff no explicit "transport level" encoding is supplied
-        if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
-            @charEncoding = detectEncoding
-        else
-            @charEncoding = @encoding
+      # Encoding Information
+      #Number of bytes to use when looking for a meta element with
+      #encoding information
+      @NUM_BYTES_META = 512
+      #Encoding to use if no other information can be found
+      @DEFAULT_ENCODING = 'windows-1252'
+    
+      #Detect encoding iff no explicit "transport level" encoding is supplied
+      if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
+        @char_encoding = detect_encoding
+      else
+        @char_encoding = @encoding
+      end
+
+      # Read bytes from stream decoding them into Unicode
+      uString = @raw_stream.read
+      unless @char_encoding == 'utf-8'
+        begin
+          require 'iconv'
+          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
+        rescue
        end
+      end

-        # Read bytes from stream decoding them into Unicode
-        uString = @rawStream.read
-        unless @charEncoding == 'utf-8'
-            begin
-                require 'iconv'
-                uString = Iconv.iconv('utf-8', @encoding, uString)[0]
-            rescue
-            end
-        end
+      # Normalize newlines and null characters
+      uString.gsub!(/\r\n?/, "\n")
+      uString.gsub!("\x00", [0xFFFD].pack('U'))

-        # Normalize newlines and null characters
-        uString.gsub!(/\r\n?/, "\n")
-        uString.gsub!("\x00", [0xFFFD].pack('U'))
+      # Convert the unicode string into a list to be used as the data stream
+      @data_stream = uString

-        # Convert the unicode string into a list to be used as the data stream
-        @dataStream = uString
+      @queue = []

-        @queue = []
-
-        # Reset position in the list to read from
-        reset
+      # Reset position in the list to read from
+      reset
    end

    # Produces a file object from source.
    #
    # source can be either a file object, local filename or a string.
-    def openStream(source)
-        # Already an IO like object
-        if source.respond_to?(:read)
-            @stream = source
-        else
-            # Treat source as a string and wrap in StringIO
-            @stream = StringIO.new(source)
-        end
-        return @stream
+    def open_stream(source)
+      # Already an IO like object
+      if source.respond_to?(:read)
+        @stream = source
+      else
+        # Treat source as a string and wrap in StringIO
+        @stream = StringIO.new(source)
+      end
+      return @stream
    end

-    def detectEncoding
+    def detect_encoding

-        #First look for a BOM
-        #This will also read past the BOM if present
-        encoding = detectBOM
-        #If there is no BOM need to look for meta elements with encoding 
-        #information
-        if encoding.nil? and @parseMeta
-            encoding = detectEncodingMeta
+      #First look for a BOM
+      #This will also read past the BOM if present
+      encoding = detect_bom
+      #If there is no BOM need to look for meta elements with encoding 
+      #information
+      if encoding.nil? and @parse_meta
+        encoding = detect_encoding_meta
+      end
+      #Guess with chardet, if avaliable
+      if encoding.nil? and @chardet
+        begin
+          require 'rubygems'
+          require 'UniversalDetector' # gem install chardet
+          buffer = @raw_stream.read
+          encoding = UniversalDetector::chardet(buffer)['encoding']
+          @raw_stream = open_stream(buffer)
+        rescue LoadError
        end
-        #Guess with chardet, if avaliable
-        if encoding.nil? and @chardet
-            begin
-                require 'rubygems'
-                require 'UniversalDetector' # gem install chardet
-                buffer = @rawStream.read
-                encoding = UniversalDetector::chardet(buffer)['encoding']
-                @rawStream = openStream(buffer)
-            rescue LoadError
-            end
-        end
-        # If all else fails use the default encoding
-        if encoding.nil?
-            encoding = @DEFAULT_ENCODING
-        end
-        
-        #Substitute for equivalent encodings:
-        encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
+      end
+      # If all else fails use the default encoding
+      if encoding.nil?
+        encoding = @DEFAULT_ENCODING
+      end
+    
+      #Substitute for equivalent encodings:
+      encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}

-        if encodingSub.has_key?(encoding.downcase)
-            encoding = encodingSub[encoding.downcase]
-        end
+      if encoding_sub.has_key?(encoding.downcase)
+        encoding = encoding_sub[encoding.downcase]
+      end

-        return encoding
+      return encoding
    end

    # Attempts to detect at BOM at the start of the stream. If
    # an encoding can be determined from the BOM return the name of the
    # encoding otherwise return nil
-    def detectBOM
-        bomDict = {
-            "\xef\xbb\xbf" => 'utf-8',
-            "\xff\xfe" => 'utf-16-le',
-            "\xfe\xff" => 'utf-16-be',
-            "\xff\xfe\x00\x00" => 'utf-32-le',
-            "\x00\x00\xfe\xff" => 'utf-32-be'
-        }
+    def detect_bom
+      bom_dict = {
+        "\xef\xbb\xbf" => 'utf-8',
+        "\xff\xfe" => 'utf-16-le',
+        "\xfe\xff" => 'utf-16-be',
+        "\xff\xfe\x00\x00" => 'utf-32-le',
+        "\x00\x00\xfe\xff" => 'utf-32-be'
+      }

-        # Go to beginning of file and read in 4 bytes
-        @rawStream.seek(0)
-        string = @rawStream.read(4)
-        return nil unless string
+      # Go to beginning of file and read in 4 bytes
+      @raw_stream.seek(0)
+      string = @raw_stream.read(4)
+      return nil unless string

-        # Try detecting the BOM using bytes from the string
-        encoding = bomDict[string[0...3]]          # UTF-8
-        seek = 3
+      # Try detecting the BOM using bytes from the string
+      encoding = bom_dict[string[0...3]]      # UTF-8
+      seek = 3
+      unless encoding
+        # Need to detect UTF-32 before UTF-16
+        encoding = bom_dict[string]       # UTF-32
+        seek = 4
        unless encoding
-            # Need to detect UTF-32 before UTF-16
-            encoding = bomDict[string]             # UTF-32
-            seek = 4
-            unless encoding
-                encoding = bomDict[string[0...2]]  # UTF-16
-                seek = 2
-            end
+          encoding = bom_dict[string[0...2]]  # UTF-16
+          seek = 2
        end
+      end

-        #AT - move this to the caller?
-        # Set the read position past the BOM if one was found, otherwise
-        # set it to the start of the stream
-        @rawStream.seek(encoding ? seek : 0)
+      #AT - move this to the caller?
+      # Set the read position past the BOM if one was found, otherwise
+      # set it to the start of the stream
+      @raw_stream.seek(encoding ? seek : 0)

-        return encoding
+      return encoding
    end

    # Report the encoding declared by the meta element
-    def detectEncodingMeta
-        parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
-        @rawStream.seek(0)
-        return parser.getEncoding
+    def detect_encoding_meta
+      parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
+      @raw_stream.seek(0)
+      return parser.get_encoding
    end

-    def determineNewLines
-        # Looks through the stream to find where new lines occur so
-        # the position method can tell where it is.
-        @newLines.push(0)
-        (0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
+    def determine_new_lines
+      # Looks through the stream to find where new lines occur so
+      # the position method can tell where it is.
+      @new_lines.push(0)
+      (0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
    end

    # Returns (line, col) of the current position in the stream.
    def position
-        # Generate list of new lines first time around
-        determineNewLines if @newLines.empty?
-        line = 0
-        tell = @tell
-        @newLines.each do |pos|
-            break unless pos < tell
-            line += 1
-        end
-        col = tell - @newLines[line-1] - 1
-        return [line, col]
+      # Generate list of new lines first time around
+      determine_new_lines if @new_lines.empty?
+      line = 0
+      tell = @tell
+      @new_lines.each do |pos|
+        break unless pos < tell
+        line += 1
+      end
+      col = tell - @new_lines[line-1] - 1
+      return [line, col]
    end

    # Resets the position in the stream back to the start.
    def reset
-        @tell = 0
+      @tell = 0
    end

    # Read one character from the stream or queue if available. Return
    # EOF when EOF is reached.
    def char
-        unless @queue.empty?
-            return @queue.shift
-        else
-            begin
-                @tell += 1
-                return @dataStream[@tell - 1].chr
-            rescue
-                return :EOF
-            end
+      unless @queue.empty?
+        return @queue.shift
+      else
+        begin
+          @tell += 1
+          return @data_stream[@tell - 1].chr
+        rescue
+          return :EOF
        end
+      end
    end

    # Returns a string of characters from the stream up to but not
    # including any character in characters or EOF. characters can be
    # any container that supports the in method being called on it.
-    def charsUntil(characters, opposite = false)
-        charStack = [char]
+    def chars_until(characters, opposite=false)
+      char_stack = [char]

-        unless charStack[0] == :EOF
-            while (characters.include? charStack[-1]) == opposite
-                unless @queue.empty?
-                    # First from the queue
-                    charStack.push(@queue.shift)
-                    break if charStack[-1] == :EOF
-                else
-                    # Then the rest
-                    begin
-                        charStack.push(@dataStream[@tell].chr)
-                        @tell += 1
-                    rescue
-                        charStack.push(:EOF)
-                        break
-                    end
-                end
+      unless char_stack[0] == :EOF
+        while (characters.include? char_stack[-1]) == opposite
+          unless @queue.empty?
+            # First from the queue
+            char_stack.push(@queue.shift)
+            break if char_stack[-1] == :EOF
+          else
+            # Then the rest
+            begin
+              char_stack.push(@data_stream[@tell].chr)
+              @tell += 1
+            rescue
+              char_stack.push(:EOF)
+              break
            end
+          end
        end
+      end

-        # Put the character stopped on back to the front of the queue
-        # from where it came.
-        @queue.insert(0, charStack.pop)
-        return charStack.join('')
+      # Put the character stopped on back to the front of the queue
+      # from where it came.
+      @queue.insert(0, char_stack.pop)
+      return char_stack.join('')
    end
-end
+  end

-# String-like object with an assosiated position and various extra methods
-# If the position is ever greater than the string length then an exception is raised
-class EncodingBytes < String
+  # String-like object with an assosiated position and various extra methods
+  # If the position is ever greater than the string length then an exception is raised
+  class EncodingBytes < String

    attr_accessor :position

    def initialize(value)
-        super(value)
-        @position = -1
+      super(value)
+      @position = -1
    end
-    
+  
    def each
-        while @position < length
-            @position += 1
-            yield self[@position]
-        end
+      while @position < length
+        @position += 1
+        yield self[@position]
+      end
    rescue EOF
    end
-    
-    def currentByte
-        raise EOF if @position >= length
-        return self[@position].chr
+  
+    def current_byte
+      raise EOF if @position >= length
+      return self[@position].chr
    end
-    
+  
    # Skip past a list of characters
-    def skip(chars = SPACE_CHARACTERS)
-        while chars.include?(currentByte)
-            @position += 1
-        end
+    def skip(chars=SPACE_CHARACTERS)
+      while chars.include?(current_byte)
+        @position += 1
+      end
    end

    # Look for a sequence of bytes at the start of a string. If the bytes 
    # are found return true and advance the position to the byte after the 
    # match. Otherwise return false and leave the position alone
-    def matchBytes(bytes, lower = false)
-        data = self[position ... position+bytes.length]
-        data.downcase! if lower
-        rv = (data == bytes)
-        @position += bytes.length if rv == true
-        return rv
+    def match_bytes(bytes, lower=false)
+      data = self[position ... position+bytes.length]
+      data.downcase! if lower
+      rv = (data == bytes)
+      @position += bytes.length if rv == true
+      return rv
    end
-    
+  
    # Look for the next sequence of bytes matching a given sequence. If
    # a match is found advance the position to the last byte of the match
-    def jumpTo(bytes)
-        newPosition = self[position .. -1].index(bytes)
-        if newPosition
-            @position += (newPosition + bytes.length-1)
-            return true
-        else
-            raise EOF
-        end
+    def jump_to(bytes)
+      new_position = self[position .. -1].index(bytes)
+      if new_position
+        @position += (new_position + bytes.length-1)
+        return true
+      else
+        raise EOF
+      end
    end
-    
+  
    # Move the pointer so it points to the next byte in a set of possible
    # bytes
-    def findNext(byteList)
-        until byteList.include?(currentByte)
-            @position += 1
-        end
+    def find_next(byte_list)
+      until byte_list.include?(current_byte)
+        @position += 1
+      end
    end
-end
+  end

-# Mini parser for detecting character encoding from meta elements
-class EncodingParser
+  # Mini parser for detecting character encoding from meta elements
+  class EncodingParser

    # string - the data to work on for encoding detection
    def initialize(data)
-        @data = EncodingBytes.new(data.to_s)
-        @encoding = nil
+      @data = EncodingBytes.new(data.to_s)
+      @encoding = nil
    end

    @@method_dispatch = [
-        ['<!--', :handleComment],
-        ['<meta', :handleMeta],
-        ['</', :handlePossibleEndTag],
-        ['<!', :handleOther],
-        ['<?', :handleOther],
-        ['<', :handlePossibleStartTag]
+      ['<!--', :handle_comment],
+      ['<meta', :handle_meta],
+      ['</', :handle_possible_end_tag],
+      ['<!', :handle_other],
+      ['<?', :handle_other],
+      ['<', :handle_possible_start_tag]
    ]

-    def getEncoding
-        @data.each do |byte|
-            keepParsing = true
-            @@method_dispatch.each do |(key, method)|
-                if @data.matchBytes(key, lower = true)
-                    keepParsing = send(method)    
-                    break
-                end
-            end
-            break unless keepParsing
+    def get_encoding
+      @data.each do |byte|
+        keep_parsing = true
+        @@method_dispatch.each do |(key, method)|
+          if @data.match_bytes(key, lower = true)
+            keep_parsing = send(method)
+            break
+          end
        end
-        @encoding = @encoding.strip unless @encoding.nil?
-        return @encoding
+        break unless keep_parsing
+      end
+      @encoding = @encoding.strip unless @encoding.nil?
+      return @encoding
    end

    # Skip over comments
-    def handleComment
-        return @data.jumpTo('-->')
+    def handle_comment
+      return @data.jump_to('-->')
    end

-    def handleMeta
-        # if we have <meta not followed by a space so just keep going
-        return true unless SPACE_CHARACTERS.include?(@data.currentByte)
+    def handle_meta
+      # if we have <meta not followed by a space so just keep going
+      return true unless SPACE_CHARACTERS.include?(@data.current_byte)

-        #We have a valid meta element we want to search for attributes
-        while true
-            #Try to find the next attribute after the current position
-            attr = getAttribute
+      #We have a valid meta element we want to search for attributes
+      while true
+        #Try to find the next attribute after the current position
+        attr = get_attribute

-            return true if attr.nil?
-                
-            if attr[0] == 'charset'
-                tentativeEncoding = attr[1]
-                if HTML5lib.isValidEncoding(tentativeEncoding)
-                    @encoding = tentativeEncoding    
-                    return false
-                end
-            elsif attr[0] == 'content'
-                contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
-                tentativeEncoding = contentParser.parse
-                if HTML5lib.isValidEncoding(tentativeEncoding)
-                    @encoding = tentativeEncoding    
-                    return false
-                end
-            end
-        end
-    end
-
-    def handlePossibleStartTag
-        return handlePossibleTag(false)
-    end
-
-    def handlePossibleEndTag
-        @data.position+=1
-        return handlePossibleTag(true)
-    end
-
-    def handlePossibleTag(endTag)
-        unless ASCII_LETTERS.include?(@data.currentByte)
-            #If the next byte is not an ascii letter either ignore this
-            #fragment (possible start tag case) or treat it according to 
-            #handleOther
-            if endTag
-                @data.position -= 1
-                handleOther
-            end
-            return true
-        end
+        return true if attr.nil?
        
-        @data.findNext(SPACE_CHARACTERS + ['<', '>'])
+        if attr[0] == 'charset'
+          tentative_encoding = attr[1]
+          if HTML5lib.is_valid_encoding(tentative_encoding)
+            @encoding = tentative_encoding  
+            return false
+          end
+        elsif attr[0] == 'content'
+          content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
+          tentative_encoding = content_parser.parse
+          if HTML5lib.is_valid_encoding(tentative_encoding)
+            @encoding = tentative_encoding
+            return false
+          end
+        end
+      end
+    end

-        if @data.currentByte == '<'
-            #return to the first step in the overall "two step" algorithm
-            #reprocessing the < byte
-            @data.position -= 1    
-        else
-            #Read all attributes
-            {} until getAttribute.nil?
+    def handle_possible_start_tag
+      return handle_possible_tag(false)
+    end
+
+    def handle_possible_end_tag
+      @data.position += 1
+      return handle_possible_tag(true)
+    end
+
+    def handle_possible_tag(end_tag)
+      unless ASCII_LETTERS.include?(@data.current_byte)
+        #If the next byte is not an ascii letter either ignore this
+        #fragment (possible start tag case) or treat it according to 
+        #handleOther
+        if end_tag
+          @data.position -= 1
+          handle_other
        end
        return true
+      end
+    
+      @data.find_next(SPACE_CHARACTERS + ['<', '>'])
+
+      if @data.current_byte == '<'
+        #return to the first step in the overall "two step" algorithm
+        #reprocessing the < byte
+        @data.position -= 1  
+      else
+        #Read all attributes
+        {} until get_attribute.nil?
+      end
+      return true
    end

-    def handleOther
-        return @data.jumpTo('>')
+    def handle_other
+      return @data.jump_to('>')
    end

-    # Return a name,value pair for the next attribute in the stream, 
+    # Return a name,value pair for the next attribute in the stream,
    # if one is found, or nil
-    def getAttribute
-        @data.skip(SPACE_CHARACTERS + ['/'])
+    def get_attribute
+      @data.skip(SPACE_CHARACTERS + ['/'])

-        if @data.currentByte == '<'
-            @data.position -= 1
-            return nil
-        elsif @data.currentByte == '>'
-            return nil
-        end
+      if @data.current_byte == '<'
+        @data.position -= 1
+        return nil
+      elsif @data.current_byte == '>'
+        return nil
+      end

-        attrName = []
-        attrValue = []
-        spaceFound = false
-        #Step 5 attribute name
-        while true
-            if @data.currentByte == '=' and attrName:   
-                break
-            elsif SPACE_CHARACTERS.include?(@data.currentByte)
-                spaceFound = true
-                break
-            elsif ['/', '<', '>'].include?(@data.currentByte)
-                return [attrName.join(''), '']
-            elsif ASCII_UPPERCASE.include?(@data.currentByte)
-                attrName.push(@data.currentByte.downcase)
-            else
-                attrName.push(@data.currentByte)
-            end
-            #Step 6
-            @data.position += 1
-        end
-        #Step 7
-        if spaceFound
-            @data.skip
-            #Step 8
-            unless @data.currentByte == '='
-                @data.position -= 1
-                return [attrName.join(''), '']
-            end
-        end
-        #XXX need to advance position in both spaces and value case
-        #Step 9
-        @data.position += 1
-        #Step 10
-        @data.skip
-        #Step 11
-        if ["'", '"'].include?(@data.currentByte)
-            #11.1
-            quoteChar = @data.currentByte
-            while true
-                @data.position+=1
-                #11.3
-                if @data.currentByte == quoteChar
-                    @data.position += 1
-                    return [attrName.join(''), attrValue.join('')]
-                #11.4
-                elsif ASCII_UPPERCASE.include?(@data.currentByte)
-                    attrValue.push(@data.currentByte.downcase)
-                #11.5
-                else
-                    attrValue.push(@data.currentByte)
-                end
-            end
-        elsif ['>', '<'].include?(@data.currentByte)
-            return [attrName.join(''), '']
-        elsif ASCII_UPPERCASE.include?(@data.currentByte)
-            attrValue.push(@data.currentByte.downcase)
+      attr_name = []
+      attr_value = []
+      space_found = false
+      #Step 5 attribute name
+      while true
+        if @data.current_byte == '=' and attr_name:
+          break
+        elsif SPACE_CHARACTERS.include?(@data.current_byte)
+          space_found = true
+          break
+        elsif ['/', '<', '>'].include?(@data.current_byte)
+          return [attr_name.join(''), '']
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_name.push(@data.current_byte.downcase)
        else
-            attrValue.push(@data.currentByte)
+          attr_name.push(@data.current_byte)
        end
+        #Step 6
+        @data.position += 1
+      end
+      #Step 7
+      if space_found
+        @data.skip
+        #Step 8
+        unless @data.current_byte == '='
+          @data.position -= 1
+          return [attr_name.join(''), '']
+        end
+      end
+      #XXX need to advance position in both spaces and value case
+      #Step 9
+      @data.position += 1
+      #Step 10
+      @data.skip
+      #Step 11
+      if ["'", '"'].include?(@data.current_byte)
+        #11.1
+        quote_char = @data.current_byte
        while true
-            @data.position +=1
-            if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
-                return [attrName.join(''), attrValue.join('')]
-            elsif ASCII_UPPERCASE.include?(@data.currentByte)
-                attrValue.push(@data.currentByte.downcase)
-            else
-                attrValue.push(@data.currentByte)
-            end
+          @data.position+=1
+          #11.3
+          if @data.current_byte == quote_char
+            @data.position += 1
+            return [attr_name.join(''), attr_value.join('')]
+          #11.4
+          elsif ASCII_UPPERCASE.include?(@data.current_byte)
+            attr_value.push(@data.current_byte.downcase)
+          #11.5
+          else
+            attr_value.push(@data.current_byte)
+          end
        end
+      elsif ['>', '<'].include?(@data.current_byte)
+        return [attr_name.join(''), '']
+      elsif ASCII_UPPERCASE.include?(@data.current_byte)
+        attr_value.push(@data.current_byte.downcase)
+      else
+        attr_value.push(@data.current_byte)
+      end
+      while true
+        @data.position += 1
+        if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
+          return [attr_name.join(''), attr_value.join('')]
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_value.push(@data.current_byte.downcase)
+        else
+          attr_value.push(@data.current_byte)
+        end
+      end
    end
-end
+  end

-class ContentAttrParser
+  class ContentAttrParser
    def initialize(data)
-        @data = data
+      @data = data
    end
+
    def parse
-        begin
-            #Skip to the first ";"
-            @data.position = 0
-            @data.jumpTo(';')
-            @data.position += 1
-            @data.skip
-            #Check if the attr name is charset 
-            #otherwise return
-            @data.jumpTo('charset')
-            @data.position += 1
-            @data.skip
-            unless @data.currentByte == '='
-                #If there is no = sign keep looking for attrs
-                return nil
-            end
-            @data.position += 1
-            @data.skip
-            #Look for an encoding between matching quote marks
-            if ['"', "'"].include?(@data.currentByte)
-                quoteMark = @data.currentByte
-                @data.position += 1
-                oldPosition = @data.position
-                @data.jumpTo(quoteMark)
-                return @data[oldPosition ... @data.position]
-            else
-                #Unquoted value
-                oldPosition = @data.position
-                begin
-                    @data.findNext(SPACE_CHARACTERS)
-                    return @data[oldPosition ... @data.position]
-                rescue EOF
-                    #Return the whole remaining value
-                    return @data[oldPosition .. -1]
-                end
-            end
-        rescue EOF
-            return nil
+      begin
+        #Skip to the first ";"
+        @data.position = 0
+        @data.jump_to(';')
+        @data.position += 1
+        @data.skip
+        #Check if the attr name is charset 
+        #otherwise return
+        @data.jump_to('charset')
+        @data.position += 1
+        @data.skip
+        unless @data.current_byte == '='
+          #If there is no = sign keep looking for attrs
+          return nil
        end
+        @data.position += 1
+        @data.skip
+        #Look for an encoding between matching quote marks
+        if ['"', "'"].include?(@data.current_byte)
+          quote_mark = @data.current_byte
+          @data.position += 1
+          old_position = @data.position
+          @data.jump_to(quote_mark)
+          return @data[old_position ... @data.position]
+        else
+          #Unquoted value
+          old_position = @data.position
+          begin
+            @data.find_next(SPACE_CHARACTERS)
+            return @data[old_position ... @data.position]
+          rescue EOF
+            #Return the whole remaining value
+            return @data[old_position .. -1]
+          end
+        end
+      rescue EOF
+        return nil
+      end
    end
-end
+  end

-# Determine if a string is a supported encoding
-def self.isValidEncoding(encoding)
+  # Determine if a string is a supported encoding
+  def self.is_valid_encoding(encoding)
    (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
-end
+  end

 end