Ruby 1.9 Compatibility

Completely removed the html5lib sanitizer. Fixed the string-handling to work in both Ruby 1.8.x and 1.9.2. There are still, inexplicably, two functional tests that fail. But the rest seems to work quite well.
2009-11-30 16:28:18 -06:00 · 2009-11-30 16:28:18 -06:00 · a6429f8c22
commit a6429f8c22
parent 79c8572053
142 changed files with 519 additions and 843 deletions
--- a/attic/vendor/plugins/HTML5lib/lib/html5/html5parser.rb
+++ b/attic/vendor/plugins/HTML5lib/lib/html5/html5parser.rb
@ -0,0 +1,248 @@
+require 'html5/constants'
+require 'html5/tokenizer'
+require 'html5/treebuilders/rexml'
+
+# Load every parser phase implementation found next to this file in
+# html5parser/*_phase.rb, requiring each one by its load-path-relative name.
+phase_pattern = File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')
+Dir.glob(phase_pattern).each do |phase_file|
+  require "html5/html5parser/#{File.basename(phase_file)}"
+end
+
+module HTML5
+
+  # Error in parsed document. HTMLParser#parse_error raises it when the
+  # parser runs in strict mode. It descends directly from Exception, so a
+  # bare +rescue+ does not catch it.
+  class ParseError < Exception
+  end
+
+  # Assertion failure type for the library; nothing in this file raises it.
+  # It descends directly from Exception, so a bare +rescue+ does not catch it.
+  class AssertionError < Exception
+  end
+
+  # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
+  #
+  class HTMLParser
+
+    # Parser state that can be read and written from outside the parser.
+    attr_accessor :phase, :last_phase
+    attr_accessor :first_start_tag, :inner_html, :insert_from_table
+
+    # Read-only: the phase table, the tokenizer, the tree builder and the
+    # list of recorded parse errors.
+    attr_reader :phases, :tokenizer, :tree, :errors
+
+    # Convenience entry point: build a parser from +options+ and parse
+    # +stream+ as a complete document.
+    #
+    # stream  - the HTML to parse, handed to #parse unchanged
+    # options - parser options as accepted by #initialize; the :encoding
+    #           entry, if present, is passed to #parse instead of the
+    #           constructor
+    #
+    # Returns whatever #parse returns.
+    def self.parse(stream, options = {})
+      # Work on a copy: the caller's hash must not lose its :encoding entry
+      # (it may be reused for the next call) and may even be frozen.
+      options = options.dup
+      encoding = options.delete(:encoding)
+      new(options).parse(stream, encoding)
+    end
+
+    # Convenience entry point: build a parser from +options+ and parse
+    # +stream+ as a fragment.
+    #
+    # stream  - the HTML to parse, handed to #parse_fragment unchanged
+    # options - parser options as accepted by #initialize, plus:
+    #           :container - name of the context element (default 'div')
+    #           :encoding  - passed on to #parse_fragment
+    #           Both entries are kept away from the constructor.
+    #
+    # Returns whatever #parse_fragment returns.
+    def self.parse_fragment(stream, options = {})
+      # Work on a copy: the caller's hash must not lose its :container and
+      # :encoding entries (it may be reused) and may even be frozen.
+      options = options.dup
+      container = options.delete(:container) || 'div'
+      encoding = options.delete(:encoding)
+      new(options).parse_fragment(stream, container, encoding)
+    end
+
+    # Names of the parser phases. #initialize creates one phase object per
+    # name, looking up the class HTML5::<Name>Phase (first letter upcased)
+    # and storing the instance in #phases under the name as a symbol.
+    @@phases = %w(
+      initial rootElement beforeHead inHead afterHead
+      inBody inTable inCaption inColumnGroup inTableBody inRow inCell inSelect
+      afterBody inFrameset afterFrameset trailingEnd
+    )
+
+    # Build a parser together with its tree builder and one instance of
+    # every phase.
+    #
+    # Every entry of +options+ is stored in the instance variable of the
+    # same name, so it can override the defaults set below. Notable keys:
+    #
+    # :strict    - raise an exception when a parse error is encountered
+    #              (default false)
+    # :tree      - a treebuilder class controlling the type of tree that
+    #              will be returned (default TreeBuilders::REXML::TreeBuilder).
+    #              Built in treebuilders can be accessed through
+    #              HTML5::TreeBuilders[treeType]
+    # :tokenizer - tokenizer class instantiated for each parse
+    #              (default HTMLTokenizer)
+    # :lowercase_attr_name, :lowercase_element_name - handed to the
+    #              tokenizer on each parse (default nil)
+    def initialize(options = {})
+      @strict    = false
+      @errors    = []
+      @tokenizer = HTMLTokenizer
+      @tree      = TreeBuilders::REXML::TreeBuilder
+
+      options.each { |key, setting| instance_variable_set("@#{key}", setting) }
+
+      # Make sure both flags exist as instance variables even when the
+      # caller did not supply them.
+      %w( @lowercase_attr_name @lowercase_element_name ).each do |ivar|
+        instance_variable_set(ivar, nil) unless instance_variable_defined?(ivar)
+      end
+
+      # From here on @tree holds a builder instance rather than a class.
+      @tree = @tree.new
+
+      # Assemble the whole table first and publish it in one assignment.
+      table = {}
+      @@phases.each do |phase_name|
+        class_name = phase_name.sub(/./) { |initial| initial.upcase } + 'Phase'
+        table[phase_name.to_sym] = HTML5.const_get(class_name).new(self, @tree)
+      end
+      @phases = table
+    end
+
+    # Shared driver behind #parse and #parse_fragment: resets the parser
+    # state, tokenizes +stream+ and feeds every token to the current phase.
+    #
+    # stream     - input handed to the tokenizer
+    # inner_html - truthy to parse a fragment, falsy to parse a document
+    # encoding   - passed to the tokenizer as :encoding
+    # container  - context element name; only consulted in fragment mode
+    #
+    # Returns the result of the final phase's process_eof.
+    def _parse(stream, inner_html, encoding, container = 'div')
+      @tree.reset
+      @first_start_tag = false
+      @errors = []
+
+      # @tokenizer is a class before the first run and an instance after
+      # it; recover the class so a fresh tokenizer is built for every run.
+      tokenizer_class = Class === @tokenizer ? @tokenizer : @tokenizer.class
+      @tokenizer = tokenizer_class.new(stream,
+        :encoding               => encoding,
+        :parseMeta              => !inner_html,
+        :lowercase_attr_name    => @lowercase_attr_name,
+        :lowercase_element_name => @lowercase_element_name)
+
+      if inner_html
+        @inner_html = container.downcase
+
+        # The context element decides how the tokenizer treats its content.
+        @tokenizer.content_model_flag =
+          case @inner_html
+          when 'title', 'textarea'
+            :RCDATA
+          when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
+            :CDATA
+          when 'plaintext'
+            :PLAINTEXT
+          else
+            :PCDATA
+          end
+
+        @phase = @phases[:rootElement]
+        @phase.insert_html_element
+        reset_insertion_mode
+      else
+        @inner_html = false
+        @phase = @phases[:initial]
+      end
+
+      # We only seem to have InBodyPhase testcases where the following is
+      # relevant ... need others too
+      @last_phase = nil
+
+      # XXX This is temporary for the moment so there isn't any other
+      # changes needed for the parser to work with the iterable tokenizer
+      @tokenizer.each do |raw_token|
+        token = normalize_token(raw_token)
+        type = token[:type]
+        handler = "process#{type}"
+
+        # The argument list depends on the token type; any type not listed
+        # here is reported as a parse error.
+        case type
+        when :Characters, :SpaceCharacters, :Comment
+          @phase.send handler, token[:data]
+        when :StartTag
+          @phase.send handler, token[:name], token[:data]
+        when :EndTag
+          @phase.send handler, token[:name]
+        when :Doctype
+          @phase.send handler, token[:name], token[:publicId],
+            token[:systemId], token[:correct]
+        else
+          parse_error(token[:data], token[:datavars])
+        end
+      end
+
+      # When the loop finishes it's EOF
+      @phase.process_eof
+    end
+
+    # Parse a complete HTML document into a well-formed tree.
+    #
+    # stream   - a filelike object or string containing the HTML to be parsed
+    # encoding - optional string that indicates the encoding. If specified,
+    #            that encoding will be used, regardless of any BOM or later
+    #            declaration (such as in a meta element)
+    #
+    # Returns the result of the tree builder's get_document.
+    def parse(stream, encoding=nil)
+      fragment_mode = false
+      _parse(stream, fragment_mode, encoding)
+      @tree.get_document
+    end
+
+    # Parse a HTML fragment into a well-formed tree fragment.
+    #
+    # stream    - a filelike object or string containing the HTML to be parsed
+    # container - name of the element we're setting the inner_html property
+    #             of; if set to nil, defaults to 'div'
+    # encoding  - optional string that indicates the encoding. If specified,
+    #             that encoding will be used, regardless of any BOM or later
+    #             declaration (such as in a meta element)
+    #
+    # Returns the result of the tree builder's get_fragment.
+    def parse_fragment(stream, container='div', encoding=nil)
+      # _parse calls container.downcase, so apply the documented nil
+      # fallback here instead of failing with NoMethodError.
+      _parse(stream, true, encoding, container || 'div')
+      @tree.get_fragment
+    end
+
+    # Record a parse error and, in strict mode, abort parsing.
+    #
+    # code - error identifier (defaults to 'XXX-undefined-error')
+    # data - extra values that belong to the error (defaults to {})
+    #
+    # Appends [tokenizer stream position, code, data] to #errors.
+    # Raises ParseError, with +code+ as its message, when @strict is set.
+    def parse_error(code = 'XXX-undefined-error', data = {})
+      # XXX The idea is to make data mandatory.
+      @errors.push([@tokenizer.stream.position, code, data])
+      # Carry the error code in the exception so a strict-mode failure says
+      # which rule was violated instead of only naming the exception class.
+      raise ParseError, code if @strict
+    end
+
+    # HTML5 specific normalizations to the token stream.
+    #
+    # token - token hash as produced by the tokenizer; it is modified in
+    #         place and also returned.
+    #
+    # * :EmptyTag tokens become :StartTag tokens; a parse error is recorded
+    #   when the element is not listed in VOID_ELEMENTS.
+    # * :StartTag tokens get a lower-cased name, and a non-empty attribute
+    #   list is turned into a hash with lower-cased keys.
+    # * :EndTag tokens get a lower-cased name; attributes on them are
+    #   reported as a parse error.
+    def normalize_token(token)
+      if token[:type] == :EmptyTag
+        # A solidus (/) inside a tag is only tolerated on void elements;
+        # either way the token is handled as an ordinary start tag.
+        parse_error("incorrectly-placed-solidus") unless VOID_ELEMENTS.include?(token[:name])
+        token[:type] = :StartTag
+      end
+
+      case token[:type]
+      when :StartTag
+        token[:name] = token[:name].downcase
+
+        # Drop duplicate attributes while converting the pair list to a
+        # hash: the list is reversed first, so for [["x", "y"], ["x", "z"]]
+        # the earliest value wins and the result is {"x" => "y"}.
+        unless token[:data].empty?
+          pairs = token[:data].reverse.map { |attr_name, attr_value| [attr_name.downcase, attr_value] }
+          token[:data] = Hash[*pairs.flatten]
+        end
+      when :EndTag
+        parse_error("attributes-in-end-tag") unless token[:data].empty?
+        token[:name] = token[:name].downcase
+      end
+
+      token
+    end
+
+    # Maps the name of an open element to the key (in #phases) of the phase
+    # that #reset_insertion_mode selects when it meets that element.
+    @@new_modes = {
+      'select' => :inSelect,
+
+      # table cells and rows
+      'td' => :inCell,
+      'th' => :inCell,
+      'tr' => :inRow,
+
+      # table sections
+      'tbody' => :inTableBody,
+      'thead' => :inTableBody,
+      'tfoot' => :inTableBody,
+
+      # remaining table parts
+      'caption' => :inCaption,
+      'colgroup' => :inColumnGroup,
+      'table' => :inTable,
+
+      # document structure
+      'head' => :inBody,
+      'body' => :inBody,
+      'frameset' => :inFrameset
+    }
+
+    # Choose @phase from the stack of open elements, walking it from the
+    # most recently opened element back to the first one and stopping at
+    # the first element that determines a phase.
+    #
+    # The name of this method is mostly historical. (It's also used in the
+    # specification.)
+    def reset_insertion_mode
+      root = @tree.open_elements.first
+      reached_root = false
+
+      @tree.open_elements.reverse.each do |node|
+        name = node.name
+
+        if node == root
+          reached_root = true
+          # XXX
+          # assert @inner_html
+          # Unless it is a table cell, the first open element is judged by
+          # the name stored in @inner_html instead of its own name.
+          name = @inner_html unless ['td', 'th'].include?(name)
+        end
+
+        # XXX 'select', 'colgroup', 'head' and 'frameset' should only show
+        # up here in the inner_html case:
+        # assert @inner_html
+
+        mode =
+          if @@new_modes.has_key?(name)
+            @@new_modes[name]
+          elsif name == 'html'
+            @tree.head_pointer.nil? ? :beforeHead : :afterHead
+          elsif reached_root
+            :inBody
+          end
+
+        # No decision for this element: look at the one opened before it.
+        next if mode.nil?
+
+        @phase = @phases[mode]
+        break
+      end
+    end
+
+    # Identity helper: hands +string+ back untouched.
+    def _(string)
+      string
+    end
+  end
+
+end