# instiki/vendor/plugins/HTML5lib/lib/html5/html5parser.rb

require 'html5/constants'
require 'html5/tokenizer'
require 'html5/treebuilders/rexml'

# Pull in every "*_phase.rb" file found in the html5parser directory that
# sits next to this file, requiring each one by its load-path-relative name.
phase_glob = File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')
Dir.glob(phase_glob).each do |phase_file|
  require "html5/html5parser/#{File.basename(phase_file)}"
end

module HTML5

  # Error in parsed document. Raised by HTMLParser#parse_error when the
  # parser runs in strict mode.
  #
  # Note: derives directly from Exception, so a bare `rescue` (which only
  # catches StandardError) will not intercept it.
  class ParseError < Exception
  end

  # Failed internal assertion. Not raised anywhere in this file.
  # Also derives directly from Exception.
  class AssertionError < Exception
  end

  # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
  #
  class HTMLParser

    # Read/write parser state:
    #   phase             - the phase object currently receiving tokens; set by
    #                       _parse and reset_insertion_mode
    #   first_start_tag   - reset to false at the start of every _parse run
    #   inner_html        - downcased container name while parsing a fragment,
    #                       false when parsing a whole document
    #   last_phase        - reset to nil at the start of every _parse run
    #   insert_from_table - never assigned in this file; presumably maintained
    #                       by the phase classes (TODO confirm)
    attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table

    # Read-only collaborators:
    #   phases    - Hash mapping phase symbols (e.g. :inBody) to phase instances
    #   tokenizer - tokenizer class until a parse starts, tokenizer instance after
    #   tree      - treebuilder instance
    #   errors    - Array of [position, code, data] entries added by parse_error
    attr_reader :phases, :tokenizer, :tree, :errors

    # Convenience entry point: builds a parser from +options+ and parses
    # +stream+ as a complete document.
    #
    # stream  - handed unchanged to the instance method #parse
    # options - parser options forwarded to HTMLParser.new; an :encoding
    #           entry, if present, is taken out and passed to #parse instead
    #
    # Returns whatever the instance method #parse returns.
    def self.parse(stream, options = {})
      # Work on a copy so the caller's hash keeps its :encoding entry.
      options = options.dup
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Convenience entry point: builds a parser from +options+ and parses
    # +stream+ as a fragment.
    #
    # stream  - handed unchanged to the instance method #parse_fragment
    # options - parser options forwarded to HTMLParser.new; the :container
    #           entry (default 'div') and the :encoding entry are taken out
    #           and passed to #parse_fragment instead
    #
    # Returns whatever the instance method #parse_fragment returns.
    def self.parse_fragment(stream, options = {})
      # Work on a copy so the caller's hash keeps :container and :encoding.
      options = options.dup
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parse_fragment(stream, container, encoding)
    end

    # Names of all parser phases. In #initialize each name has its first
    # letter upcased and 'Phase' appended (e.g. 'inBody' -> 'InBodyPhase') to
    # find the phase class in the HTML5 module; the name itself, as a symbol,
    # becomes the key in @phases.
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # Creates a parser. Every key in +options+ is stored as an instance
    # variable of the same name, overriding the defaults set below.
    #
    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
    # HTML5::TreeBuilders[treeType]
    # :tokenizer - tokenizer class to use (defaults to HTMLTokenizer)
    # :lowercase_attr_name, :lowercase_element_name - passed through to the
    # tokenizer when a parse starts
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # Give both flags a defined value when the caller did not supply them.
      # instance_variables returns Symbols on Ruby 1.9+, so testing it with
      # include?("@...") never matched and wiped out caller-supplied values;
      # instance_variable_defined? works regardless of Ruby version.
      @lowercase_attr_name    = nil unless instance_variable_defined?("@lowercase_attr_name")
      @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

      # @tree held a treebuilder class up to here; replace it by an instance.
      @tree = @tree.new

      # One phase object per entry of @@phases, keyed by the phase name as a
      # symbol; 'inBody' is served by HTML5::InBodyPhase, and so on.
      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Shared driver behind #parse and #parse_fragment.
    #
    # stream     - input handed to the tokenizer
    # inner_html - true when parsing a fragment, false for a whole document
    # encoding   - encoding passed to the tokenizer
    # container  - name of the fragment's context element; only consulted
    #              when inner_html is true
    #
    # Resets the tree and the error list, creates a fresh tokenizer, picks
    # the starting phase, feeds every token to the current phase and finally
    # signals end of file to that phase.
    def _parse(stream, inner_html, encoding, container = 'div')
      @tree.reset
      @first_start_tag = false
      @errors = []

      # After a previous run @tokenizer holds an instance; go back to its class.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream,
        :encoding => encoding,
        :parseMeta => !inner_html,
        :lowercase_attr_name => @lowercase_attr_name,
        :lowercase_element_name => @lowercase_element_name)

      if inner_html
        @inner_html = container.downcase

        # The context element decides how the tokenizer treats the content.
        @tokenizer.content_model_flag =
          case @inner_html
          when 'title', 'textarea'
            :RCDATA
          when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
            :CDATA
          when 'plaintext'
            :PLAINTEXT
          else
            :PCDATA
          end

        @phase = @phases[:rootElement]
        @phase.insert_html_element
        reset_insertion_mode
      else
        @inner_html = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @last_phase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |raw_token|
        token = normalize_token(raw_token)
        handler = 'process%s' % token[:type]

        # Arguments for the phase handler; nil for token types that have no
        # handler and are recorded as parse errors instead.
        arguments =
          case token[:type]
          when :Characters, :SpaceCharacters, :Comment
            [token[:data]]
          when :StartTag
            [token[:name], token[:data]]
          when :EndTag
            [token[:name]]
          when :Doctype
            [token[:name], token[:publicId], token[:systemId], token[:correct]]
          end

        if arguments
          @phase.send(handler, *arguments)
        else
          parse_error(token[:data], token[:datavars])
        end
      end

      # When the loop finishes it's EOF
      @phase.process_eof
    end

    # Parses +stream+ as a complete HTML document.
    #
    # stream   - a filelike object or string containing the HTML to be parsed
    # encoding - optional string naming the encoding. If specified, that
    #            encoding will be used, regardless of any BOM or later
    #            declaration (such as in a meta element)
    #
    # Returns the result of the treebuilder's get_document.
    def parse(stream, encoding = nil)
      fragment_mode = false
      _parse(stream, fragment_mode, encoding)
      @tree.get_document
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # container - name of the element we're setting the inner_html property
    # if set to nil, default to 'div'
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding.  If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    #
    # Returns the result of the treebuilder's get_fragment.
    def parse_fragment(stream, container='div', encoding=nil)
      # Honour the documented nil => 'div' fallback; without it a nil
      # container reached container.downcase in _parse and raised
      # NoMethodError.
      container ||= 'div'
      _parse(stream, true, encoding, container)
      @tree.get_fragment
    end

    # Records a parse error and, in strict mode, aborts the parse.
    #
    # code - identifier of the error (e.g. "attributes-in-end-tag")
    # data - extra values belonging to the error
    #
    # Appends [position, code, data] to @errors, where position is read from
    # the tokenizer's stream.
    #
    # Raises ParseError when the parser was created with :strict; the error
    # code is used as the exception message so the failure can be identified.
    def parse_error(code = 'XXX-undefined-error', data = {})
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, code, data])
      raise ParseError, code.to_s if @strict
    end

    # HTML5 specific normalizations to the token stream
    #
    # token - Hash describing one token; modified in place
    #
    # * :EmptyTag tokens become :StartTag tokens; a parse error is reported
    #   when the element is not a void element.
    # * :StartTag tokens get a lowercase name, and a non-empty attribute list
    #   is turned into a Hash with lowercase keys.
    # * :EndTag tokens get a lowercase name; attributes on them are reported
    #   as a parse error.
    #
    # Returns the same token object.
    def normalize_token(token)
      if token[:type] == :EmptyTag
        # A trailing solidus is only acceptable on void elements. The name is
        # compared in lowercase because it is lowercased below anyway;
        # comparing the raw name flagged e.g. <BR/> whenever the tokenizer
        # had not already lowercased it.
        # NOTE(review): assumes VOID_ELEMENTS holds lowercase names - confirm
        # against html5/constants.
        unless VOID_ELEMENTS.include?(token[:name].downcase)
          parse_error("incorrectly-placed-solidus")
        end

        token[:type] = :StartTag
      end

      case token[:type]
      when :StartTag
        token[:name] = token[:name].downcase

        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
        # Walking the list backwards lets the first occurrence overwrite the
        # later ones. The pairs are inserted one by one instead of going
        # through Hash[*list.flatten], which would mis-pair keys and values
        # if a value were itself an array.
        unless token[:data].empty?
          token[:data] = token[:data].reverse.inject({}) do |attributes, (attr, value)|
            attributes[attr.downcase] = value
            attributes
          end
        end
      when :EndTag
        parse_error("attributes-in-end-tag") unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Lookup table used by #reset_insertion_mode: element name => key of the
    # phase (in @phases) to switch to when that element is met on the stack
    # of open elements. Note that 'head' maps to :inBody, not :inHead.
    @@new_modes = {
      'select'   => :inSelect,
      'td'       => :inCell,
      'th'       => :inCell,
      'tr'       => :inRow,
      'tbody'    => :inTableBody,
      'thead'    => :inTableBody,
      'tfoot'    => :inTableBody,
      'caption'  => :inCaption,
      'colgroup' => :inColumnGroup,
      'table'    => :inTable,
      'head'     => :inBody,
      'body'     => :inBody,
      'frameset' => :inFrameset
    }

    # Chooses @phase from the current stack of open elements.
    #
    # The stack is walked from the most recently opened element back to the
    # first one; the first element that determines a phase ends the walk.
    #
    # The name of this method is mostly historical. (It's also used in the
    # specification.)
    def reset_insertion_mode
      reached_first = false

      @tree.open_elements.reverse.each do |element|
        name = element.name

        if element == @tree.open_elements.first
          reached_first = true
          # For the first element on the stack the fragment's container name
          # stands in for the element's own name, unless that name is td/th.
          # XXX
          # assert @inner_html
          name = @inner_html unless %w(td th).include?(name)
        end

        # 'select', 'colgroup', 'head' and 'frameset' should only be seen
        # here in the inner_html case.
        # XXX
        # assert @inner_html

        phase_key =
          if @@new_modes.has_key?(name)
            @@new_modes[name]
          elsif name == 'html'
            @tree.head_pointer.nil? ? :beforeHead : :afterHead
          elsif reached_first
            :inBody
          end

        # Nothing decided by this element: look at the next one down.
        next unless phase_key

        @phase = @phases[phase_key]
        break
      end
    end

    # Identity helper: hands back the very object it was given.
    # Presumably a placeholder hook for translating messages (not
    # established by this file).
    def _(string)
      string
    end
  end

end
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`require 'html5/constants'`
			`require 'html5/tokenizer'`
			`require 'html5/treebuilders/rexml'`

			`Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do \|path\|`
			`require 'html5/html5parser/' + File.basename(path)`
			`end`

			`module HTML5`

			`# Error in parsed document`
			`class ParseError < Exception; end`
			`class AssertionError < Exception; end`

			`# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML`
			`#`
			`class HTMLParser`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`attr_reader :phases, :tokenizer, :tree, :errors`

			`def self.parse(stream, options = {})`
			`encoding = options.delete(:encoding)`
			`new(options).parse(stream,encoding)`
			`end`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`def self.parse_fragment(stream, options = {})`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`container = options.delete(:container) \|\| 'div'`
			`encoding = options.delete(:encoding)`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`new(options).parse_fragment(stream, container, encoding)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

			`@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption`
			`inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )`

			`# :strict - raise an exception when a parse error is encountered`
			`# :tree - a treebuilder class controlling the type of tree that will be`
			`# returned. Built in treebuilders can be accessed through`
			`# HTML5::TreeBuilders[treeType]`
			`def initialize(options = {})`
			`@strict = false`
			`@errors = []`

			`@tokenizer = HTMLTokenizer`
			`@tree = TreeBuilders::REXML::TreeBuilder`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00
			`options.each {\|name, value\| instance_variable_set("@#{name}", value) }`
			`@lowercase_attr_name = nil unless instance_variables.include?("@lowercase_attr_name")`
			`@lowercase_element_name = nil unless instance_variables.include?("@lowercase_element_name")`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`@tree = @tree.new`

			`@phases = @@phases.inject({}) do \|phases, phase_name\|`
			`phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'`
			`phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`phases`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`
			`end`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`def _parse(stream, inner_html, encoding, container = 'div')`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`@tree.reset`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@first_start_tag = false`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`@errors = []`

			`@tokenizer = @tokenizer.class unless Class === @tokenizer`
			`@tokenizer = @tokenizer.new(stream, :encoding => encoding,`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`:parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`if inner_html`
			`case @inner_html = container.downcase`
Sync with latest HTML5lib 2007-10-06 18:55:58 +02:00			`when 'title', 'textarea'`
			`@tokenizer.content_model_flag = :RCDATA`
			`when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'`
			`@tokenizer.content_model_flag = :CDATA`
			`when 'plaintext'`
			`@tokenizer.content_model_flag = :PLAINTEXT`
			`else`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`# content_model_flag already is PCDATA`
Sync with latest HTML5lib 2007-10-06 18:55:58 +02:00			`@tokenizer.content_model_flag = :PCDATA`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

			`@phase = @phases[:rootElement]`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@phase.insert_html_element`
			`reset_insertion_mode`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`else`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@inner_html = false`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`@phase = @phases[:initial]`
			`end`

			`# We only seem to have InBodyPhase testcases where the following is`
			`# relevant ... need others too`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@last_phase = nil`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`# XXX This is temporary for the moment so there isn't any other`
			`# changes needed for the parser to work with the iterable tokenizer`
			`@tokenizer.each do \|token\|`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`token = normalize_token(token)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`method = 'process%s' % token[:type]`

			`case token[:type]`
Update to latest HTML5lib Fix that Tokenizer bug for real this time. 2007-09-10 05:26:19 +02:00			`when :Characters, :SpaceCharacters, :Comment`
			`@phase.send method, token[:data]`
			`when :StartTag`
			`@phase.send method, token[:name], token[:data]`
			`when :EndTag`
			`@phase.send method, token[:name]`
			`when :Doctype`
			`@phase.send method, token[:name], token[:publicId],`
			`token[:systemId], token[:correct]`
			`else`
			`parse_error(token[:data], token[:datavars])`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`
			`end`

			`# When the loop finishes it's EOF`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@phase.process_eof`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

			`# Parse a HTML document into a well-formed tree`
			`#`
			`# stream - a filelike object or string containing the HTML to be parsed`
			`#`
			`# The optional encoding parameter must be a string that indicates`
			`# the encoding. If specified, that encoding will be used,`
			`# regardless of any BOM or later declaration (such as in a meta`
			`# element)`
			`def parse(stream, encoding=nil)`
			`_parse(stream, false, encoding)`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@tree.get_document`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# Parse a HTML fragment into a well-formed tree fragment`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00
			`# container - name of the element we're setting the inner_html property`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# if set to nil, default to 'div'`
			`#`
			`# stream - a filelike object or string containing the HTML to be parsed`
			`#`
			`# The optional encoding parameter must be a string that indicates`
			`# the encoding. If specified, that encoding will be used,`
			`# regardless of any BOM or later declaration (such as in a meta`
			`# element)`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`def parse_fragment(stream, container='div', encoding=nil)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`_parse(stream, true, encoding, container)`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@tree.get_fragment`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

Update to latest HTML5lib Fix that Tokenizer bug for real this time. 2007-09-10 05:26:19 +02:00			`def parse_error(code = 'XXX-undefined-error', data = {})`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# XXX The idea is to make data mandatory.`
Update to latest HTML5lib Fix that Tokenizer bug for real this time. 2007-09-10 05:26:19 +02:00			`@errors.push([@tokenizer.stream.position, code, data])`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`raise ParseError if @strict`
			`end`

			`# HTML5 specific normalizations to the token stream`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`def normalize_token(token)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`if token[:type] == :EmptyTag`
			`# When a solidus (/) is encountered within a tag name what happens`
			`# depends on whether the current tag name matches that of a void`
			`# element. If it matches a void element atheists did the wrong`
			`# thing and if it doesn't it's wrong for everyone.`

			`unless VOID_ELEMENTS.include?(token[:name])`
Update to latest HTML5lib Fix that Tokenizer bug for real this time. 2007-09-10 05:26:19 +02:00			`parse_error("incorrectly-placed-solidus")`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

			`token[:type] = :StartTag`
			`end`

			`if token[:type] == :StartTag`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`token[:name] = token[:name].downcase`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
			`# We need to remove the duplicate attributes and convert attributes`
			`# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}`

			`unless token[:data].empty?`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`data = token[:data].reverse.map {\|attr, value\| [attr.downcase, value] }`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`token[:data] = Hash[*data.flatten]`
			`end`

			`elsif token[:type] == :EndTag`
Update to latest HTML5lib Fix that Tokenizer bug for real this time. 2007-09-10 05:26:19 +02:00			`parse_error("attributes-in-end-tag") unless token[:data].empty?`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`token[:name] = token[:name].downcase`
			`end`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`token`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

			`@@new_modes = {`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`'select' => :inSelect,`
			`'td' => :inCell,`
			`'th' => :inCell,`
			`'tr' => :inRow,`
			`'tbody' => :inTableBody,`
			`'thead' => :inTableBody,`
			`'tfoot' => :inTableBody,`
			`'caption' => :inCaption,`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`'colgroup' => :inColumnGroup,`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`'table' => :inTable,`
			`'head' => :inBody,`
			`'body' => :inBody,`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`'frameset' => :inFrameset`
			`}`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`def reset_insertion_mode`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# The name of this method is mostly historical. (It's also used in the`
			`# specification.)`
			`last = false`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`@tree.open_elements.reverse.each do \|node\|`
			`node_name = node.name`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`if node == @tree.open_elements.first`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`last = true`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`unless ['td', 'th'].include?(node_name)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# XXX`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`# assert @inner_html`
			`node_name = @inner_html`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`
			`end`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`# Check for conditions that should only happen in the inner_html`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# case`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`if ['select', 'colgroup', 'head', 'frameset'].include?(node_name)`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`# XXX`
Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`# assert @inner_html`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`end`

Minor S5 tweaks and Sync with Latest HTML5lib 2007-08-30 19:19:10 +02:00			`if @@new_modes.has_key?(node_name)`
			`@phase = @phases[@@new_modes[node_name]]`
			`elsif node_name == 'html'`
			`@phase = @phases[@tree.head_pointer.nil?? :beforeHead : :afterHead]`
Sync with latest HTML5lib and latest Maruku 2007-07-05 00:36:59 +02:00			`elsif last`
			`@phase = @phases[:inBody]`
			`else`
			`next`
			`end`

			`break`
			`end`
			`end`

			`def _(string); string; end`
			`end`

			`end`