diff --git a/lib/node.rb b/lib/node.rb deleted file mode 100644 index 34191b90..00000000 --- a/lib/node.rb +++ /dev/null @@ -1,530 +0,0 @@ -require 'strscan' - -module XHTML #:nodoc: - - class Conditions < Hash #:nodoc: - def initialize(hash) - super() - hash = { :content => hash } unless Hash === hash - hash = keys_to_symbols(hash) - hash.each do |k,v| - case k - when :tag, :content then - # keys are valid, and require no further processing - when :attributes then - hash[k] = keys_to_strings(v) - when :parent, :child, :ancestor, :descendant, :sibling, :before, - :after - hash[k] = Conditions.new(v) - when :children - hash[k] = v = keys_to_symbols(v) - v.each do |k,v2| - case k - when :count, :greater_than, :less_than - # keys are valid, and require no further processing - when :only - v[k] = Conditions.new(v2) - else - raise "illegal key #{k.inspect} => #{v2.inspect}" - end - end - else - raise "illegal key #{k.inspect} => #{v.inspect}" - end - end - update hash - end - - private - - def keys_to_strings(hash) - hash.keys.inject({}) do |h,k| - h[k.to_s] = hash[k] - h - end - end - - def keys_to_symbols(hash) - hash.keys.inject({}) do |h,k| - raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym) - h[k.to_sym] = hash[k] - h - end - end - end - - # The base class of all nodes, textual and otherwise, in an HTML document. - class Node #:nodoc: - # The array of children of this node. Not all nodes have children. - attr_reader :children - - # The parent node of this node. All nodes have a parent, except for the - # root node. - attr_reader :parent - - # The line number of the input where this node was begun - attr_reader :line - - # The byte position in the input where this node was begun - attr_reader :position - - # Create a new node as a child of the given parent. - def initialize(parent, line=0, pos=0) - @parent = parent - @children = [] - @line, @position = line, pos - end - - # Return a textual representation of the node. - def to_s - s = "" - @children.each { |child| s << child.to_s } - s - end - - # Return false (subclasses must override this to provide specific matching - # behavior.) +conditions+ may be of any type. - def match(conditions) - false - end - - # Search the children of this node for the first node for which #find - # returns non +nil+. Returns the result of the #find call that succeeded. - def find(conditions) - conditions = validate_conditions(conditions) - @children.each do |child| - node = child.find(conditions) - return node if node - end - nil - end - - # Search for all nodes that match the given conditions, and return them - # as an array. - def find_all(conditions) - conditions = validate_conditions(conditions) - - matches = [] - matches << self if match(conditions) - @children.each do |child| - matches.concat child.find_all(conditions) - end - matches - end - - # Returns +false+. Subclasses may override this if they define a kind of - # tag. - def tag? - false - end - - def validate_conditions(conditions) - Conditions === conditions ? conditions : Conditions.new(conditions) - end - - def ==(node) - return false unless self.class == node.class && children.size == node.children.size - - equivalent = true - - children.size.times do |i| - equivalent &&= children[i] == node.children[i] - end - - equivalent - end - - class </) - return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/\/]+/) - end - end - attributes[attr] = value - scanner.skip(/\s*/) - end - - closing = ( scanner.scan(/\//) ? :self : nil ) - end - - unless scanner.scan(/\s*>/) - if strict - raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})" - else - # throw away all text until we find what we're looking for - scanner.skip_until(/>/) or scanner.terminate - end - end - - Tag.new(parent, line, pos, name, attributes, closing) - end - end - end - end - - # A node that represents text, rather than markup. - class Text < Node #:nodoc: - - attr_reader :content - - # Creates a new text node as a child of the given parent, with the given - # content. - def initialize(parent, line, pos, content) - super(parent, line, pos) - @content = content - end - - # Returns the content of this node. - def to_s - @content - end - - # Returns +self+ if this node meets the given conditions. Text nodes support - # conditions of the following kinds: - # - # * if +conditions+ is a string, it must be a substring of the node's - # content - # * if +conditions+ is a regular expression, it must match the node's - # content - # * if +conditions+ is a hash, it must contain a :content key that - # is either a string or a regexp, and which is interpreted as described - # above. - def find(conditions) - match(conditions) && self - end - - # Returns non-+nil+ if this node meets the given conditions, or +nil+ - # otherwise. See the discussion of #find for the valid conditions. - def match(conditions) - case conditions - when String - @content == conditions - when Regexp - @content =~ conditions - when Hash - conditions = validate_conditions(conditions) - - # Text nodes only have :content, :parent, :ancestor - unless (conditions.keys - [:content, :parent, :ancestor]).empty? - return false - end - - match(conditions[:content]) - else - nil - end - end - - def ==(node) - return false unless super - content == node.content - end - end - - # A CDATA node is simply a text node with a specialized way of displaying - # itself. - class CDATA < Text #:nodoc: - def to_s - "" - end - end - - # A Tag is any node that represents markup. It may be an opening tag, a - # closing tag, or a self-closing tag. It has a name, and may have a hash of - # attributes. - class Tag < Node #:nodoc: - - # Either +nil+, :close, or :self - attr_reader :closing - - # Either +nil+, or a hash of attributes for this node. - attr_reader :attributes - - # The name of this tag. - attr_reader :name - - # Create a new node as a child of the given parent, using the given content - # to describe the node. It will be parsed and the node name, attributes and - # closing status extracted. - def initialize(parent, line, pos, name, attributes, closing) - super(parent, line, pos) - @name = name - @attributes = attributes - @closing = closing - end - - # A convenience for obtaining an attribute of the node. Returns +nil+ if - # the node has no attributes. - def [](attr) - @attributes ? @attributes[attr] : nil - end - - # Returns non-+nil+ if this tag can contain child nodes. - def childless?(xml = false) - return false if xml && @closing.nil? - !@closing.nil? || - @name =~ /^(img|br|hr|link|meta|area|base|basefont| - col|frame|input|isindex|param)$/ox - end - - # Returns a textual representation of the node - def to_s - if @closing == :close - "" - else - s = "<#{@name}" - @attributes.each do |k,v| - s << " #{k}" - s << "=\"#{v}\"" if String === v - end - s << " /" if @closing == :self - s << ">" - @children.each { |child| s << child.to_s } - s << "" if @closing != :self && !@children.empty? - s - end - end - - # If either the node or any of its children meet the given conditions, the - # matching node is returned. Otherwise, +nil+ is returned. (See the - # description of the valid conditions in the +match+ method.) - def find(conditions) - match(conditions) && self || super - end - - # Returns +true+, indicating that this node represents an HTML tag. - def tag? - true - end - - # Returns +true+ if the node meets any of the given conditions. The - # +conditions+ parameter must be a hash of any of the following keys - # (all are optional): - # - # * :tag: the node name must match the corresponding value - # * :attributes: a hash. The node's values must match the - # corresponding values in the hash. - # * :parent: a hash. The node's parent must match the - # corresponding hash. - # * :child: a hash. At least one of the node's immediate children - # must meet the criteria described by the hash. - # * :ancestor: a hash. At least one of the node's ancestors must - # meet the criteria described by the hash. - # * :descendant: a hash. At least one of the node's descendants - # must meet the criteria described by the hash. - # * :sibling: a hash. At least one of the node's siblings must - # meet the criteria described by the hash. - # * :after: a hash. The node must be after any sibling meeting - # the criteria described by the hash, and at least one sibling must match. - # * :before: a hash. The node must be before any sibling meeting - # the criteria described by the hash, and at least one sibling must match. - # * :children: a hash, for counting children of a node. Accepts the - # keys: - # ** :count: either a number or a range which must equal (or - # include) the number of children that match. - # ** :less_than: the number of matching children must be less than - # this number. - # ** :greater_than: the number of matching children must be - # greater than this number. - # ** :only: another hash consisting of the keys to use - # to match on the children, and only matching children will be - # counted. - # - # Conditions are matched using the following algorithm: - # - # * if the condition is a string, it must be a substring of the value. - # * if the condition is a regexp, it must match the value. - # * if the condition is a number, the value must match number.to_s. - # * if the condition is +true+, the value must not be +nil+. - # * if the condition is +false+ or +nil+, the value must be +nil+. - # - # Usage: - # - # # test if the node is a "span" tag - # node.match :tag => "span" - # - # # test if the node's parent is a "div" - # node.match :parent => { :tag => "div" } - # - # # test if any of the node's ancestors are "table" tags - # node.match :ancestor => { :tag => "table" } - # - # # test if any of the node's immediate children are "em" tags - # node.match :child => { :tag => "em" } - # - # # test if any of the node's descendants are "strong" tags - # node.match :descendant => { :tag => "strong" } - # - # # test if the node has between 2 and 4 span tags as immediate children - # node.match :children => { :count => 2..4, :only => { :tag => "span" } } - # - # # get funky: test to see if the node is a "div", has a "ul" ancestor - # # and an "li" parent (with "class" = "enum"), and whether or not it has - # # a "span" descendant that contains # text matching /hello world/: - # node.match :tag => "div", - # :ancestor => { :tag => "ul" }, - # :parent => { :tag => "li", - # :attributes => { :class => "enum" } }, - # :descendant => { :tag => "span", - # :child => /hello world/ } - def match(conditions) - conditions = validate_conditions(conditions) - # check content of child nodes - if conditions[:content] - if children.empty? - return false unless match_condition("", conditions[:content]) - else - return false unless children.find { |child| child.match(conditions[:content]) } - end - end - - # test the name - return false unless match_condition(@name, conditions[:tag]) if conditions[:tag] - - # test attributes - (conditions[:attributes] || {}).each do |key, value| - return false unless match_condition(self[key], value) - end - - # test parent - return false unless parent.match(conditions[:parent]) if conditions[:parent] - - # test children - return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child] - - # test ancestors - if conditions[:ancestor] - return false unless catch :found do - p = self - throw :found, true if p.match(conditions[:ancestor]) while p = p.parent - end - end - - # test descendants - if conditions[:descendant] - return false unless children.find do |child| - # test the child - child.match(conditions[:descendant]) || - # test the child's descendants - child.match(:descendant => conditions[:descendant]) - end - end - - # count children - if opts = conditions[:children] - matches = children.select do |c| - (c.kind_of?(HTML::Tag) and (c.closing == :self or ! c.childless?)) - end - - matches = matches.select { |c| c.match(opts[:only]) } if opts[:only] - opts.each do |key, value| - next if key == :only - case key - when :count - if Integer === value - return false if matches.length != value - else - return false unless value.include?(matches.length) - end - when :less_than - return false unless matches.length < value - when :greater_than - return false unless matches.length > value - else raise "unknown count condition #{key}" - end - end - end - - # test siblings - if conditions[:sibling] || conditions[:before] || conditions[:after] - siblings = parent ? parent.children : [] - self_index = siblings.index(self) - - if conditions[:sibling] - return false unless siblings.detect do |s| - s != self && s.match(conditions[:sibling]) - end - end - - if conditions[:before] - return false unless siblings[self_index+1..-1].detect do |s| - s != self && s.match(conditions[:before]) - end - end - - if conditions[:after] - return false unless siblings[0,self_index].detect do |s| - s != self && s.match(conditions[:after]) - end - end - end - - true - end - - def ==(node) - return false unless super - return false unless closing == node.closing && self.name == node.name - attributes == node.attributes - end - - private - # Match the given value to the given condition. - def match_condition(value, condition) - case condition - when String - value && value == condition - when Regexp - value && value.match(condition) - when Numeric - value == condition.to_s - when true - !value.nil? - when false, nil - value.nil? - else - false - end - end - end -end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 0e9ca32b..69f8e3e7 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -1,207 +1,26 @@ module Sanitize -# This module provides sanitization of XHTML+MathML+SVG +# This module provides sanitization of XHTML+MathML+SVG # and of inline style attributes. # -# Based heavily on Sam Ruby's code in the Universal FeedParser. - - require 'html/tokenizer' - require 'node' - - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', - 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', - 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', - 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', - 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', - 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', - 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', - 'ul', 'var'] - - mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', - 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', - 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', - 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', - 'munderover', 'none'] - - svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', - 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', - 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', - 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', - 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', - 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', - 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', - 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', - 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', - 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', - 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', - 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', - 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title', - 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] +# Uses the HTML5lib parser, so that the parsing behaviour should +# resemble that of browsers. +# +# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML +# sanitize_html() is a case-insensitive sanitizer suitable for HTML - mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', - 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', - 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', - 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', - 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', - 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', - 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', - 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', - 'xlink:type', 'xmlns', 'xmlns:xlink'] + require 'html5lib/sanitizer' + require 'html5lib/html5parser' + require 'html5lib/liberalxmlparser' + include HTML5lib - - svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', - 'arabic-form', 'ascent', 'attributeName', 'attributeType', - 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', - 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', - 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', - 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant', - 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', - 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', - 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes', - 'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight', - 'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name', - 'offset', 'opacity', 'orient', 'origin', 'overline-position', - 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', - 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur', - 'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx', - 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity', - 'strikethrough-position', 'strikethrough-thickness', 'stroke', - 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', - 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', - 'stroke-width', 'systemLanguage', 'target', - 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', - 'underline-position', 'underline-thickness', 'unicode', - 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox', - 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2', - 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role', - 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang', - 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] + def sanitize_xhtml(html) + XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s + end - attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href'] - - acceptable_css_properties = ['azimuth', 'background-color', - 'border-bottom-color', 'border-collapse', 'border-color', - 'border-left-color', 'border-right-color', 'border-top-color', 'clear', - 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', - 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', - 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', - 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', - 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', - 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', - 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', - 'white-space', 'width'] + def sanitize_html(html) + HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s + end - acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', - 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', - 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', - 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', - 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', - 'transparent', 'underline', 'white', 'yellow'] - - acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', - 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', - 'stroke-opacity'] - - acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc', - 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', - 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs' ] - - ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS) - ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES) - ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES) - ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS) - ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES) - ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS) - ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI) - - # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all - # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set, - # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. - # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in - # ALLOWED_PROTOCOLS are allowed. - # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded. - # - # sanitize_html('') - # => <script> do_nasty_stuff() </script> - # sanitize_html('Click here for $100') - # => Click here for $100 - def sanitize_xhtml(html) - if html.index("<") - tokenizer = HTML::Tokenizer.new(html) - new_text = "" - - while token = tokenizer.next - node = XHTML::Node.parse(nil, 0, 0, token, false) - new_text << case node.tag? - when true - if ALLOWED_ELEMENTS.include?(node.name) - if node.closing != :close - node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) } - ATTR_VAL_IS_URI.each do |attr| - val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase - if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) - node.attributes.delete attr - end - end - if node.attributes['style'] - node.attributes['style'] = sanitize_css(node.attributes['style']) - end - end - node.to_s - else - node.to_s.gsub(/ U(0xC6), - "Aacute" => U(0xC1), - "Acirc" => U(0xC2), - "Agrave" => U(0xC0), - "Alpha" => U(0x0391), - "Aring" => U(0xC5), - "Atilde" => U(0xC3), - "Auml" => U(0xC4), - "Beta" => U(0x0392), - "Ccedil" => U(0xC7), - "Chi" => U(0x03A7), - "Dagger" => U(0x2021), - "Delta" => U(0x0394), - "ETH" => U(0xD0), - "Eacute" => U(0xC9), - "Ecirc" => U(0xCA), - "Egrave" => U(0xC8), - "Epsilon" => U(0x0395), - "Eta" => U(0x0397), - "Euml" => U(0xCB), - "Gamma" => U(0x0393), - "Iacute" => U(0xCD), - "Icirc" => U(0xCE), - "Igrave" => U(0xCC), - "Iota" => U(0x0399), - "Iuml" => U(0xCF), - "Kappa" => U(0x039A), - "Lambda" => U(0x039B), - "Mu" => U(0x039C), - "Ntilde" => U(0xD1), - "Nu" => U(0x039D), - "OElig" => U(0x0152), - "Oacute" => U(0xD3), - "Ocirc" => U(0xD4), - "Ograve" => U(0xD2), - "Omega" => U(0x03A9), - "Omicron" => U(0x039F), - "Oslash" => U(0xD8), - "Otilde" => U(0xD5), - "Ouml" => U(0xD6), - "Phi" => U(0x03A6), - "Pi" => U(0x03A0), - "Prime" => U(0x2033), - "Psi" => U(0x03A8), - "Rho" => U(0x03A1), - "Scaron" => U(0x0160), - "Sigma" => U(0x03A3), - "THORN" => U(0xDE), - "Tau" => U(0x03A4), - "Theta" => U(0x0398), - "Uacute" => U(0xDA), - "Ucirc" => U(0xDB), - "Ugrave" => U(0xD9), - "Upsilon" => U(0x03A5), - "Uuml" => U(0xDC), - "Xi" => U(0x039E), - "Yacute" => U(0xDD), - "Yuml" => U(0x0178), - "Zeta" => U(0x0396), - "aacute" => U(0xE1), - "acirc" => U(0xE2), - "acute" => U(0xB4), - "aelig" => U(0xE6), - "agrave" => U(0xE0), - "alefsym" => U(0x2135), - "alpha" => U(0x03B1), - "amp" => U(0x26), - "AMP" => U(0x26), - "and" => U(0x2227), - "ang" => U(0x2220), - "apos" => U(0x27), - "aring" => U(0xE5), - "asymp" => U(0x2248), - "atilde" => U(0xE3), - "auml" => U(0xE4), - "bdquo" => U(0x201E), - "beta" => U(0x03B2), - "brvbar" => U(0xA6), - "bull" => U(0x2022), - "cap" => U(0x2229), - "ccedil" => U(0xE7), - "cedil" => U(0xB8), - "cent" => U(0xA2), - "chi" => U(0x03C7), - "circ" => U(0x02C6), - "clubs" => U(0x2663), - "cong" => U(0x2245), - "copy" => U(0xA9), - "COPY" => U(0xA9), - "crarr" => U(0x21B5), - "cup" => U(0x222A), - "curren" => U(0xA4), - "dArr" => U(0x21D3), - "dagger" => U(0x2020), - "darr" => U(0x2193), - "deg" => U(0xB0), - "delta" => U(0x03B4), - "diams" => U(0x2666), - "divide" => U(0xF7), - "eacute" => U(0xE9), - "ecirc" => U(0xEA), - "egrave" => U(0xE8), - "empty" => U(0x2205), - "emsp" => U(0x2003), - "ensp" => U(0x2002), - "epsilon" => U(0x03B5), - "equiv" => U(0x2261), - "eta" => U(0x03B7), - "eth" => U(0xF0), - "euml" => U(0xEB), - "euro" => U(0x20AC), - "exist" => U(0x2203), - "fnof" => U(0x0192), - "forall" => U(0x2200), - "frac12" => U(0xBD), - "frac14" => U(0xBC), - "frac34" => U(0xBE), - "frasl" => U(0x2044), - "gamma" => U(0x03B3), - "ge" => U(0x2265), - "gt" => U(0x3E), - "GT" => U(0x3E), - "hArr" => U(0x21D4), - "harr" => U(0x2194), - "hearts" => U(0x2665), - "hellip" => U(0x2026), - "iacute" => U(0xED), - "icirc" => U(0xEE), - "iexcl" => U(0xA1), - "igrave" => U(0xEC), - "image" => U(0x2111), - "infin" => U(0x221E), - "int" => U(0x222B), - "iota" => U(0x03B9), - "iquest" => U(0xBF), - "isin" => U(0x2208), - "iuml" => U(0xEF), - "kappa" => U(0x03BA), - "lArr" => U(0x21D0), - "lambda" => U(0x03BB), - "lang" => U(0x2329), - "laquo" => U(0xAB), - "larr" => U(0x2190), - "lceil" => U(0x2308), - "ldquo" => U(0x201C), - "le" => U(0x2264), - "lfloor" => U(0x230A), - "lowast" => U(0x2217), - "loz" => U(0x25CA), - "lrm" => U(0x200E), - "lsaquo" => U(0x2039), - "lsquo" => U(0x2018), - "lt" => U(0x3C), - "LT" => U(0x3C), - "macr" => U(0xAF), - "mdash" => U(0x2014), - "micro" => U(0xB5), - "middot" => U(0xB7), - "minus" => U(0x2212), - "mu" => U(0x03BC), - "nabla" => U(0x2207), - "nbsp" => U(0xA0), - "ndash" => U(0x2013), - "ne" => U(0x2260), - "ni" => U(0x220B), - "not" => U(0xAC), - "notin" => U(0x2209), - "nsub" => U(0x2284), - "ntilde" => U(0xF1), - "nu" => U(0x03BD), - "oacute" => U(0xF3), - "ocirc" => U(0xF4), - "oelig" => U(0x0153), - "ograve" => U(0xF2), - "oline" => U(0x203E), - "omega" => U(0x03C9), - "omicron" => U(0x03BF), - "oplus" => U(0x2295), - "or" => U(0x2228), - "ordf" => U(0xAA), - "ordm" => U(0xBA), - "oslash" => U(0xF8), - "otilde" => U(0xF5), - "otimes" => U(0x2297), - "ouml" => U(0xF6), - "para" => U(0xB6), - "part" => U(0x2202), - "permil" => U(0x2030), - "perp" => U(0x22A5), - "phi" => U(0x03C6), - "pi" => U(0x03C0), - "piv" => U(0x03D6), - "plusmn" => U(0xB1), - "pound" => U(0xA3), - "prime" => U(0x2032), - "prod" => U(0x220F), - "prop" => U(0x221D), - "psi" => U(0x03C8), - "quot" => U(0x22), - "QUOT" => U(0x22), - "rArr" => U(0x21D2), - "radic" => U(0x221A), - "rang" => U(0x232A), - "raquo" => U(0xBB), - "rarr" => U(0x2192), - "rceil" => U(0x2309), - "rdquo" => U(0x201D), - "real" => U(0x211C), - "reg" => U(0xAE), - "REG" => U(0xAE), - "rfloor" => U(0x230B), - "rho" => U(0x03C1), - "rlm" => U(0x200F), - "rsaquo" => U(0x203A), - "rsquo" => U(0x2019), - "sbquo" => U(0x201A), - "scaron" => U(0x0161), - "sdot" => U(0x22C5), - "sect" => U(0xA7), - "shy" => U(0xAD), - "sigma" => U(0x03C3), - "sigmaf" => U(0x03C2), - "sim" => U(0x223C), - "spades" => U(0x2660), - "sub" => U(0x2282), - "sube" => U(0x2286), - "sum" => U(0x2211), - "sup" => U(0x2283), - "sup1" => U(0xB9), - "sup2" => U(0xB2), - "sup3" => U(0xB3), - "supe" => U(0x2287), - "szlig" => U(0xDF), - "tau" => U(0x03C4), - "there4" => U(0x2234), - "theta" => U(0x03B8), - "thetasym" => U(0x03D1), - "thinsp" => U(0x2009), - "thorn" => U(0xFE), - "tilde" => U(0x02DC), - "times" => U(0xD7), - "trade" => U(0x2122), - "uArr" => U(0x21D1), - "uacute" => U(0xFA), - "uarr" => U(0x2191), - "ucirc" => U(0xFB), - "ugrave" => U(0xF9), - "uml" => U(0xA8), - "upsih" => U(0x03D2), - "upsilon" => U(0x03C5), - "uuml" => U(0xFC), - "weierp" => U(0x2118), - "xi" => U(0x03BE), - "yacute" => U(0xFD), - "yen" => U(0xA5), - "yuml" => U(0xFF), - "zeta" => U(0x03B6), - "zwj" => U(0x200D), - "zwnj" => U(0x200C) -} + ENTITIES = { + "AElig" => U(0xC6), + "Aacute" => U(0xC1), + "Acirc" => U(0xC2), + "Agrave" => U(0xC0), + "Alpha" => U(0x0391), + "Aring" => U(0xC5), + "Atilde" => U(0xC3), + "Auml" => U(0xC4), + "Beta" => U(0x0392), + "Ccedil" => U(0xC7), + "Chi" => U(0x03A7), + "Dagger" => U(0x2021), + "Delta" => U(0x0394), + "ETH" => U(0xD0), + "Eacute" => U(0xC9), + "Ecirc" => U(0xCA), + "Egrave" => U(0xC8), + "Epsilon" => U(0x0395), + "Eta" => U(0x0397), + "Euml" => U(0xCB), + "Gamma" => U(0x0393), + "Iacute" => U(0xCD), + "Icirc" => U(0xCE), + "Igrave" => U(0xCC), + "Iota" => U(0x0399), + "Iuml" => U(0xCF), + "Kappa" => U(0x039A), + "Lambda" => U(0x039B), + "Mu" => U(0x039C), + "Ntilde" => U(0xD1), + "Nu" => U(0x039D), + "OElig" => U(0x0152), + "Oacute" => U(0xD3), + "Ocirc" => U(0xD4), + "Ograve" => U(0xD2), + "Omega" => U(0x03A9), + "Omicron" => U(0x039F), + "Oslash" => U(0xD8), + "Otilde" => U(0xD5), + "Ouml" => U(0xD6), + "Phi" => U(0x03A6), + "Pi" => U(0x03A0), + "Prime" => U(0x2033), + "Psi" => U(0x03A8), + "Rho" => U(0x03A1), + "Scaron" => U(0x0160), + "Sigma" => U(0x03A3), + "THORN" => U(0xDE), + "Tau" => U(0x03A4), + "Theta" => U(0x0398), + "Uacute" => U(0xDA), + "Ucirc" => U(0xDB), + "Ugrave" => U(0xD9), + "Upsilon" => U(0x03A5), + "Uuml" => U(0xDC), + "Xi" => U(0x039E), + "Yacute" => U(0xDD), + "Yuml" => U(0x0178), + "Zeta" => U(0x0396), + "aacute" => U(0xE1), + "acirc" => U(0xE2), + "acute" => U(0xB4), + "aelig" => U(0xE6), + "agrave" => U(0xE0), + "alefsym" => U(0x2135), + "alpha" => U(0x03B1), + "amp" => U(0x26), + "AMP" => U(0x26), + "and" => U(0x2227), + "ang" => U(0x2220), + "apos" => U(0x27), + "aring" => U(0xE5), + "asymp" => U(0x2248), + "atilde" => U(0xE3), + "auml" => U(0xE4), + "bdquo" => U(0x201E), + "beta" => U(0x03B2), + "brvbar" => U(0xA6), + "bull" => U(0x2022), + "cap" => U(0x2229), + "ccedil" => U(0xE7), + "cedil" => U(0xB8), + "cent" => U(0xA2), + "chi" => U(0x03C7), + "circ" => U(0x02C6), + "clubs" => U(0x2663), + "cong" => U(0x2245), + "copy" => U(0xA9), + "COPY" => U(0xA9), + "crarr" => U(0x21B5), + "cup" => U(0x222A), + "curren" => U(0xA4), + "dArr" => U(0x21D3), + "dagger" => U(0x2020), + "darr" => U(0x2193), + "deg" => U(0xB0), + "delta" => U(0x03B4), + "diams" => U(0x2666), + "divide" => U(0xF7), + "eacute" => U(0xE9), + "ecirc" => U(0xEA), + "egrave" => U(0xE8), + "empty" => U(0x2205), + "emsp" => U(0x2003), + "ensp" => U(0x2002), + "epsilon" => U(0x03B5), + "equiv" => U(0x2261), + "eta" => U(0x03B7), + "eth" => U(0xF0), + "euml" => U(0xEB), + "euro" => U(0x20AC), + "exist" => U(0x2203), + "fnof" => U(0x0192), + "forall" => U(0x2200), + "frac12" => U(0xBD), + "frac14" => U(0xBC), + "frac34" => U(0xBE), + "frasl" => U(0x2044), + "gamma" => U(0x03B3), + "ge" => U(0x2265), + "gt" => U(0x3E), + "GT" => U(0x3E), + "hArr" => U(0x21D4), + "harr" => U(0x2194), + "hearts" => U(0x2665), + "hellip" => U(0x2026), + "iacute" => U(0xED), + "icirc" => U(0xEE), + "iexcl" => U(0xA1), + "igrave" => U(0xEC), + "image" => U(0x2111), + "infin" => U(0x221E), + "int" => U(0x222B), + "iota" => U(0x03B9), + "iquest" => U(0xBF), + "isin" => U(0x2208), + "iuml" => U(0xEF), + "kappa" => U(0x03BA), + "lArr" => U(0x21D0), + "lambda" => U(0x03BB), + "lang" => U(0x2329), + "laquo" => U(0xAB), + "larr" => U(0x2190), + "lceil" => U(0x2308), + "ldquo" => U(0x201C), + "le" => U(0x2264), + "lfloor" => U(0x230A), + "lowast" => U(0x2217), + "loz" => U(0x25CA), + "lrm" => U(0x200E), + "lsaquo" => U(0x2039), + "lsquo" => U(0x2018), + "lt" => U(0x3C), + "LT" => U(0x3C), + "macr" => U(0xAF), + "mdash" => U(0x2014), + "micro" => U(0xB5), + "middot" => U(0xB7), + "minus" => U(0x2212), + "mu" => U(0x03BC), + "nabla" => U(0x2207), + "nbsp" => U(0xA0), + "ndash" => U(0x2013), + "ne" => U(0x2260), + "ni" => U(0x220B), + "not" => U(0xAC), + "notin" => U(0x2209), + "nsub" => U(0x2284), + "ntilde" => U(0xF1), + "nu" => U(0x03BD), + "oacute" => U(0xF3), + "ocirc" => U(0xF4), + "oelig" => U(0x0153), + "ograve" => U(0xF2), + "oline" => U(0x203E), + "omega" => U(0x03C9), + "omicron" => U(0x03BF), + "oplus" => U(0x2295), + "or" => U(0x2228), + "ordf" => U(0xAA), + "ordm" => U(0xBA), + "oslash" => U(0xF8), + "otilde" => U(0xF5), + "otimes" => U(0x2297), + "ouml" => U(0xF6), + "para" => U(0xB6), + "part" => U(0x2202), + "permil" => U(0x2030), + "perp" => U(0x22A5), + "phi" => U(0x03C6), + "pi" => U(0x03C0), + "piv" => U(0x03D6), + "plusmn" => U(0xB1), + "pound" => U(0xA3), + "prime" => U(0x2032), + "prod" => U(0x220F), + "prop" => U(0x221D), + "psi" => U(0x03C8), + "quot" => U(0x22), + "QUOT" => U(0x22), + "rArr" => U(0x21D2), + "radic" => U(0x221A), + "rang" => U(0x232A), + "raquo" => U(0xBB), + "rarr" => U(0x2192), + "rceil" => U(0x2309), + "rdquo" => U(0x201D), + "real" => U(0x211C), + "reg" => U(0xAE), + "REG" => U(0xAE), + "rfloor" => U(0x230B), + "rho" => U(0x03C1), + "rlm" => U(0x200F), + "rsaquo" => U(0x203A), + "rsquo" => U(0x2019), + "sbquo" => U(0x201A), + "scaron" => U(0x0161), + "sdot" => U(0x22C5), + "sect" => U(0xA7), + "shy" => U(0xAD), + "sigma" => U(0x03C3), + "sigmaf" => U(0x03C2), + "sim" => U(0x223C), + "spades" => U(0x2660), + "sub" => U(0x2282), + "sube" => U(0x2286), + "sum" => U(0x2211), + "sup" => U(0x2283), + "sup1" => U(0xB9), + "sup2" => U(0xB2), + "sup3" => U(0xB3), + "supe" => U(0x2287), + "szlig" => U(0xDF), + "tau" => U(0x03C4), + "there4" => U(0x2234), + "theta" => U(0x03B8), + "thetasym" => U(0x03D1), + "thinsp" => U(0x2009), + "thorn" => U(0xFE), + "tilde" => U(0x02DC), + "times" => U(0xD7), + "trade" => U(0x2122), + "uArr" => U(0x21D1), + "uacute" => U(0xFA), + "uarr" => U(0x2191), + "ucirc" => U(0xFB), + "ugrave" => U(0xF9), + "uml" => U(0xA8), + "upsih" => U(0x03D2), + "upsilon" => U(0x03C5), + "uuml" => U(0xFC), + "weierp" => U(0x2118), + "xi" => U(0x03BE), + "yacute" => U(0xFD), + "yen" => U(0xA5), + "yuml" => U(0xFF), + "zeta" => U(0x03B6), + "zwj" => U(0x200D), + "zwnj" => U(0x200C) + } -ENCODINGS = %w[ - ansi_x3.4-1968 - iso-ir-6 - ansi_x3.4-1986 - iso_646.irv:1991 - ascii - iso646-us - us-ascii - us - ibm367 - cp367 - csascii - ks_c_5601-1987 - korean - iso-2022-kr - csiso2022kr - euc-kr - iso-2022-jp - csiso2022jp - iso-2022-jp-2 - iso-ir-58 - chinese - csiso58gb231280 - iso_8859-1:1987 - iso-ir-100 - iso_8859-1 - iso-8859-1 - latin1 - l1 - ibm819 - cp819 - csisolatin1 - iso_8859-2:1987 - iso-ir-101 - iso_8859-2 - iso-8859-2 - latin2 - l2 - csisolatin2 - iso_8859-3:1988 - iso-ir-109 - iso_8859-3 - iso-8859-3 - latin3 - l3 - csisolatin3 - iso_8859-4:1988 - iso-ir-110 - iso_8859-4 - iso-8859-4 - latin4 - l4 - csisolatin4 - iso_8859-6:1987 - iso-ir-127 - iso_8859-6 - iso-8859-6 - ecma-114 - asmo-708 - arabic - csisolatinarabic - iso_8859-7:1987 - iso-ir-126 - iso_8859-7 - iso-8859-7 - elot_928 - ecma-118 - greek - greek8 - csisolatingreek - iso_8859-8:1988 - iso-ir-138 - iso_8859-8 - iso-8859-8 - hebrew - csisolatinhebrew - iso_8859-5:1988 - iso-ir-144 - iso_8859-5 - iso-8859-5 - cyrillic - csisolatincyrillic - iso_8859-9:1989 - iso-ir-148 - iso_8859-9 - iso-8859-9 - latin5 - l5 - csisolatin5 - iso-8859-10 - iso-ir-157 - l6 - iso_8859-10:1992 - csisolatin6 - latin6 - hp-roman8 - roman8 - r8 - ibm037 - cp037 - csibm037 - ibm424 - cp424 - csibm424 - ibm437 - cp437 - 437 - cspc8codepage437 - ibm500 - cp500 - csibm500 - ibm775 - cp775 - cspc775baltic - ibm850 - cp850 - 850 - cspc850multilingual - ibm852 - cp852 - 852 - cspcp852 - ibm855 - cp855 - 855 - csibm855 - ibm857 - cp857 - 857 - csibm857 - ibm860 - cp860 - 860 - csibm860 - ibm861 - cp861 - 861 - cp-is - csibm861 - ibm862 - cp862 - 862 - cspc862latinhebrew - ibm863 - cp863 - 863 - csibm863 - ibm864 - cp864 - csibm864 - ibm865 - cp865 - 865 - csibm865 - ibm866 - cp866 - 866 - csibm866 - ibm869 - cp869 - 869 - cp-gr - csibm869 - ibm1026 - cp1026 - csibm1026 - koi8-r - cskoi8r - koi8-u - big5-hkscs - ptcp154 - csptcp154 - pt154 - cp154 - utf-7 - utf-16be - utf-16le - utf-16 - utf-8 - iso-8859-13 - iso-8859-14 - iso-ir-199 - iso_8859-14:1998 - iso_8859-14 - latin8 - iso-celtic - l8 - iso-8859-15 - iso_8859-15 - iso-8859-16 - iso-ir-226 - iso_8859-16:2001 - iso_8859-16 - latin10 - l10 - gbk - cp936 - ms936 - gb18030 - shift_jis - ms_kanji - csshiftjis - euc-jp - gb2312 - big5 - csbig5 - windows-1250 - windows-1251 - windows-1252 - windows-1253 - windows-1254 - windows-1255 - windows-1256 - windows-1257 - windows-1258 - tis-620 - hz-gb-2312 -] + ENCODINGS = %w[ + ansi_x3.4-1968 + iso-ir-6 + ansi_x3.4-1986 + iso_646.irv:1991 + ascii + iso646-us + us-ascii + us + ibm367 + cp367 + csascii + ks_c_5601-1987 + korean + iso-2022-kr + csiso2022kr + euc-kr + iso-2022-jp + csiso2022jp + iso-2022-jp-2 + iso-ir-58 + chinese + csiso58gb231280 + iso_8859-1:1987 + iso-ir-100 + iso_8859-1 + iso-8859-1 + latin1 + l1 + ibm819 + cp819 + csisolatin1 + iso_8859-2:1987 + iso-ir-101 + iso_8859-2 + iso-8859-2 + latin2 + l2 + csisolatin2 + iso_8859-3:1988 + iso-ir-109 + iso_8859-3 + iso-8859-3 + latin3 + l3 + csisolatin3 + iso_8859-4:1988 + iso-ir-110 + iso_8859-4 + iso-8859-4 + latin4 + l4 + csisolatin4 + iso_8859-6:1987 + iso-ir-127 + iso_8859-6 + iso-8859-6 + ecma-114 + asmo-708 + arabic + csisolatinarabic + iso_8859-7:1987 + iso-ir-126 + iso_8859-7 + iso-8859-7 + elot_928 + ecma-118 + greek + greek8 + csisolatingreek + iso_8859-8:1988 + iso-ir-138 + iso_8859-8 + iso-8859-8 + hebrew + csisolatinhebrew + iso_8859-5:1988 + iso-ir-144 + iso_8859-5 + iso-8859-5 + cyrillic + csisolatincyrillic + iso_8859-9:1989 + iso-ir-148 + iso_8859-9 + iso-8859-9 + latin5 + l5 + csisolatin5 + iso-8859-10 + iso-ir-157 + l6 + iso_8859-10:1992 + csisolatin6 + latin6 + hp-roman8 + roman8 + r8 + ibm037 + cp037 + csibm037 + ibm424 + cp424 + csibm424 + ibm437 + cp437 + 437 + cspc8codepage437 + ibm500 + cp500 + csibm500 + ibm775 + cp775 + cspc775baltic + ibm850 + cp850 + 850 + cspc850multilingual + ibm852 + cp852 + 852 + cspcp852 + ibm855 + cp855 + 855 + csibm855 + ibm857 + cp857 + 857 + csibm857 + ibm860 + cp860 + 860 + csibm860 + ibm861 + cp861 + 861 + cp-is + csibm861 + ibm862 + cp862 + 862 + cspc862latinhebrew + ibm863 + cp863 + 863 + csibm863 + ibm864 + cp864 + csibm864 + ibm865 + cp865 + 865 + csibm865 + ibm866 + cp866 + 866 + csibm866 + ibm869 + cp869 + 869 + cp-gr + csibm869 + ibm1026 + cp1026 + csibm1026 + koi8-r + cskoi8r + koi8-u + big5-hkscs + ptcp154 + csptcp154 + pt154 + cp154 + utf-7 + utf-16be + utf-16le + utf-16 + utf-8 + iso-8859-13 + iso-8859-14 + iso-ir-199 + iso_8859-14:1998 + iso_8859-14 + latin8 + iso-celtic + l8 + iso-8859-15 + iso_8859-15 + iso-8859-16 + iso-ir-226 + iso_8859-16:2001 + iso_8859-16 + latin10 + l10 + gbk + cp936 + ms936 + gb18030 + shift_jis + ms_kanji + csshiftjis + euc-jp + gb2312 + big5 + csbig5 + windows-1250 + windows-1251 + windows-1252 + windows-1253 + windows-1254 + windows-1255 + windows-1256 + windows-1257 + windows-1258 + tis-620 + hz-gb-2312 + ] end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb index abbb89a6..178ed574 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb @@ -2,141 +2,131 @@ require 'html5lib/constants' require 'html5lib/tokenizer' require 'html5lib/treebuilders/rexml' +Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path| + require 'html5lib/html5parser/' + File.basename(path) +end + module HTML5lib -# HTML parser. Generates a tree structure from a stream of (possibly -# malformed) HTML -class HTMLParser + # Error in parsed document + class ParseError < Exception; end + class AssertionError < Exception; end + + # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML + # + class HTMLParser attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable attr_reader :phases, :tokenizer, :tree, :errors - # convenience methods def self.parse(stream, options = {}) - encoding = options.delete(:encoding) - new(options).parse(stream,encoding) + encoding = options.delete(:encoding) + new(options).parse(stream,encoding) end def self.parseFragment(stream, options = {}) - container = options.delete(:container) || 'div' - encoding = options.delete(:encoding) - new(options).parseFragment(stream,container,encoding) + container = options.delete(:container) || 'div' + encoding = options.delete(:encoding) + new(options).parseFragment(stream,container,encoding) end - @@phases = [ - :initial, - :rootElement, - :beforeHead, - :inHead, - :afterHead, - :inBody, - :inTable, - :inCaption, - :inColumnGroup, - :inTableBody, - :inRow, - :inCell, - :inSelect, - :afterBody, - :inFrameset, - :afterFrameset, - :trailingEnd - ] + @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption + inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd ) # :strict - raise an exception when a parse error is encountered # :tree - a treebuilder class controlling the type of tree that will be # returned. Built in treebuilders can be accessed through # html5lib.treebuilders.getTreeBuilder(treeType) def initialize(options = {}) - @strict = false - @errors = [] - - @tokenizer = HTMLTokenizer - @tree = TreeBuilders::REXMLTree::TreeBuilder + @strict = false + @errors = [] + + @tokenizer = HTMLTokenizer + @tree = TreeBuilders::REXMLTree::TreeBuilder - options.each { |name, value| instance_variable_set("@#{name}", value) } + options.each { |name, value| instance_variable_set("@#{name}", value) } - @tree = @tree.new + @tree = @tree.new - @phases = @@phases.inject({}) do |phases, symbol| - class_name = symbol.to_s.sub(/(.)/) { $1.upcase } + 'Phase' - phases[symbol] = HTML5lib.const_get(class_name).new(self, @tree) - phases - end + @phases = @@phases.inject({}) do |phases, phase_name| + phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase' + phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree) + phases + end end def _parse(stream, innerHTML, encoding, container = 'div') - @tree.reset - @firstStartTag = false - @errors = [] + @tree.reset + @firstStartTag = false + @errors = [] - @tokenizer = @tokenizer.class unless Class === @tokenizer - @tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML) + @tokenizer = @tokenizer.class unless Class === @tokenizer + @tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML) - if innerHTML - case @innerHTML = container.downcase - when 'title', 'textarea' - @tokenizer.contentModelFlag = :RCDATA - when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' - @tokenizer.contentModelFlag = :CDATA - when 'plaintext' - @tokenizer.contentModelFlag = :PLAINTEXT - else - # contentModelFlag already is PCDATA - #@tokenizer.contentModelFlag = :PCDATA - end - - @phase = @phases[:rootElement] - @phase.insertHtmlElement - resetInsertionMode - else - @innerHTML = false - @phase = @phases[:initial] + if innerHTML + case @innerHTML = container.downcase + when 'title', 'textarea' + @tokenizer.contentModelFlag = :RCDATA + when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' + @tokenizer.contentModelFlag = :CDATA + when 'plaintext' + @tokenizer.contentModelFlag = :PLAINTEXT + else + # contentModelFlag already is PCDATA + #@tokenizer.contentModelFlag = :PCDATA end + + @phase = @phases[:rootElement] + @phase.insertHtmlElement + resetInsertionMode + else + @innerHTML = false + @phase = @phases[:initial] + end - # We only seem to have InBodyPhase testcases where the following is - # relevant ... need others too - @lastPhase = nil + # We only seem to have InBodyPhase testcases where the following is + # relevant ... need others too + @lastPhase = nil - # XXX This is temporary for the moment so there isn't any other - # changes needed for the parser to work with the iterable tokenizer - @tokenizer.each do |token| - token = normalizeToken(token) + # XXX This is temporary for the moment so there isn't any other + # changes needed for the parser to work with the iterable tokenizer + @tokenizer.each do |token| + token = normalizeToken(token) - method = 'process%s' % token[:type] + method = 'process%s' % token[:type] - case token[:type] - when :Characters, :SpaceCharacters, :Comment - @phase.send method, token[:data] - when :StartTag, :Doctype - @phase.send method, token[:name], token[:data] - when :EndTag - @phase.send method, token[:name] - else - parseError(token[:data]) - end + case token[:type] + when :Characters, :SpaceCharacters, :Comment + @phase.send method, token[:data] + when :StartTag, :Doctype + @phase.send method, token[:name], token[:data] + when :EndTag + @phase.send method, token[:name] + else + parseError(token[:data]) end + end - # When the loop finishes it's EOF - @phase.processEOF - end - - # Parse a HTML document into a well-formed tree - # - # stream - a filelike object or string containing the HTML to be parsed - # - # The optional encoding parameter must be a string that indicates - # the encoding. If specified, that encoding will be used, - # regardless of any BOM or later declaration (such as in a meta - # element) - def parse(stream, encoding = nil) - _parse(stream, false, encoding) - return @tree.getDocument + # When the loop finishes it's EOF + @phase.processEOF end - + + # Parse a HTML document into a well-formed tree + # + # stream - a filelike object or string containing the HTML to be parsed + # + # The optional encoding parameter must be a string that indicates + # the encoding. If specified, that encoding will be used, + # regardless of any BOM or later declaration (such as in a meta + # element) + def parse(stream, encoding=nil) + _parse(stream, false, encoding) + return @tree.getDocument + end + # Parse a HTML fragment into a well-formed tree fragment - + # container - name of the element we're setting the innerHTML property # if set to nil, default to 'div' # @@ -146,15 +136,15 @@ class HTMLParser # the encoding. If specified, that encoding will be used, # regardless of any BOM or later declaration (such as in a meta # element) - def parseFragment(stream, container = 'div', encoding = nil) - _parse(stream, true, encoding, container) - return @tree.getFragment + def parseFragment(stream, container='div', encoding=nil) + _parse(stream, true, encoding, container) + return @tree.getFragment end def parseError(data = 'XXX ERROR MESSAGE NEEDED') - # XXX The idea is to make data mandatory. - @errors.push([@tokenizer.stream.position, data]) - raise ParseError if @strict + # XXX The idea is to make data mandatory. + @errors.push([@tokenizer.stream.position, data]) + raise ParseError if @strict end # This error is not an error @@ -164,1857 +154,95 @@ class HTMLParser # HTML5 specific normalizations to the token stream def normalizeToken(token) - if token[:type] == :EmptyTag - # When a solidus (/) is encountered within a tag name what happens - # depends on whether the current tag name matches that of a void - # element. If it matches a void element atheists did the wrong - # thing and if it doesn't it's wrong for everyone. + if token[:type] == :EmptyTag + # When a solidus (/) is encountered within a tag name what happens + # depends on whether the current tag name matches that of a void + # element. If it matches a void element atheists did the wrong + # thing and if it doesn't it's wrong for everyone. - if VOID_ELEMENTS.include?(token[:name]) - atheistParseError - else - parseError(_('Solidus (/) incorrectly placed in tag.')) - end - - token[:type] = :StartTag + if VOID_ELEMENTS.include?(token[:name]) + atheistParseError + else + parseError(_('Solidus (/) incorrectly placed in tag.')) end - if token[:type] == :StartTag - token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE) + token[:type] = :StartTag + end - # We need to remove the duplicate attributes and convert attributes - # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} + if token[:type] == :StartTag + token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE) - if token[:data].length - token[:data] = Hash[*token[:data].reverse.map {|attr,value| - [attr.tr(ASCII_UPPERCASE,ASCII_LOWERCASE),value] - }.flatten] - else - token[:data] = {} - end + # We need to remove the duplicate attributes and convert attributes + # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} - elsif token[:type] == :EndTag - parseError(_('End tag contains unexpected attributes.')) if token[:data] - token[:name] = token[:name].downcase + unless token[:data].empty? + data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] } + token[:data] = Hash[*data.flatten] end - return token + elsif token[:type] == :EndTag + parseError(_('End tag contains unexpected attributes.')) if token[:data] + token[:name] = token[:name].downcase + end + + return token end @@new_modes = { - 'select' => :inSelect, - 'td' => :inCell, - 'th' => :inCell, - 'tr' => :inRow, - 'tbody' => :inTableBody, - 'thead' => :inTableBody, - 'tfoot' => :inTableBody, - 'caption' => :inCaption, - 'colgroup' => :inColumnGroup, - 'table' => :inTable, - 'head' => :inBody, - 'body' => :inBody, - 'frameset' => :inFrameset + 'select' => :inSelect, + 'td' => :inCell, + 'th' => :inCell, + 'tr' => :inRow, + 'tbody' => :inTableBody, + 'thead' => :inTableBody, + 'tfoot' => :inTableBody, + 'caption' => :inCaption, + 'colgroup' => :inColumnGroup, + 'table' => :inTable, + 'head' => :inBody, + 'body' => :inBody, + 'frameset' => :inFrameset } def resetInsertionMode - # The name of this method is mostly historical. (It's also used in the - # specification.) - last = false + # The name of this method is mostly historical. (It's also used in the + # specification.) + last = false - @tree.openElements.reverse.each do |node| - nodeName = node.name + @tree.openElements.reverse.each do |node| + nodeName = node.name - if node == @tree.openElements[0] - last = true - unless ['td', 'th'].include?(nodeName) - # XXX - # assert @innerHTML - nodeName = @innerHTML - end - end - - # Check for conditions that should only happen in the innerHTML - # case - if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName) - # XXX - # assert @innerHTML - end - - if @@new_modes.has_key?(nodeName) - @phase = @phases[@@new_modes[nodeName]] - elsif nodeName == 'html' - @phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead] - elsif last - @phase = @phases[:inBody] - else - next - end - - break + if node == @tree.openElements[0] + last = true + unless ['td', 'th'].include?(nodeName) + # XXX + # assert @innerHTML + nodeName = @innerHTML + end end + + # Check for conditions that should only happen in the innerHTML + # case + if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName) + # XXX + # assert @innerHTML + end + + if @@new_modes.has_key?(nodeName) + @phase = @phases[@@new_modes[nodeName]] + elsif nodeName == 'html' + @phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead] + elsif last + @phase = @phases[:inBody] + else + next + end + + break + end end def _(string); string; end -end - -# Base class for helper object that implements each phase of processing -class Phase - # Order should be (they can be omitted) - # * EOF - # * Comment - # * Doctype - # * SpaceCharacters - # * Characters - # * StartTag - # - startTag* methods - # * EndTag - # - endTag* methods - - def self.tag_handler_map(default,array) - array.inject(Hash.new(default)) do |map, (names, value)| - names = [names] unless Array === names - names.each { |name| map[name] = value } - map - end - end - - def self.start_tag_handlers - @start_tag_handlers - end - - def self.handle_start(tags) - @start_tag_handlers = tag_handler_map(:startTagOther, tags) - end - - def self.end_tag_handlers - @end_tag_handlers - end - - def self.handle_end(tags) - @end_tag_handlers = tag_handler_map(:endTagOther, tags) - end - - def initialize(parser, tree) - @parser = parser - @tree = tree - end - - def processEOF - @tree.generateImpliedEndTags - - if @tree.openElements.length > 2 - @parser.parseError(_('Unexpected end of file. Missing closing tags.')) - elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body' - # This happens for framesets or something? - @parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first.")) - elsif @parser.innerHTML and @tree.openElements.length > 1 - # XXX This is not what the specification says. Not sure what to do here. - @parser.parseError(_('XXX innerHTML EOF')) - end - # Betting ends. - end - - def processComment(data) - # For most phases the following is correct. Where it's not it will be - # overridden. - @tree.insertComment(data, @tree.openElements[-1]) - end - - def processDoctype(name, error) - @parser.parseError(_('Unexpected DOCTYPE. Ignored.')) - end - - def processSpaceCharacters(data) - @tree.insertText(data) - end - - def processStartTag(name, attributes) - send self.class.start_tag_handlers[name], name, attributes - end - - def startTagHtml(name, attributes) - if @parser.firstStartTag == false and name == 'html' - @parser.parseError(_('html needs to be the first start tag.')) - end - # XXX Need a check here to see if the first start tag token emitted is - # this token... If it's not, invoke @parser.parseError. - attributes.each do |attr, value| - unless @tree.openElements[0].attributes.has_key?(attr) - @tree.openElements[0].attributes[attr] = value - end - end - @parser.firstStartTag = false - end - - def processEndTag(name) - send self.class.end_tag_handlers[name], name - end - - def _(string) - string - end - - def assert(value) - throw AssertionError.new unless value - end - - def in_scope?(*args) - @tree.elementInScope(*args) - end - - def remove_open_elements_until(name = nil) - finished = false - until finished - element = @tree.openElements.pop - finished = name.nil?? yield(element) : element.name == name - end - return element - end - -end - - -class InitialPhase < Phase - # This phase deals with error handling as well which is currently not - # covered in the specification. The error handling is typically known as - # "quirks mode". It is expected that a future version of HTML5 will defin - # this. - def processEOF - @parser.parseError(_('Unexpected End of file. Expected DOCTYPE.')) - @parser.phase = @parser.phases[:rootElement] - @parser.phase.processEOF - end - - def processComment(data) - @tree.insertComment(data, @tree.document) - end - - def processDoctype(name, error) - @parser.parseError(_('Erroneous DOCTYPE.')) if error - @tree.insertDoctype(name) - @parser.phase = @parser.phases[:rootElement] - end - - def processSpaceCharacters(data) - @tree.insertText(data, @tree.document) - end - - def processCharacters(data) - @parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.')) - @parser.phase = @parser.phases[:rootElement] - @parser.phase.processCharacters(data) - end - - def processStartTag(name, attributes) - @parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE.")) - @parser.phase = @parser.phases[:rootElement] - @parser.phase.processStartTag(name, attributes) - end - - def processEndTag(name) - @parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE.")) - @parser.phase = @parser.phases[:rootElement] - @parser.phase.processEndTag(name) - end -end - - -class RootElementPhase < Phase - # helper methods - def insertHtmlElement - element = @tree.createElement('html', {}) - @tree.openElements.push(element) - @tree.document.appendChild(element) - @parser.phase = @parser.phases[:beforeHead] - end - - # other - def processEOF - insertHtmlElement - @parser.phase.processEOF - end - - def processComment(data) - @tree.insertComment(data, @tree.document) - end - - def processSpaceCharacters(data) - @tree.insertText(data, @tree.document) - end - - def processCharacters(data) - insertHtmlElement - @parser.phase.processCharacters(data) - end - - def processStartTag(name, attributes) - @parser.firstStartTag = true if name == 'html' - insertHtmlElement - @parser.phase.processStartTag(name, attributes) - end - - def processEndTag(name) - insertHtmlElement - @parser.phase.processEndTag(name) - end -end - - -class BeforeHeadPhase < Phase - - handle_start [ - ['html', :startTagHtml], - ['head', :startTagHead] - ] - - handle_end [ - ['html', :endTagHtml] - ] - - def processEOF - startTagHead('head', {}) - @parser.phase.processEOF - end - - def processCharacters(data) - startTagHead('head', {}) - @parser.phase.processCharacters(data) - end - - def startTagHead(name, attributes) - @tree.insertElement(name, attributes) - @tree.headPointer = @tree.openElements[-1] - @parser.phase = @parser.phases[:inHead] - end - - def startTagOther(name, attributes) - startTagHead('head', {}) - @parser.phase.processStartTag(name, attributes) - end - - def endTagHtml(name) - startTagHead('head', {}) - @parser.phase.processEndTag(name) - end - - def endTagOther(name) - @parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element.")) - end -end - -class InHeadPhase < Phase - - handle_start [ - ['html', :startTagHtml], - ['title', :startTagTitle], - ['style', :startTagStyle], - ['script', :startTagScript], - [['base', 'link', 'meta'], :startTagBaseLinkMeta], - ['head', :startTagHead] - ] - - handle_end [ - ['head', :endTagHead], - ['html', :endTagHtml], - [['title', 'style', 'script'], :endTagTitleStyleScript] - ] - - # helper - def appendToHead(element) - if @tree.headPointer.nil? - assert @parser.innerHTML - @tree.openElements[-1].appendChild(element) - else - @tree.headPointer.appendChild(element) - end - end - - # the real thing - def processEOF - if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name) - @parser.parseError(_("Unexpected end of file. Expected end tag (#{name}).")) - @tree.openElements.pop - end - anythingElse - @parser.phase.processEOF - end - - def processCharacters(data) - if ['title', 'style', 'script'].include?(@tree.openElements[-1].name) - @tree.insertText(data) - else - anythingElse - @parser.phase.processCharacters(data) - end - end - - def startTagHead(name, attributes) - @parser.parseError(_('Unexpected start tag head in existing head. Ignored')) - end - - def startTagTitle(name, attributes) - element = @tree.createElement(name, attributes) - appendToHead(element) - @tree.openElements.push(element) - @parser.tokenizer.contentModelFlag = :RCDATA - end - - def startTagStyle(name, attributes) - element = @tree.createElement(name, attributes) - if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead] - appendToHead(element) - else - @tree.openElements[-1].appendChild(element) - end - @tree.openElements.push(element) - @parser.tokenizer.contentModelFlag = :CDATA - end - - def startTagScript(name, attributes) - #XXX Inner HTML case may be wrong - element = @tree.createElement(name, attributes) - element._flags.push("parser-inserted") - if (@tree.headPointer != nil and - @parser.phase == @parser.phases[:inHead]) - appendToHead(element) - else - @tree.openElements[-1].appendChild(element) - end - @tree.openElements.push(element) - @parser.tokenizer.contentModelFlag = :CDATA - end - - def startTagBaseLinkMeta(name, attributes) - element = @tree.createElement(name, attributes) - appendToHead(element) - end - - def startTagOther(name, attributes) - anythingElse - @parser.phase.processStartTag(name, attributes) - end - - def endTagHead(name) - if @tree.openElements[-1].name == 'head' - @tree.openElements.pop - else - @parser.parseError(_("Unexpected end tag (head). Ignored.")) - end - @parser.phase = @parser.phases[:afterHead] - end - - def endTagHtml(name) - anythingElse - @parser.phase.processEndTag(name) - end - - def endTagTitleStyleScript(name) - if @tree.openElements[-1].name == name - @tree.openElements.pop - else - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - end - - def endTagOther(name) - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - - def anythingElse - if @tree.openElements[-1].name == 'head' - endTagHead('head') - else - @parser.phase = @parser.phases[:afterHead] - end - end -end - -class AfterHeadPhase < Phase - - handle_start [ - ['html', :startTagHtml], - ['body', :startTagBody], - ['frameset', :startTagFrameset], - [['base', 'link', 'meta', 'script', 'style', 'title'], :startTagFromHead] - ] - - def processEOF - anythingElse - @parser.phase.processEOF - end - - def processCharacters(data) - anythingElse - @parser.phase.processCharacters(data) - end - - def startTagBody(name, attributes) - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inBody] - end - - def startTagFrameset(name, attributes) - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inFrameset] - end - - def startTagFromHead(name, attributes) - @parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved.")) - @parser.phase = @parser.phases[:inHead] - @parser.phase.processStartTag(name, attributes) - end - - def startTagOther(name, attributes) - anythingElse - @parser.phase.processStartTag(name, attributes) - end - - def processEndTag(name) - anythingElse - @parser.phase.processEndTag(name) - end - - def anythingElse - @tree.insertElement('body', {}) - @parser.phase = @parser.phases[:inBody] - end -end - - -class InBodyPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-body - # the crazy mode - - handle_start [ - ['html', :startTagHtml], - [['script', 'style'], :startTagScriptStyle], - [['base', 'link', 'meta', 'title'], :startTagFromHead], - ['body', :startTagBody], - [['address', 'blockquote', 'center', 'dir', 'div', 'dl', - 'fieldset', 'listing', 'menu', 'ol', 'p', 'pre', 'ul'], - :startTagCloseP], - ['form', :startTagForm], - [['li', 'dd', 'dt'], :startTagListItem], - ['plaintext',:startTagPlaintext], - [HEADING_ELEMENTS, :startTagHeading], - ['a', :startTagA], - [['b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', - 'strong', 'tt', 'u'],:startTagFormatting], - ['button', :startTagButton], - [['marquee', 'object'], :startTagMarqueeObject], - ['xmp', :startTagXmp], - ['table', :startTagTable], - [['area', 'basefont', 'bgsound', 'br', 'embed', 'img', 'param', - 'spacer', 'wbr'], :startTagVoidFormatting], - ['hr', :startTagHr], - ['image', :startTagImage], - ['input', :startTagInput], - ['isindex', :startTagIsIndex], - ['textarea', :startTagTextarea], - [['iframe', 'noembed', 'noframes', 'noscript'], :startTagCdata], - ['select', :startTagSelect], - [['caption', 'col', 'colgroup', 'frame', 'frameset', 'head', - 'option', 'optgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', - 'tr'], :startTagMisplaced], - [['event-source', 'section', 'nav', 'article', 'aside', 'header', - 'footer', 'datagrid', 'command'], :startTagNew] - ] - - handle_end [ - ['p',:endTagP], - ['body',:endTagBody], - ['html',:endTagHtml], - [['address', 'blockquote', 'center', 'div', 'dl', 'fieldset', - 'listing', 'menu', 'ol', 'pre', 'ul'], :endTagBlock], - ['form', :endTagForm], - [['dd', 'dt', 'li'], :endTagListItem], - [HEADING_ELEMENTS, :endTagHeading], - [['a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', - 'strike', 'strong', 'tt', 'u'], :endTagFormatting], - [['marquee', 'object', 'button'], :endTagButtonMarqueeObject], - [['head', 'frameset', 'select', 'optgroup', 'option', 'table', - 'caption', 'colgroup', 'col', 'thead', 'tfoot', 'tbody', 'tr', - 'td', 'th'], :endTagMisplaced], - [['area', 'basefont', 'bgsound', 'br', 'embed', 'hr', 'image', - 'img', 'input', 'isindex', 'param', 'spacer', 'wbr', 'frame'], - :endTagNone], - [['noframes', 'noscript', 'noembed', 'textarea', 'xmp', 'iframe'], - :endTagCdataTextAreaXmp], - [['event-source', 'section', 'nav', 'article', 'aside', 'header', - 'footer', 'datagrid', 'command'], :endTagNew] - ] - - def initialize(parser, tree) - super(parser, tree) - - # for special handling of whitespace in
-        @processSpaceCharactersPre = false
-    end
-
-    # helper
-    def addFormattingElement(name, attributes)
-        @tree.insertElement(name, attributes)
-        @tree.activeFormattingElements.push(@tree.openElements[-1])
-    end
-
-    # the real deal
-    def processSpaceCharactersPre(data)
-        #Sometimes (start of 
 blocks) we want to drop leading newlines
-        @processSpaceCharactersPre = false
-        if (data.length > 0 and data[0] == ?\n and 
-            @tree.openElements[-1].name == 'pre' and
-            not @tree.openElements[-1].hasContent)
-            data = data[1..-1]
-        end
-        @tree.insertText(data) if data.length > 0
-    end
-
-    def processSpaceCharacters(data)
-        if @processSpaceCharactersPre
-            processSpaceCharactersPre(data)
-        else
-            super(data)
-        end
-    end
-
-    def processCharacters(data)
-        # XXX The specification says to do this for every character at the
-        # moment, but apparently that doesn't match the real world so we don't
-        # do it for space characters.
-        @tree.reconstructActiveFormattingElements
-        @tree.insertText(data)
-    end
-
-    def startTagScriptStyle(name, attributes)
-        @parser.phases[:inHead].processStartTag(name, attributes)
-    end
-
-    def startTagFromHead(name, attributes)
-        @parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
-        @parser.phases[:inHead].processStartTag(name, attributes)
-    end
-
-    def startTagBody(name, attributes)
-        @parser.parseError(_('Unexpected start tag (body).'))
-
-        if (@tree.openElements.length == 1 or
-            @tree.openElements[1].name != 'body')
-            assert @parser.innerHTML
-        else
-            attributes.each do |attr, value|
-                unless @tree.openElements[1].attributes.has_key?(attr)
-                    @tree.openElements[1].attributes[attr] = value
-                end
-            end
-        end
-    end
-
-    def startTagCloseP(name, attributes)
-        endTagP('p') if in_scope?('p')
-        @tree.insertElement(name, attributes)
-        @processSpaceCharactersPre = true if name == 'pre'
-    end
-
-    def startTagForm(name, attributes)
-        if @tree.formPointer
-            @parser.parseError('Unexpected start tag (form). Ignored.')
-        else
-            endTagP('p') if in_scope?('p')
-            @tree.insertElement(name, attributes)
-            @tree.formPointer = @tree.openElements[-1]
-        end
-    end
-
-    def startTagListItem(name, attributes)
-        endTagP('p') if in_scope?('p')
-        stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
-        stopName = stopNames[name]
-
-        @tree.openElements.reverse.each_with_index do |node,i|
-            if stopName.include?(node.name)
-                (i+1).times { @tree.openElements.pop }
-                break
-            end
-
-            # Phrasing elements are all non special, non scoping, non
-            # formatting elements
-            break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
-              not ['address', 'div'].include?(node.name))
-        end
-
-        # Always insert an 
  • element. - @tree.insertElement(name, attributes) - end - - def startTagPlaintext(name, attributes) - endTagP('p') if in_scope?('p') - @tree.insertElement(name, attributes) - @parser.tokenizer.contentModelFlag = :PLAINTEXT - end - - def startTagHeading(name, attributes) - endTagP('p') if in_scope?('p') - HEADING_ELEMENTS.each do |element| - if in_scope?(element) - @parser.parseError(_("Unexpected start tag (#{name}).")) - - remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) } - - break - end - end - @tree.insertElement(name, attributes) - end - - def startTagA(name, attributes) - if afeAElement = @tree.elementInActiveFormattingElements('a') - @parser.parseError(_('Unexpected start tag (a) implies end tag (a).')) - endTagFormatting('a') - @tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement) - @tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement) - end - @tree.reconstructActiveFormattingElements - addFormattingElement(name, attributes) - end - - def startTagFormatting(name, attributes) - @tree.reconstructActiveFormattingElements - addFormattingElement(name, attributes) - end - - def startTagButton(name, attributes) - if in_scope?('button') - @parser.parseError(_('Unexpected start tag (button) implied end tag (button).')) - processEndTag('button') - @parser.phase.processStartTag(name, attributes) - else - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - @tree.activeFormattingElements.push(Marker) - end - end - - def startTagMarqueeObject(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - @tree.activeFormattingElements.push(Marker) - end - - def startTagXmp(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - @parser.tokenizer.contentModelFlag = :CDATA - end - - def startTagTable(name, attributes) - processEndTag('p') if in_scope?('p') - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inTable] - end - - def startTagVoidFormatting(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - @tree.openElements.pop - end - - def startTagHr(name, attributes) - endTagP('p') if in_scope?('p') - @tree.insertElement(name, attributes) - @tree.openElements.pop - end - - def startTagImage(name, attributes) - # No really... - @parser.parseError(_('Unexpected start tag (image). Treated as img.')) - processStartTag('img', attributes) - end - - def startTagInput(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - if @tree.formPointer - # XXX Not exactly sure what to do here - # @tree.openElements[-1].form = @tree.formPointer - end - @tree.openElements.pop - end - - def startTagIsIndex(name, attributes) - @parser.parseError("Unexpected start tag isindex. Don't use it!") - return if @tree.formPointer - processStartTag('form', {}) - processStartTag('hr', {}) - processStartTag('p', {}) - processStartTag('label', {}) - # XXX Localization ... - processCharacters('This is a searchable index. Insert your search keywords here:') - attributes['name'] = 'isindex' - attrs = attributes.to_a - processStartTag('input', attributes) - processEndTag('label') - processEndTag('p') - processStartTag('hr', {}) - processEndTag('form') - end - - def startTagTextarea(name, attributes) - # XXX Form element pointer checking here as well... - @tree.insertElement(name, attributes) - @parser.tokenizer.contentModelFlag = :RCDATA - end - - # iframe, noembed noframes, noscript(if scripting enabled) - def startTagCdata(name, attributes) - @tree.insertElement(name, attributes) - @parser.tokenizer.contentModelFlag = :CDATA - end - - def startTagSelect(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inSelect] - end - - def startTagMisplaced(name, attributes) - # Elements that should be children of other elements that have a - # different insertion mode; here they are ignored - # "caption", "col", "colgroup", "frame", "frameset", "head", - # "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", - # "tr", "noscript" - @parser.parseError(_("Unexpected start tag (#{name}). Ignored.")) - end - - def startTagNew(name, attributes) - # New HTML5 elements, "event-source", "section", "nav", - # "article", "aside", "header", "footer", "datagrid", "command" - sys.stderr.write("Warning: Undefined behaviour for start tag #{name}") - startTagOther(name, attributes) - #raise NotImplementedError - end - - def startTagOther(name, attributes) - @tree.reconstructActiveFormattingElements - @tree.insertElement(name, attributes) - end - - def endTagP(name) - @tree.generateImpliedEndTags('p') if in_scope?('p') - @parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p' - @tree.openElements.pop while in_scope?('p') - end - - def endTagBody(name) - # XXX Need to take open

    tags into account here. We shouldn't imply - #

    but we should not throw a parse error either. Specification is - # likely to be updated. - unless @tree.openElements[1].name == 'body' - # innerHTML case - @parser.parseError - return - end - unless @tree.openElements[-1].name == 'body' - @parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name}).")) - end - @parser.phase = @parser.phases[:afterBody] - end - - def endTagHtml(name) - endTagBody(name) - @parser.phase.processEndTag(name) unless @parser.innerHTML - end - - def endTagBlock(name) - #Put us back in the right whitespace handling mode - @processSpaceCharactersPre = false if name == 'pre' - - @tree.generateImpliedEndTags if in_scope?(name) - - unless @tree.openElements[-1].name == name - @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) - end - - if in_scope?(name) - remove_open_elements_until(name) - end - end - - def endTagForm(name) - endTagBlock(name) - @tree.formPointer = nil - end - - def endTagListItem(name) - # AT Could merge this with the Block case - if in_scope?(name) - @tree.generateImpliedEndTags(name) - - unless @tree.openElements[-1].name == name - @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) - end - end - - remove_open_elements_until(name) if in_scope?(name) - end - - def endTagHeading(name) - HEADING_ELEMENTS.each do |element| - if in_scope?(element) - @tree.generateImpliedEndTags - break - end - end - - unless @tree.openElements[-1].name == name - @parser.parseError(("Unexpected end tag (#{name}). Expected other end tag.")) - end - - HEADING_ELEMENTS.each do |element| - if in_scope?(element) - remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) } - break - end - end - end - - # The much-feared adoption agency algorithm - def endTagFormatting(name) - # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency - # XXX Better parseError messages appreciated. - while true - # Step 1 paragraph 1 - afeElement = @tree.elementInActiveFormattingElements(name) - if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name)) - @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm.")) - return - # Step 1 paragraph 2 - elsif not @tree.openElements.include?(afeElement) - @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm.")) - @tree.activeFormattingElements.delete(afeElement) - return - end - - # Step 1 paragraph 3 - if afeElement != @tree.openElements[-1] - @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm.")) - end - - # Step 2 - # Start of the adoption agency algorithm proper - afeIndex = @tree.openElements.index(afeElement) - furthestBlock = nil - @tree.openElements[afeIndex..-1].each do |element| - if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name) - furthestBlock = element - break - end - end - - # Step 3 - if furthestBlock.nil? - element = remove_open_elements_until { |element| element == afeElement } - @tree.activeFormattingElements.delete(element) - return - end - commonAncestor = @tree.openElements[afeIndex-1] - - # Step 5 - furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent - - # Step 6 - # The bookmark is supposed to help us identify where to reinsert - # nodes in step 12. We have to ensure that we reinsert nodes after - # the node before the active formatting element. Note the bookmark - # can move in step 7.4 - bookmark = @tree.activeFormattingElements.index(afeElement) - - # Step 7 - lastNode = node = furthestBlock - while true - # AT replace this with a function and recursion? - # Node is element before node in open elements - node = @tree.openElements[@tree.openElements.index(node)-1] - until @tree.activeFormattingElements.include?(node) - tmpNode = node - node = @tree.openElements[@tree.openElements.index(node)-1] - @tree.openElements.delete(tmpNode) - end - # Step 7.3 - break if node == afeElement - # Step 7.4 - if lastNode == furthestBlock - # XXX should this be index(node) or index(node)+1 - # Anne: I think +1 is ok. Given x = [2,3,4,5] - # x.index(3) gives 1 and then x[1 +1] gives 4... - bookmark = @tree.activeFormattingElements.index(node) + 1 - end - # Step 7.5 - cite = node.parent - if node.hasContent - clone = node.cloneNode - # Replace node with clone - @tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone - @tree.openElements[@tree.openElements.index(node)] = clone - node = clone - end - # Step 7.6 - # Remove lastNode from its parents, if any - lastNode.parent.removeChild(lastNode) if lastNode.parent - node.appendChild(lastNode) - # Step 7.7 - lastNode = node - # End of inner loop - end - - # Step 8 - lastNode.parent.removeChild(lastNode) if lastNode.parent - commonAncestor.appendChild(lastNode) - - # Step 9 - clone = afeElement.cloneNode - - # Step 10 - furthestBlock.reparentChildren(clone) - - # Step 11 - furthestBlock.appendChild(clone) - - # Step 12 - @tree.activeFormattingElements.delete(afeElement) - @tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone) - - # Step 13 - @tree.openElements.delete(afeElement) - @tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone) - end - end - - def endTagButtonMarqueeObject(name) - @tree.generateImpliedEndTags if in_scope?(name) - - unless @tree.openElements[-1].name == name - @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first.")) - end - - if in_scope?(name) - remove_open_elements_until(name) - - @tree.clearActiveFormattingElements - end - end - - def endTagMisplaced(name) - # This handles elements with end tags in other insertion modes. - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - - def endTagNone(name) - # This handles elements with no end tag. - @parser.parseError(_("This tag (#{name}) has no end tag")) - end - - def endTagCdataTextAreaXmp(name) - if @tree.openElements[-1].name == name - @tree.openElements.pop - else - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - end - - def endTagNew(name) - # New HTML5 elements, "event-source", "section", "nav", - # "article", "aside", "header", "footer", "datagrid", "command" - STDERR.puts "Warning: Undefined behaviour for end tag #{name}" - endTagOther(name) - #raise NotImplementedError - end - - def endTagOther(name) - # XXX This logic should be moved into the treebuilder - @tree.openElements.reverse.each do |node| - if node.name == name - @tree.generateImpliedEndTags - - unless @tree.openElements[-1].name == name - @parser.parseError(_("Unexpected end tag (#{name}).")) - end - - remove_open_elements_until { |element| element == node } - - break - else - if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - break - end - end - end - end -end - -class InTablePhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-table - - handle_start [ - ['html', :startTagHtml], - ['caption', :startTagCaption], - ['colgroup', :startTagColgroup], - ['col', :startTagCol], - [['tbody', 'tfoot', 'thead'], :startTagRowGroup], - [['td', 'th', 'tr'], :startTagImplyTbody], - ['table', :startTagTable] - ] - - handle_end [ - ['table', :endTagTable], - [['body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :endTagIgnore] - ] - - # helper methods - def clearStackToTableContext - # "clear the stack back to a table context" - until ['table', 'html'].include?(name = @tree.openElements[-1].name) - @parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase.")) - @tree.openElements.pop - end - # When the current node is it's an innerHTML case - end - - # processing methods - def processCharacters(data) - @parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in - @tree.insertFromTable = true - # Process the character in the "in body" mode - @parser.phases[:inBody].processCharacters(data) - @tree.insertFromTable = false - end - - def startTagCaption(name, attributes) - clearStackToTableContext - @tree.activeFormattingElements.push(Marker) - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inCaption] - end - - def startTagColgroup(name, attributes) - clearStackToTableContext - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inColumnGroup] - end - - def startTagCol(name, attributes) - startTagColgroup('colgroup', {}) - @parser.phase.processStartTag(name, attributes) - end - - def startTagRowGroup(name, attributes) - clearStackToTableContext - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inTableBody] - end - - def startTagImplyTbody(name, attributes) - startTagRowGroup('tbody', {}) - @parser.phase.processStartTag(name, attributes) - end - - def startTagTable(name, attributes) - @parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table).")) - @parser.phase.processEndTag('table') - @parser.phase.processStartTag(name, attributes) unless @parser.innerHTML - end - - def startTagOther(name, attributes) - @parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in - @tree.insertFromTable = true - # Process the start tag in the "in body" mode - @parser.phases[:inBody].processStartTag(name, attributes) - @tree.insertFromTable = false - end - - def endTagTable(name) - if in_scope?('table', true) - @tree.generateImpliedEndTags - - unless @tree.openElements[-1].name == 'table' - @parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name}).")) - end - - remove_open_elements_until('table') - - @parser.resetInsertionMode - else - # innerHTML case - assert @parser.innerHTML - @parser.parseError - end - end - - def endTagIgnore(name) - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - - def endTagOther(name) - @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in - @parser.insertFromTable = true - # Process the end tag in the "in body" mode - @parser.phases[:inBody].processEndTag(name) - @parser.insertFromTable = false - end -end - - -class InCaptionPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-caption - - handle_start [ - ['html', :startTagHtml], - [['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :startTagTableElement] - ] - - handle_end [ - ['caption', :endTagCaption], - ['table', :endTagTable], - [['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :endTagIgnore] - ] - - def ignoreEndTagCaption - not in_scope?('caption', true) - end - - def processCharacters(data) - @parser.phases[:inBody].processCharacters(data) - end - - def startTagTableElement(name, attributes) - @parser.parseError - #XXX Have to duplicate logic here to find out if the tag is ignored - ignoreEndTag = ignoreEndTagCaption - @parser.phase.processEndTag('caption') - @parser.phase.processStartTag(name, attributes) unless ignoreEndTag - end - - def startTagOther(name, attributes) - @parser.phases[:inBody].processStartTag(name, attributes) - end - - def endTagCaption(name) - if ignoreEndTagCaption - # innerHTML case - assert @parser.innerHTML - @parser.parseError - else - # AT this code is quite similar to endTagTable in "InTable" - @tree.generateImpliedEndTags - - unless @tree.openElements[-1].name == 'caption' - @parser.parseError(_("Unexpected end tag (caption). Missing end tags.")) - end - - remove_open_elements_until('caption') - - @tree.clearActiveFormattingElements - @parser.phase = @parser.phases[:inTable] - end - end - - def endTagTable(name) - @parser.parseError - ignoreEndTag = ignoreEndTagCaption - @parser.phase.processEndTag('caption') - @parser.phase.processEndTag(name) unless ignoreEndTag - end - - def endTagIgnore(name) - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - - def endTagOther(name) - @parser.phases[:inBody].processEndTag(name) - end -end - - -class InColumnGroupPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-column - - handle_start [ - ['html', :startTagHtml], - ['col', :startTagCol] - ] - - handle_end [ - ['colgroup', :endTagColgroup], - ['col', :endTagCol] - ] - - def ignoreEndTagColgroup - @tree.openElements[-1].name == 'html' - end - - def processCharacters(data) - ignoreEndTag = ignoreEndTagColgroup - endTagColgroup("colgroup") - @parser.phase.processCharacters(data) unless ignoreEndTag - end - - def startTagCol(name, attributes) - @tree.insertElement(name, attributes) - @tree.openElements.pop - end - - def startTagOther(name, attributes) - ignoreEndTag = ignoreEndTagColgroup - endTagColgroup('colgroup') - @parser.phase.processStartTag(name, attributes) unless ignoreEndTag - end - - def endTagColgroup(name) - if ignoreEndTagColgroup - # innerHTML case - assert @parser.innerHTML - @parser.parseError - else - @tree.openElements.pop - @parser.phase = @parser.phases[:inTable] - end - end - - def endTagCol(name) - @parser.parseError(_('Unexpected end tag (col). col has no end tag.')) - end - - def endTagOther(name) - ignoreEndTag = ignoreEndTagColgroup - endTagColgroup('colgroup') - @parser.phase.processEndTag(name) unless ignoreEndTag - end -end - - -class InTableBodyPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 - - handle_start [ - ['html', :startTagHtml], - ['tr', :startTagTr], - [['td', 'th'], :startTagTableCell], - [['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'], :startTagTableOther] - ] - - handle_end [ - [['tbody', 'tfoot', 'thead'], :endTagTableRowGroup], - ['table', :endTagTable], - [['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'], :endTagIgnore] - ] - - # helper methods - def clearStackToTableBodyContext - until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name) - @parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase.")) - @tree.openElements.pop - end - end - - # the rest - def processCharacters(data) - @parser.phases[:inTable].processCharacters(data) - end - - def startTagTr(name, attributes) - clearStackToTableBodyContext - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inRow] - end - - def startTagTableCell(name, attributes) - @parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase.")) - startTagTr('tr', {}) - @parser.phase.processStartTag(name, attributes) - end - - def startTagTableOther(name, attributes) - # XXX AT Any ideas on how to share this with endTagTable? - if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true) - clearStackToTableBodyContext - endTagTableRowGroup(@tree.openElements[-1].name) - @parser.phase.processStartTag(name, attributes) - else - # innerHTML case - @parser.parseError - end - end - - def startTagOther(name, attributes) - @parser.phases[:inTable].processStartTag(name, attributes) - end - - def endTagTableRowGroup(name) - if in_scope?(name, true) - clearStackToTableBodyContext - @tree.openElements.pop - @parser.phase = @parser.phases[:inTable] - else - @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored.")) - end - end - - def endTagTable(name) - if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true) - clearStackToTableBodyContext - endTagTableRowGroup(@tree.openElements[-1].name) - @parser.phase.processEndTag(name) - else - # innerHTML case - @parser.parseError - end - end - - def endTagIgnore(name) - @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored.")) - end - - def endTagOther(name) - @parser.phases[:inTable].processEndTag(name) - end -end - - -class InRowPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-row - - handle_start [ - ['html', :startTagHtml], - [['td', 'th'], :startTagTableCell], - [['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'], :startTagTableOther] - ] - - handle_end [ - ['tr', :endTagTr], - ['table', :endTagTable], - [['tbody', 'tfoot', 'thead'], :endTagTableRowGroup], - [['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'], :endTagIgnore] - ] - - # helper methods (XXX unify this with other table helper methods) - def clearStackToTableRowContext - until ['tr', 'html'].include?(name = @tree.openElements[-1].name) - @parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase.")) - @tree.openElements.pop - end - end - - def ignoreEndTagTr - not in_scope?('tr', :tableVariant => true) - end - - # the rest - def processCharacters(data) - @parser.phases[:inTable].processCharacters(data) - end - - def startTagTableCell(name, attributes) - clearStackToTableRowContext - @tree.insertElement(name, attributes) - @parser.phase = @parser.phases[:inCell] - @tree.activeFormattingElements.push(Marker) - end - - def startTagTableOther(name, attributes) - ignoreEndTag = ignoreEndTagTr - endTagTr('tr') - # XXX how are we sure it's always ignored in the innerHTML case? - @parser.phase.processStartTag(name, attributes) unless ignoreEndTag - end - - def startTagOther(name, attributes) - @parser.phases[:inTable].processStartTag(name, attributes) - end - - def endTagTr(name) - if ignoreEndTagTr - # innerHTML case - assert @parser.innerHTML - @parser.parseError - else - clearStackToTableRowContext - @tree.openElements.pop - @parser.phase = @parser.phases[:inTableBody] - end - end - - def endTagTable(name) - ignoreEndTag = ignoreEndTagTr - endTagTr('tr') - # Reprocess the current tag if the tr end tag was not ignored - # XXX how are we sure it's always ignored in the innerHTML case? - @parser.phase.processEndTag(name) unless ignoreEndTag - end - - def endTagTableRowGroup(name) - if in_scope?(name, true) - endTagTr('tr') - @parser.phase.processEndTag(name) - else - # innerHTML case - @parser.parseError - end - end - - def endTagIgnore(name) - @parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored.")) - end - - def endTagOther(name) - @parser.phases[:inTable].processEndTag(name) - end -end - -class InCellPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-cell - - handle_start [ - ['html', :startTagHtml], - [['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :startTagTableOther] - ] - - handle_end [ - [['td', 'th'], :endTagTableCell], - [['body', 'caption', 'col', 'colgroup', 'html'], :endTagIgnore], - [['table', 'tbody', 'tfoot', 'thead', 'tr'], :endTagImply] - ] - - # helper - def closeCell - if in_scope?('td', true) - endTagTableCell('td') - elsif in_scope?('th', true) - endTagTableCell('th') - end - end - - # the rest - def processCharacters(data) - @parser.phases[:inBody].processCharacters(data) - end - - def startTagTableOther(name, attributes) - if in_scope?('td', true) or in_scope?('th', true) - closeCell - @parser.phase.processStartTag(name, attributes) - else - # innerHTML case - @parser.parseError - end - end - - def startTagOther(name, attributes) - @parser.phases[:inBody].processStartTag(name, attributes) - end - - def endTagTableCell(name) - if in_scope?(name, true) - @tree.generateImpliedEndTags(name) - if @tree.openElements[-1].name != name - @parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.") - - remove_open_elements_until(name) - else - @tree.openElements.pop - end - @tree.clearActiveFormattingElements - @parser.phase = @parser.phases[:inRow] - else - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - end - - def endTagIgnore(name) - @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) - end - - def endTagImply(name) - if in_scope?(name, true) - closeCell - @parser.phase.processEndTag(name) - else - # sometimes innerHTML case - @parser.parseError - end - end - - def endTagOther(name) - @parser.phases[:inBody].processEndTag(name) - end -end - - -class InSelectPhase < Phase - # http://www.whatwg.org/specs/web-apps/current-work/#in-select - - handle_start [ - ['html', :startTagHtml], - ['option', :startTagOption], - ['optgroup', :startTagOptgroup], - ['select', :startTagSelect] - ] - - handle_end [ - ['option', :endTagOption], - ['optgroup', :endTagOptgroup], - ['select', :endTagSelect], - [['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'], :endTagTableElements] - ] - - def processCharacters(data) - @tree.insertText(data) - end - - def startTagOption(name, attributes) - # We need to imply if