")
+ end
+ end
+ end
+
+ Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name| # generates one test method per allowed attribute name
+ define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do # NOTE(review): attribute_name appears only in the method name as shown -- confirm the fixture strings
+ assert_equal "
foo <bad>bar</bad> baz
",
+ sanitize_html("
foo bar baz
")
+ end
+ end
+
+ Sanitize::ALLOWED_PROTOCOLS.each do |protocol| # generates one test method per allowed URI scheme
+ define_method "test_should_allow_#{protocol}_uris" do # NOTE(review): protocol appears only in the method name as shown -- confirm the fixture strings
+ assert_equal "foo",
+ sanitize_html(%(foo))
+ end
+ end
+
+ Sanitize::ALLOWED_PROTOCOLS.each do |protocol| # generates one "uppercase" test method per allowed URI scheme
+ define_method "test_should_allow_uppercase_#{protocol}_uris" do # NOTE(review): protocol appears only in the method name as shown -- confirm the fixture strings
+ assert_equal "foo",
+ sanitize_html(%(foo))
+ end
+ end
+
+ def test_should_allow_anchors # NOTE(review): the input literal is empty while the expected value is not, as shown -- confirm the fixture strings
+ assert_equal "<script>baz</script>",
+ sanitize_html("")
+ end
+
+ # RFC 3986, sec 4.2: a colon may legitimately occur inside a path, so it must not be mistaken for a scheme separator there
+ def test_allow_colons_in_path_component
+ assert_equal "foo",
+ sanitize_html("foo")
+ end
+
+ %w(src width height alt).each do |img_attr| # generates one test method per listed image attribute
+ define_method "test_should_allow_image_#{img_attr}_attribute" do # NOTE(review): img_attr appears only in the method name as shown -- confirm the fixture strings
+ assert_equal "",
+ sanitize_html("")
+ end
+ end
+
+ def test_should_handle_non_html # input without any markup must be returned unchanged
+ assert_equal 'abc', sanitize_html("abc")
+ end
+
+ def test_should_handle_blank_text # an empty string must sanitize to an empty string
+ assert_equal '', sanitize_html('')
+ end
+
+ [%w(img src), %w(a href)].each do |(tag, attr)| # generates one test method per [tag, URI attribute] pair
+ define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do # the attribute holding the javascript: value is absent from the expected output; title is kept
+ assert_equal %(<#{tag} title="1">boo#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo#{tag}>))
+ end
+ end
+
+ [%w(img src), %w(a href)].each do |(tag, attr)| # generates one test method per [tag, URI attribute] pair
+ define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do # same expectation as the plain variant, but the javascript: value starts with a space
+ assert_equal %(<#{tag} title="1">boo#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo#{tag}>))
+ end
+ end
+
+ [%(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %(),
+ %()].each_with_index do |img_hack, i| # generates one numbered test method per payload in the list above
+ define_method "test_should_not_fall_for_xss_image_hack_#{i}" do # NOTE(review): every payload literal is empty as shown -- confirm the fixture strings
+ assert_equal "", sanitize_html(img_hack)
+ end
+ end
+
+ def test_should_sanitize_tag_broken_up_by_null # NOTE(review): no NUL byte or tag is visible in the input literal as shown -- confirm the fixture strings
+ assert_equal "<scr>alert(\"XSS\")</scr>", sanitize_html(%(alert(\"XSS\")))
+ end
+
+ def test_should_sanitize_invalid_script_tag # NOTE(review): the input literal is empty while the expected value is not, as shown -- confirm the fixture strings
+ assert_equal "<script /></script>", sanitize_html(%())
+ end
+
+ def test_should_sanitize_script_tag_with_multiple_open_brackets # NOTE(review): the second assertion does not look well-formed as shown (unbalanced %( ) literal) -- confirm the fixture strings
+ assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<))
+ assert_equal %(<iframe src="http:" /><), sanitize_html(%(", sanitize_html(%())
+ assert_equal "foo",
+ sanitize_html('foo')
+ assert_equal "",
+ sanitize_html('')
+ end
+
+ def test_img_dynsrc_lowsrc # two assertions, one per attribute in the test name; NOTE(review): all literals are empty as shown -- confirm the fixture strings
+ assert_equal "",
+ sanitize_html(%())
+ assert_equal "",
+ sanitize_html(%())
+ end
+
+ def test_div_background_image_unicode_encoded # NOTE(review): neither literal below shows a div or a style value as displayed -- confirm the fixture strings
+ assert_equal '
foo
',
+ sanitize_html(%(
foo
))
+ end
+
+ def test_div_expression # NOTE(review): neither literal below shows a div or a style value as displayed -- confirm the fixture strings
+ assert_equal '
foo
',
+ sanitize_html(%(
foo
))
+ end
+
+ def test_img_vbscript # NOTE(review): both literals are empty as shown -- confirm the fixture strings
+ assert_equal '',
+ sanitize_html(%())
+ end
+
+end
diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb
index 78d062d4..c870541a 100644
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@@ -32,7 +32,7 @@ module Engines
redcloth.filter_html = false
redcloth.no_span_caps = false
html = redcloth.to_html(:textile)
- sanitize_html(html)
+ sanitize_xhtml(html)
end
end
@@ -43,7 +43,7 @@ module Engines
require_dependency 'maruku'
require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
- sanitize_html(html).to_ncr
+ sanitize_xhtml(html.to_ncr)
end
end
@@ -55,7 +55,7 @@ module Engines
require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
- sanitize_html(html).to_ncr
+ sanitize_xhtml(html.to_ncr)
end
end
@@ -68,7 +68,7 @@ module Engines
redcloth.filter_html = false
redcloth.no_span_caps = false
html = redcloth.to_html
- sanitize_html(html)
+ sanitize_xhtml(html)
end
end
@@ -78,7 +78,7 @@ module Engines
def mask
require_dependency 'rdocsupport'
html = RDocSupport::RDocFormatter.new(@content).to_html
- sanitize_html(html)
+ sanitize_xhtml(html)
end
end
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index dcef7b1e..69f8e3e7 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -1,207 +1,26 @@
module Sanitize
-# This module provides sanitization of XHTML+MathML+SVG
+# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
#
-# Based heavily on Sam Ruby's code in the Universal FeedParser.
-
- require 'html/tokenizer'
- require 'node'
-
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
- 'ul', 'var']
-
- mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
- 'munderover', 'none']
-
- svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
- 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
- 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
- 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
-
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
- 'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
+# Uses the HTML5lib parser, so that the parsing behaviour should
+# resemble that of browsers.
+#
+# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
+# sanitize_html() is a case-insensitive sanitizer suitable for HTML
- mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
- 'xlink:type', 'xmlns', 'xmlns:xlink']
+ require 'html5lib/sanitizer'
+ require 'html5lib/html5parser'
+ require 'html5lib/liberalxmlparser'
+ include HTML5lib
-
- svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
- 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
- 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
- 'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
- 'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
- 'offset', 'opacity', 'orient', 'origin', 'overline-position',
- 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
- 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
- 'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
- 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
- 'strikethrough-position', 'strikethrough-thickness', 'stroke',
- 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
- 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
- 'stroke-width', 'systemLanguage', 'target',
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
- 'underline-position', 'underline-thickness', 'unicode',
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+ def sanitize_xhtml(html) # parses +html+ as a fragment with XHTMLParser and returns the result's to_s
+ XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s # HTMLSanitizer is passed as the :tokenizer option
+ end
- attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
-
- acceptable_css_properties = ['azimuth', 'background-color',
- 'border-bottom-color', 'border-collapse', 'border-color',
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
- 'white-space', 'width']
+ def sanitize_html(html) # parses +html+ as a fragment with HTMLParser and returns the result's to_s
+ HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s # HTMLSanitizer is passed as the :tokenizer option
+ end
- acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
- 'transparent', 'underline', 'white', 'yellow']
-
- acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
- 'stroke-opacity']
-
- acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
- 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
- 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
- 'ssh', 'sftp', 'rtsp', 'afs' ]
-
- ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
- ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
- ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
- ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
- ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
- ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
- ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
-
- # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
- # attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
- # specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
- # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
- # ALLOWED_PROTOCOLS are allowed.
- # You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
- #
- # sanitize_html('')
- # => <script> do_nasty_stuff() </script>
- # sanitize_html('Click here for $100')
- # => Click here for $100
- def sanitize_html(html)
- if html.index("<")
- tokenizer = HTML::Tokenizer.new(html)
- new_text = ""
-
- while token = tokenizer.next
- node = XHTML::Node.parse(nil, 0, 0, token, false)
- new_text << case node.tag?
- when true
- if ALLOWED_ELEMENTS.include?(node.name)
- if node.closing != :close
- node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
- ATTR_VAL_IS_URI.each do |attr|
- val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
- node.attributes.delete attr
- end
- end
- if node.attributes['style']
- node.attributes['style'] = sanitize_css(node.attributes['style'])
- end
- end
- node.to_s
- else
- node.to_s.gsub(/, "<")
- end
- else
- node.to_s.gsub(/, "<")
- end
- end
-
- html = new_text
- end
- html
- end
-
- def sanitize_css(style)
- # disallow urls
- style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
- # gauntlet
- if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
- style = ''
- return style
- end
- if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
- style = ''
- return style
- end
-
- clean = []
- style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
- if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
- clean << prop + ': ' + val + ';'
- elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
- goodval = true
- val.split().each do |keyword|
- if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
- keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
- goodval = false
- end
- end
- if goodval
- clean << prop + ': ' + val + ';'
- end
- elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
- clean << prop + ': ' + val + ';'
- end
- end
-
- style = clean.join(' ')
- end
-end
+end
diff --git a/vendor/plugins/HTML5lib/README b/vendor/plugins/HTML5lib/README
new file mode 100644
index 00000000..c9b3304d
--- /dev/null
+++ b/vendor/plugins/HTML5lib/README
@@ -0,0 +1,9 @@
+= HTML5lib
+
+== Basic Usage
+
+ require 'html5lib'
+
+ doc = HTML5lib.parse('...')
+
+ doc.class # REXML::Document
\ No newline at end of file
diff --git a/vendor/plugins/HTML5lib/Rakefile.rb b/vendor/plugins/HTML5lib/Rakefile.rb
new file mode 100644
index 00000000..36c4692b
--- /dev/null
+++ b/vendor/plugins/HTML5lib/Rakefile.rb
@@ -0,0 +1,7 @@
+require 'rake'
+require 'rake/testtask'
+
+Rake::TestTask.new do |task| # defines the Rake task that runs the test files
+ task.pattern = 'tests/test_*.rb' # glob selecting the test files to run
+ task.verbose = true
+end
diff --git a/vendor/plugins/HTML5lib/lib/html5lib.rb b/vendor/plugins/HTML5lib/lib/html5lib.rb
new file mode 100644
index 00000000..b4aba9a9
--- /dev/null
+++ b/vendor/plugins/HTML5lib/lib/html5lib.rb
@@ -0,0 +1,11 @@
+require 'html5lib/html5parser'
+
+module HTML5lib
+ def self.parse(stream, options={})
+ HTMLParser.parse(stream, options)
+ end
+
+ def self.parseFragment(stream, options={})
+ HTMLParser.parse(stream, options)
+ end
+end
diff --git a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
new file mode 100755
index 00000000..67bc2f96
--- /dev/null
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@@ -0,0 +1,676 @@
+module HTML5lib
+
+class EOF < Exception; end # NOTE: derives from Exception, not StandardError, so a bare `rescue` does not catch it -- rescue EOF explicitly
+
+CONTENT_MODEL_FLAGS = [
+ :PCDATA,
+ :RCDATA,
+ :CDATA,
+ :PLAINTEXT
+]
+
+SCOPING_ELEMENTS = %w[
+ button
+ caption
+ html
+ marquee
+ object
+ table
+ td
+ th
+]
+
+FORMATTING_ELEMENTS = %w[
+ a
+ b
+ big
+ em
+ font
+ i
+ nobr
+ s
+ small
+ strike
+ strong
+ tt
+ u
+]
+
+SPECIAL_ELEMENTS = %w[
+ address
+ area
+ base
+ basefont
+ bgsound
+ blockquote
+ body
+ br
+ center
+ col
+ colgroup
+ dd
+ dir
+ div
+ dl
+ dt
+ embed
+ fieldset
+ form
+ frame
+ frameset
+ h1
+ h2
+ h3
+ h4
+ h5
+ h6
+ head
+ hr
+ iframe
+ image
+ img
+ input
+ isindex
+ li
+ link
+ listing
+ menu
+ meta
+ noembed
+ noframes
+ noscript
+ ol
+ optgroup
+ option
+ p
+ param
+ plaintext
+ pre
+ script
+ select
+ spacer
+ style
+ tbody
+ textarea
+ tfoot
+ thead
+ title
+ tr
+ ul
+ wbr
+]
+
+SPACE_CHARACTERS = %W[
+ \t
+ \n
+ \x0B
+ \x0C
+ \x20
+ \r
+] # %W (upper-case W) processes the escapes, so each entry is the actual one-character whitespace String
+
+TABLE_INSERT_MODE_ELEMENTS = %w[
+ table
+ tbody
+ tfoot
+ thead
+ tr
+]
+
+ASCII_LOWERCASE = ('a'..'z').to_a.join('') # a single String holding a through z
+ASCII_UPPERCASE = ('A'..'Z').to_a.join('') # a single String holding A through Z
+ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE # String: lower-case letters followed by upper-case letters
+DIGITS = '0'..'9' # a Range of one-character Strings, not a String like the constants above
+HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a # an Array of one-character Strings
+
+# Heading elements need to be ordered
+HEADING_ELEMENTS = %w[
+ h1
+ h2
+ h3
+ h4
+ h5
+ h6
+]
+
+# XXX What about event-source and command?
+VOID_ELEMENTS = %w[
+ base
+ link
+ meta
+ hr
+ br
+ img
+ embed
+ param
+ area
+ col
+ input
+]
+
+# ENTITIES_WINDOWS1252 has to stay _ordered_: entry i is the code point for Windows-1252 byte 0x80 + i (65533 marks an undefined byte).
+ENTITIES_WINDOWS1252 = [
+ 8364, # 0x80 0x20AC EURO SIGN
+ 65533, # 0x81 UNDEFINED
+ 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
+ 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
+ 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
+ 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
+ 8224, # 0x86 0x2020 DAGGER
+ 8225, # 0x87 0x2021 DOUBLE DAGGER
+ 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
+ 8240, # 0x89 0x2030 PER MILLE SIGN
+ 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
+ 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
+ 65533, # 0x8D UNDEFINED
+ 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
+ 65533, # 0x8F UNDEFINED
+ 65533, # 0x90 UNDEFINED
+ 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
+ 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
+ 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
+ 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
+ 8226, # 0x95 0x2022 BULLET
+ 8211, # 0x96 0x2013 EN DASH
+ 8212, # 0x97 0x2014 EM DASH
+ 732, # 0x98 0x02DC SMALL TILDE
+ 8482, # 0x99 0x2122 TRADE MARK SIGN
+ 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
+ 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
+ 65533, # 0x9D UNDEFINED
+ 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
+ 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
+]
+
+private # NOTE(review): a bare `private` does not change the visibility of `def self.` methods, so U below stays publicly callable
+
+ def self.U n # converts the code point n (Integer) to a String via Array#pack('U'), i.e. its UTF-8 encoding
+ [n].pack('U')
+ end
+
+public
+
+ENTITIES = {
+ "AElig" => U(0xC6),
+ "Aacute" => U(0xC1),
+ "Acirc" => U(0xC2),
+ "Agrave" => U(0xC0),
+ "Alpha" => U(0x0391),
+ "Aring" => U(0xC5),
+ "Atilde" => U(0xC3),
+ "Auml" => U(0xC4),
+ "Beta" => U(0x0392),
+ "Ccedil" => U(0xC7),
+ "Chi" => U(0x03A7),
+ "Dagger" => U(0x2021),
+ "Delta" => U(0x0394),
+ "ETH" => U(0xD0),
+ "Eacute" => U(0xC9),
+ "Ecirc" => U(0xCA),
+ "Egrave" => U(0xC8),
+ "Epsilon" => U(0x0395),
+ "Eta" => U(0x0397),
+ "Euml" => U(0xCB),
+ "Gamma" => U(0x0393),
+ "Iacute" => U(0xCD),
+ "Icirc" => U(0xCE),
+ "Igrave" => U(0xCC),
+ "Iota" => U(0x0399),
+ "Iuml" => U(0xCF),
+ "Kappa" => U(0x039A),
+ "Lambda" => U(0x039B),
+ "Mu" => U(0x039C),
+ "Ntilde" => U(0xD1),
+ "Nu" => U(0x039D),
+ "OElig" => U(0x0152),
+ "Oacute" => U(0xD3),
+ "Ocirc" => U(0xD4),
+ "Ograve" => U(0xD2),
+ "Omega" => U(0x03A9),
+ "Omicron" => U(0x039F),
+ "Oslash" => U(0xD8),
+ "Otilde" => U(0xD5),
+ "Ouml" => U(0xD6),
+ "Phi" => U(0x03A6),
+ "Pi" => U(0x03A0),
+ "Prime" => U(0x2033),
+ "Psi" => U(0x03A8),
+ "Rho" => U(0x03A1),
+ "Scaron" => U(0x0160),
+ "Sigma" => U(0x03A3),
+ "THORN" => U(0xDE),
+ "Tau" => U(0x03A4),
+ "Theta" => U(0x0398),
+ "Uacute" => U(0xDA),
+ "Ucirc" => U(0xDB),
+ "Ugrave" => U(0xD9),
+ "Upsilon" => U(0x03A5),
+ "Uuml" => U(0xDC),
+ "Xi" => U(0x039E),
+ "Yacute" => U(0xDD),
+ "Yuml" => U(0x0178),
+ "Zeta" => U(0x0396),
+ "aacute" => U(0xE1),
+ "acirc" => U(0xE2),
+ "acute" => U(0xB4),
+ "aelig" => U(0xE6),
+ "agrave" => U(0xE0),
+ "alefsym" => U(0x2135),
+ "alpha" => U(0x03B1),
+ "amp" => U(0x26),
+ "AMP" => U(0x26),
+ "and" => U(0x2227),
+ "ang" => U(0x2220),
+ "apos" => U(0x27),
+ "aring" => U(0xE5),
+ "asymp" => U(0x2248),
+ "atilde" => U(0xE3),
+ "auml" => U(0xE4),
+ "bdquo" => U(0x201E),
+ "beta" => U(0x03B2),
+ "brvbar" => U(0xA6),
+ "bull" => U(0x2022),
+ "cap" => U(0x2229),
+ "ccedil" => U(0xE7),
+ "cedil" => U(0xB8),
+ "cent" => U(0xA2),
+ "chi" => U(0x03C7),
+ "circ" => U(0x02C6),
+ "clubs" => U(0x2663),
+ "cong" => U(0x2245),
+ "copy" => U(0xA9),
+ "COPY" => U(0xA9),
+ "crarr" => U(0x21B5),
+ "cup" => U(0x222A),
+ "curren" => U(0xA4),
+ "dArr" => U(0x21D3),
+ "dagger" => U(0x2020),
+ "darr" => U(0x2193),
+ "deg" => U(0xB0),
+ "delta" => U(0x03B4),
+ "diams" => U(0x2666),
+ "divide" => U(0xF7),
+ "eacute" => U(0xE9),
+ "ecirc" => U(0xEA),
+ "egrave" => U(0xE8),
+ "empty" => U(0x2205),
+ "emsp" => U(0x2003),
+ "ensp" => U(0x2002),
+ "epsilon" => U(0x03B5),
+ "equiv" => U(0x2261),
+ "eta" => U(0x03B7),
+ "eth" => U(0xF0),
+ "euml" => U(0xEB),
+ "euro" => U(0x20AC),
+ "exist" => U(0x2203),
+ "fnof" => U(0x0192),
+ "forall" => U(0x2200),
+ "frac12" => U(0xBD),
+ "frac14" => U(0xBC),
+ "frac34" => U(0xBE),
+ "frasl" => U(0x2044),
+ "gamma" => U(0x03B3),
+ "ge" => U(0x2265),
+ "gt" => U(0x3E),
+ "GT" => U(0x3E),
+ "hArr" => U(0x21D4),
+ "harr" => U(0x2194),
+ "hearts" => U(0x2665),
+ "hellip" => U(0x2026),
+ "iacute" => U(0xED),
+ "icirc" => U(0xEE),
+ "iexcl" => U(0xA1),
+ "igrave" => U(0xEC),
+ "image" => U(0x2111),
+ "infin" => U(0x221E),
+ "int" => U(0x222B),
+ "iota" => U(0x03B9),
+ "iquest" => U(0xBF),
+ "isin" => U(0x2208),
+ "iuml" => U(0xEF),
+ "kappa" => U(0x03BA),
+ "lArr" => U(0x21D0),
+ "lambda" => U(0x03BB),
+ "lang" => U(0x2329),
+ "laquo" => U(0xAB),
+ "larr" => U(0x2190),
+ "lceil" => U(0x2308),
+ "ldquo" => U(0x201C),
+ "le" => U(0x2264),
+ "lfloor" => U(0x230A),
+ "lowast" => U(0x2217),
+ "loz" => U(0x25CA),
+ "lrm" => U(0x200E),
+ "lsaquo" => U(0x2039),
+ "lsquo" => U(0x2018),
+ "lt" => U(0x3C),
+ "LT" => U(0x3C),
+ "macr" => U(0xAF),
+ "mdash" => U(0x2014),
+ "micro" => U(0xB5),
+ "middot" => U(0xB7),
+ "minus" => U(0x2212),
+ "mu" => U(0x03BC),
+ "nabla" => U(0x2207),
+ "nbsp" => U(0xA0),
+ "ndash" => U(0x2013),
+ "ne" => U(0x2260),
+ "ni" => U(0x220B),
+ "not" => U(0xAC),
+ "notin" => U(0x2209),
+ "nsub" => U(0x2284),
+ "ntilde" => U(0xF1),
+ "nu" => U(0x03BD),
+ "oacute" => U(0xF3),
+ "ocirc" => U(0xF4),
+ "oelig" => U(0x0153),
+ "ograve" => U(0xF2),
+ "oline" => U(0x203E),
+ "omega" => U(0x03C9),
+ "omicron" => U(0x03BF),
+ "oplus" => U(0x2295),
+ "or" => U(0x2228),
+ "ordf" => U(0xAA),
+ "ordm" => U(0xBA),
+ "oslash" => U(0xF8),
+ "otilde" => U(0xF5),
+ "otimes" => U(0x2297),
+ "ouml" => U(0xF6),
+ "para" => U(0xB6),
+ "part" => U(0x2202),
+ "permil" => U(0x2030),
+ "perp" => U(0x22A5),
+ "phi" => U(0x03C6),
+ "pi" => U(0x03C0),
+ "piv" => U(0x03D6),
+ "plusmn" => U(0xB1),
+ "pound" => U(0xA3),
+ "prime" => U(0x2032),
+ "prod" => U(0x220F),
+ "prop" => U(0x221D),
+ "psi" => U(0x03C8),
+ "quot" => U(0x22),
+ "QUOT" => U(0x22),
+ "rArr" => U(0x21D2),
+ "radic" => U(0x221A),
+ "rang" => U(0x232A),
+ "raquo" => U(0xBB),
+ "rarr" => U(0x2192),
+ "rceil" => U(0x2309),
+ "rdquo" => U(0x201D),
+ "real" => U(0x211C),
+ "reg" => U(0xAE),
+ "REG" => U(0xAE),
+ "rfloor" => U(0x230B),
+ "rho" => U(0x03C1),
+ "rlm" => U(0x200F),
+ "rsaquo" => U(0x203A),
+ "rsquo" => U(0x2019),
+ "sbquo" => U(0x201A),
+ "scaron" => U(0x0161),
+ "sdot" => U(0x22C5),
+ "sect" => U(0xA7),
+ "shy" => U(0xAD),
+ "sigma" => U(0x03C3),
+ "sigmaf" => U(0x03C2),
+ "sim" => U(0x223C),
+ "spades" => U(0x2660),
+ "sub" => U(0x2282),
+ "sube" => U(0x2286),
+ "sum" => U(0x2211),
+ "sup" => U(0x2283),
+ "sup1" => U(0xB9),
+ "sup2" => U(0xB2),
+ "sup3" => U(0xB3),
+ "supe" => U(0x2287),
+ "szlig" => U(0xDF),
+ "tau" => U(0x03C4),
+ "there4" => U(0x2234),
+ "theta" => U(0x03B8),
+ "thetasym" => U(0x03D1),
+ "thinsp" => U(0x2009),
+ "thorn" => U(0xFE),
+ "tilde" => U(0x02DC),
+ "times" => U(0xD7),
+ "trade" => U(0x2122),
+ "uArr" => U(0x21D1),
+ "uacute" => U(0xFA),
+ "uarr" => U(0x2191),
+ "ucirc" => U(0xFB),
+ "ugrave" => U(0xF9),
+ "uml" => U(0xA8),
+ "upsih" => U(0x03D2),
+ "upsilon" => U(0x03C5),
+ "uuml" => U(0xFC),
+ "weierp" => U(0x2118),
+ "xi" => U(0x03BE),
+ "yacute" => U(0xFD),
+ "yen" => U(0xA5),
+ "yuml" => U(0xFF),
+ "zeta" => U(0x03B6),
+ "zwj" => U(0x200D),
+ "zwnj" => U(0x200C)
+}
+
+ENCODINGS = %w[
+ ansi_x3.4-1968
+ iso-ir-6
+ ansi_x3.4-1986
+ iso_646.irv:1991
+ ascii
+ iso646-us
+ us-ascii
+ us
+ ibm367
+ cp367
+ csascii
+ ks_c_5601-1987
+ korean
+ iso-2022-kr
+ csiso2022kr
+ euc-kr
+ iso-2022-jp
+ csiso2022jp
+ iso-2022-jp-2
+ iso-ir-58
+ chinese
+ csiso58gb231280
+ iso_8859-1:1987
+ iso-ir-100
+ iso_8859-1
+ iso-8859-1
+ latin1
+ l1
+ ibm819
+ cp819
+ csisolatin1
+ iso_8859-2:1987
+ iso-ir-101
+ iso_8859-2
+ iso-8859-2
+ latin2
+ l2
+ csisolatin2
+ iso_8859-3:1988
+ iso-ir-109
+ iso_8859-3
+ iso-8859-3
+ latin3
+ l3
+ csisolatin3
+ iso_8859-4:1988
+ iso-ir-110
+ iso_8859-4
+ iso-8859-4
+ latin4
+ l4
+ csisolatin4
+ iso_8859-6:1987
+ iso-ir-127
+ iso_8859-6
+ iso-8859-6
+ ecma-114
+ asmo-708
+ arabic
+ csisolatinarabic
+ iso_8859-7:1987
+ iso-ir-126
+ iso_8859-7
+ iso-8859-7
+ elot_928
+ ecma-118
+ greek
+ greek8
+ csisolatingreek
+ iso_8859-8:1988
+ iso-ir-138
+ iso_8859-8
+ iso-8859-8
+ hebrew
+ csisolatinhebrew
+ iso_8859-5:1988
+ iso-ir-144
+ iso_8859-5
+ iso-8859-5
+ cyrillic
+ csisolatincyrillic
+ iso_8859-9:1989
+ iso-ir-148
+ iso_8859-9
+ iso-8859-9
+ latin5
+ l5
+ csisolatin5
+ iso-8859-10
+ iso-ir-157
+ l6
+ iso_8859-10:1992
+ csisolatin6
+ latin6
+ hp-roman8
+ roman8
+ r8
+ ibm037
+ cp037
+ csibm037
+ ibm424
+ cp424
+ csibm424
+ ibm437
+ cp437
+ 437
+ cspc8codepage437
+ ibm500
+ cp500
+ csibm500
+ ibm775
+ cp775
+ cspc775baltic
+ ibm850
+ cp850
+ 850
+ cspc850multilingual
+ ibm852
+ cp852
+ 852
+ cspcp852
+ ibm855
+ cp855
+ 855
+ csibm855
+ ibm857
+ cp857
+ 857
+ csibm857
+ ibm860
+ cp860
+ 860
+ csibm860
+ ibm861
+ cp861
+ 861
+ cp-is
+ csibm861
+ ibm862
+ cp862
+ 862
+ cspc862latinhebrew
+ ibm863
+ cp863
+ 863
+ csibm863
+ ibm864
+ cp864
+ csibm864
+ ibm865
+ cp865
+ 865
+ csibm865
+ ibm866
+ cp866
+ 866
+ csibm866
+ ibm869
+ cp869
+ 869
+ cp-gr
+ csibm869
+ ibm1026
+ cp1026
+ csibm1026
+ koi8-r
+ cskoi8r
+ koi8-u
+ big5-hkscs
+ ptcp154
+ csptcp154
+ pt154
+ cp154
+ utf-7
+ utf-16be
+ utf-16le
+ utf-16
+ utf-8
+ iso-8859-13
+ iso-8859-14
+ iso-ir-199
+ iso_8859-14:1998
+ iso_8859-14
+ latin8
+ iso-celtic
+ l8
+ iso-8859-15
+ iso_8859-15
+ iso-8859-16
+ iso-ir-226
+ iso_8859-16:2001
+ iso_8859-16
+ latin10
+ l10
+ gbk
+ cp936
+ ms936
+ gb18030
+ shift_jis
+ ms_kanji
+ csshiftjis
+ euc-jp
+ gb2312
+ big5
+ csbig5
+ windows-1250
+ windows-1251
+ windows-1252
+ windows-1253
+ windows-1254
+ windows-1255
+ windows-1256
+ windows-1257
+ windows-1258
+ tis-620
+ hz-gb-2312
+]
+
+end
diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
new file mode 100644
index 00000000..abbb89a6
--- /dev/null
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
@@ -0,0 +1,2020 @@
+require 'html5lib/constants'
+require 'html5lib/tokenizer'
+require 'html5lib/treebuilders/rexml'
+
+module HTML5lib
+
+# HTML parser. Generates a tree structure from a stream of (possibly
+# malformed) HTML
+class HTMLParser
+
+ attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
+
+ attr_reader :phases, :tokenizer, :tree, :errors
+
+ # convenience methods
+ def self.parse(stream, options = {})
+ encoding = options.delete(:encoding)
+ new(options).parse(stream,encoding)
+ end
+
+ def self.parseFragment(stream, options = {})
+ container = options.delete(:container) || 'div'
+ encoding = options.delete(:encoding)
+ new(options).parseFragment(stream,container,encoding)
+ end
+
+ @@phases = [
+ :initial,
+ :rootElement,
+ :beforeHead,
+ :inHead,
+ :afterHead,
+ :inBody,
+ :inTable,
+ :inCaption,
+ :inColumnGroup,
+ :inTableBody,
+ :inRow,
+ :inCell,
+ :inSelect,
+ :afterBody,
+ :inFrameset,
+ :afterFrameset,
+ :trailingEnd
+ ]
+
+ # :strict - raise an exception when a parse error is encountered
+ # :tree - a treebuilder class controlling the type of tree that will be
+ # returned. Built in treebuilders can be accessed through
+ # html5lib.treebuilders.getTreeBuilder(treeType)
+ def initialize(options = {})
+ @strict = false
+ @errors = []
+
+ @tokenizer = HTMLTokenizer
+ @tree = TreeBuilders::REXMLTree::TreeBuilder
+
+ options.each { |name, value| instance_variable_set("@#{name}", value) }
+
+ @tree = @tree.new
+
+ @phases = @@phases.inject({}) do |phases, symbol|
+ class_name = symbol.to_s.sub(/(.)/) { $1.upcase } + 'Phase'
+ phases[symbol] = HTML5lib.const_get(class_name).new(self, @tree)
+ phases
+ end
+ end
+
+ def _parse(stream, innerHTML, encoding, container = 'div')
+ @tree.reset
+ @firstStartTag = false
+ @errors = []
+
+ @tokenizer = @tokenizer.class unless Class === @tokenizer
+ @tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML)
+
+ if innerHTML
+ case @innerHTML = container.downcase
+ when 'title', 'textarea'
+ @tokenizer.contentModelFlag = :RCDATA
+ when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
+ @tokenizer.contentModelFlag = :CDATA
+ when 'plaintext'
+ @tokenizer.contentModelFlag = :PLAINTEXT
+ else
+ # contentModelFlag already is PCDATA
+ #@tokenizer.contentModelFlag = :PCDATA
+ end
+
+ @phase = @phases[:rootElement]
+ @phase.insertHtmlElement
+ resetInsertionMode
+ else
+ @innerHTML = false
+ @phase = @phases[:initial]
+ end
+
+ # We only seem to have InBodyPhase testcases where the following is
+ # relevant ... need others too
+ @lastPhase = nil
+
+ # XXX This is temporary for the moment so there isn't any other
+ # changes needed for the parser to work with the iterable tokenizer
+ @tokenizer.each do |token|
+ token = normalizeToken(token)
+
+ method = 'process%s' % token[:type]
+
+ case token[:type]
+ when :Characters, :SpaceCharacters, :Comment
+ @phase.send method, token[:data]
+ when :StartTag, :Doctype
+ @phase.send method, token[:name], token[:data]
+ when :EndTag
+ @phase.send method, token[:name]
+ else
+ parseError(token[:data])
+ end
+ end
+
+ # When the loop finishes it's EOF
+ @phase.processEOF
+ end
+
+ # Parse a HTML document into a well-formed tree
+ #
+ # stream - a filelike object or string containing the HTML to be parsed
+ #
+ # The optional encoding parameter must be a string that indicates
+ # the encoding. If specified, that encoding will be used,
+ # regardless of any BOM or later declaration (such as in a meta
+ # element)
+ def parse(stream, encoding = nil)
+ _parse(stream, false, encoding)
+ return @tree.getDocument
+ end
+
+ # Parse a HTML fragment into a well-formed tree fragment
+
+ # container - name of the element we're setting the innerHTML property
+ # if set to nil, default to 'div'
+ #
+ # stream - a filelike object or string containing the HTML to be parsed
+ #
+ # The optional encoding parameter must be a string that indicates
+ # the encoding. If specified, that encoding will be used,
+ # regardless of any BOM or later declaration (such as in a meta
+ # element)
+ def parseFragment(stream, container = 'div', encoding = nil)
+ _parse(stream, true, encoding, container)
+ return @tree.getFragment
+ end
+
+ def parseError(data = 'XXX ERROR MESSAGE NEEDED')
+ # XXX The idea is to make data mandatory.
+ @errors.push([@tokenizer.stream.position, data])
+ raise ParseError if @strict
+ end
+
+ # This error is not an error
+ def atheistParseError
+ end
+
+ # HTML5 specific normalizations to the token stream
+ def normalizeToken(token)
+
+ if token[:type] == :EmptyTag
+ # When a solidus (/) is encountered within a tag name what happens
+ # depends on whether the current tag name matches that of a void
+ # element. If it matches a void element atheists did the wrong
+ # thing and if it doesn't it's wrong for everyone.
+
+ if VOID_ELEMENTS.include?(token[:name])
+ atheistParseError
+ else
+ parseError(_('Solidus (/) incorrectly placed in tag.'))
+ end
+
+ token[:type] = :StartTag
+ end
+
+ if token[:type] == :StartTag
+ token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
+
+ # We need to remove the duplicate attributes and convert attributes
+ # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+
+ if token[:data].length
+ token[:data] = Hash[*token[:data].reverse.map {|attr,value|
+ [attr.tr(ASCII_UPPERCASE,ASCII_LOWERCASE),value]
+ }.flatten]
+ else
+ token[:data] = {}
+ end
+
+ elsif token[:type] == :EndTag
+ parseError(_('End tag contains unexpected attributes.')) if token[:data]
+ token[:name] = token[:name].downcase
+ end
+
+ return token
+ end
+
+ @@new_modes = {
+ 'select' => :inSelect,
+ 'td' => :inCell,
+ 'th' => :inCell,
+ 'tr' => :inRow,
+ 'tbody' => :inTableBody,
+ 'thead' => :inTableBody,
+ 'tfoot' => :inTableBody,
+ 'caption' => :inCaption,
+ 'colgroup' => :inColumnGroup,
+ 'table' => :inTable,
+ 'head' => :inBody,
+ 'body' => :inBody,
+ 'frameset' => :inFrameset
+ }
+
+ def resetInsertionMode
+ # The name of this method is mostly historical. (It's also used in the
+ # specification.)
+ last = false
+
+ @tree.openElements.reverse.each do |node|
+ nodeName = node.name
+
+ if node == @tree.openElements[0]
+ last = true
+ unless ['td', 'th'].include?(nodeName)
+ # XXX
+ # assert @innerHTML
+ nodeName = @innerHTML
+ end
+ end
+
+ # Check for conditions that should only happen in the innerHTML
+ # case
+ if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName)
+ # XXX
+ # assert @innerHTML
+ end
+
+ if @@new_modes.has_key?(nodeName)
+ @phase = @phases[@@new_modes[nodeName]]
+ elsif nodeName == 'html'
+ @phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead]
+ elsif last
+ @phase = @phases[:inBody]
+ else
+ next
+ end
+
+ break
+ end
+ end
+
+ def _(string); string; end
+end
+
+# Base class for helper object that implements each phase of processing
+class Phase
+ # Order should be (they can be omitted)
+ # * EOF
+ # * Comment
+ # * Doctype
+ # * SpaceCharacters
+ # * Characters
+ # * StartTag
+ # - startTag* methods
+ # * EndTag
+ # - endTag* methods
+
+ def self.tag_handler_map(default,array)
+ array.inject(Hash.new(default)) do |map, (names, value)|
+ names = [names] unless Array === names
+ names.each { |name| map[name] = value }
+ map
+ end
+ end
+
+ def self.start_tag_handlers
+ @start_tag_handlers
+ end
+
+ def self.handle_start(tags)
+ @start_tag_handlers = tag_handler_map(:startTagOther, tags)
+ end
+
+ def self.end_tag_handlers
+ @end_tag_handlers
+ end
+
+ def self.handle_end(tags)
+ @end_tag_handlers = tag_handler_map(:endTagOther, tags)
+ end
+
+ def initialize(parser, tree)
+ @parser = parser
+ @tree = tree
+ end
+
+ def processEOF
+ @tree.generateImpliedEndTags
+
+ if @tree.openElements.length > 2
+ @parser.parseError(_('Unexpected end of file. Missing closing tags.'))
+ elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
+ # This happens for framesets or something?
+ @parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
+ elsif @parser.innerHTML and @tree.openElements.length > 1
+ # XXX This is not what the specification says. Not sure what to do here.
+ @parser.parseError(_('XXX innerHTML EOF'))
+ end
+ # Betting ends.
+ end
+
+ def processComment(data)
+ # For most phases the following is correct. Where it's not it will be
+ # overridden.
+ @tree.insertComment(data, @tree.openElements[-1])
+ end
+
+ def processDoctype(name, error)
+ @parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
+ end
+
+ def processSpaceCharacters(data)
+ @tree.insertText(data)
+ end
+
+ def processStartTag(name, attributes)
+ send self.class.start_tag_handlers[name], name, attributes
+ end
+
+ def startTagHtml(name, attributes)
+ if @parser.firstStartTag == false and name == 'html'
+ @parser.parseError(_('html needs to be the first start tag.'))
+ end
+ # XXX Need a check here to see if the first start tag token emitted is
+ # this token... If it's not, invoke @parser.parseError.
+ attributes.each do |attr, value|
+ unless @tree.openElements[0].attributes.has_key?(attr)
+ @tree.openElements[0].attributes[attr] = value
+ end
+ end
+ @parser.firstStartTag = false
+ end
+
+ def processEndTag(name)
+ send self.class.end_tag_handlers[name], name
+ end
+
+ def _(string)
+ string
+ end
+
+ def assert(value)
+ throw AssertionError.new unless value
+ end
+
+ def in_scope?(*args)
+ @tree.elementInScope(*args)
+ end
+
+ def remove_open_elements_until(name = nil)
+ finished = false
+ until finished
+ element = @tree.openElements.pop
+ finished = name.nil?? yield(element) : element.name == name
+ end
+ return element
+ end
+
+end
+
+
+class InitialPhase < Phase
+ # This phase deals with error handling as well which is currently not
+ # covered in the specification. The error handling is typically known as
+ # "quirks mode". It is expected that a future version of HTML5 will defin
+ # this.
+ def processEOF
+ @parser.parseError(_('Unexpected End of file. Expected DOCTYPE.'))
+ @parser.phase = @parser.phases[:rootElement]
+ @parser.phase.processEOF
+ end
+
+ def processComment(data)
+ @tree.insertComment(data, @tree.document)
+ end
+
+ def processDoctype(name, error)
+ @parser.parseError(_('Erroneous DOCTYPE.')) if error
+ @tree.insertDoctype(name)
+ @parser.phase = @parser.phases[:rootElement]
+ end
+
+ def processSpaceCharacters(data)
+ @tree.insertText(data, @tree.document)
+ end
+
+ def processCharacters(data)
+ @parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.'))
+ @parser.phase = @parser.phases[:rootElement]
+ @parser.phase.processCharacters(data)
+ end
+
+ def processStartTag(name, attributes)
+ @parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
+ @parser.phase = @parser.phases[:rootElement]
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def processEndTag(name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
+ @parser.phase = @parser.phases[:rootElement]
+ @parser.phase.processEndTag(name)
+ end
+end
+
+
+class RootElementPhase < Phase
+ # helper methods
+ def insertHtmlElement
+ element = @tree.createElement('html', {})
+ @tree.openElements.push(element)
+ @tree.document.appendChild(element)
+ @parser.phase = @parser.phases[:beforeHead]
+ end
+
+ # other
+ def processEOF
+ insertHtmlElement
+ @parser.phase.processEOF
+ end
+
+ def processComment(data)
+ @tree.insertComment(data, @tree.document)
+ end
+
+ def processSpaceCharacters(data)
+ @tree.insertText(data, @tree.document)
+ end
+
+ def processCharacters(data)
+ insertHtmlElement
+ @parser.phase.processCharacters(data)
+ end
+
+ def processStartTag(name, attributes)
+ @parser.firstStartTag = true if name == 'html'
+ insertHtmlElement
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def processEndTag(name)
+ insertHtmlElement
+ @parser.phase.processEndTag(name)
+ end
+end
+
+
+class BeforeHeadPhase < Phase
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['head', :startTagHead]
+ ]
+
+ handle_end [
+ ['html', :endTagHtml]
+ ]
+
+ def processEOF
+ startTagHead('head', {})
+ @parser.phase.processEOF
+ end
+
+ def processCharacters(data)
+ startTagHead('head', {})
+ @parser.phase.processCharacters(data)
+ end
+
+ def startTagHead(name, attributes)
+ @tree.insertElement(name, attributes)
+ @tree.headPointer = @tree.openElements[-1]
+ @parser.phase = @parser.phases[:inHead]
+ end
+
+ def startTagOther(name, attributes)
+ startTagHead('head', {})
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def endTagHtml(name)
+ startTagHead('head', {})
+ @parser.phase.processEndTag(name)
+ end
+
+ def endTagOther(name)
+ @parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element."))
+ end
+end
+
+class InHeadPhase < Phase
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['title', :startTagTitle],
+ ['style', :startTagStyle],
+ ['script', :startTagScript],
+ [['base', 'link', 'meta'], :startTagBaseLinkMeta],
+ ['head', :startTagHead]
+ ]
+
+ handle_end [
+ ['head', :endTagHead],
+ ['html', :endTagHtml],
+ [['title', 'style', 'script'], :endTagTitleStyleScript]
+ ]
+
+ # helper
+ def appendToHead(element)
+ if @tree.headPointer.nil?
+ assert @parser.innerHTML
+ @tree.openElements[-1].appendChild(element)
+ else
+ @tree.headPointer.appendChild(element)
+ end
+ end
+
+ # the real thing
+ def processEOF
+ if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
+ @parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
+ @tree.openElements.pop
+ end
+ anythingElse
+ @parser.phase.processEOF
+ end
+
+ def processCharacters(data)
+ if ['title', 'style', 'script'].include?(@tree.openElements[-1].name)
+ @tree.insertText(data)
+ else
+ anythingElse
+ @parser.phase.processCharacters(data)
+ end
+ end
+
+ def startTagHead(name, attributes)
+ @parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
+ end
+
+ def startTagTitle(name, attributes)
+ element = @tree.createElement(name, attributes)
+ appendToHead(element)
+ @tree.openElements.push(element)
+ @parser.tokenizer.contentModelFlag = :RCDATA
+ end
+
+ def startTagStyle(name, attributes)
+ element = @tree.createElement(name, attributes)
+ if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
+ appendToHead(element)
+ else
+ @tree.openElements[-1].appendChild(element)
+ end
+ @tree.openElements.push(element)
+ @parser.tokenizer.contentModelFlag = :CDATA
+ end
+
+ def startTagScript(name, attributes)
+ #XXX Inner HTML case may be wrong
+ element = @tree.createElement(name, attributes)
+ element._flags.push("parser-inserted")
+ if (@tree.headPointer != nil and
+ @parser.phase == @parser.phases[:inHead])
+ appendToHead(element)
+ else
+ @tree.openElements[-1].appendChild(element)
+ end
+ @tree.openElements.push(element)
+ @parser.tokenizer.contentModelFlag = :CDATA
+ end
+
+ def startTagBaseLinkMeta(name, attributes)
+ element = @tree.createElement(name, attributes)
+ appendToHead(element)
+ end
+
+ def startTagOther(name, attributes)
+ anythingElse
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def endTagHead(name)
+ if @tree.openElements[-1].name == 'head'
+ @tree.openElements.pop
+ else
+ @parser.parseError(_("Unexpected end tag (head). Ignored."))
+ end
+ @parser.phase = @parser.phases[:afterHead]
+ end
+
+ def endTagHtml(name)
+ anythingElse
+ @parser.phase.processEndTag(name)
+ end
+
+ def endTagTitleStyleScript(name)
+ if @tree.openElements[-1].name == name
+ @tree.openElements.pop
+ else
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+ end
+
+ def endTagOther(name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+
+ def anythingElse
+ if @tree.openElements[-1].name == 'head'
+ endTagHead('head')
+ else
+ @parser.phase = @parser.phases[:afterHead]
+ end
+ end
+end
+
+class AfterHeadPhase < Phase
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['body', :startTagBody],
+ ['frameset', :startTagFrameset],
+ [['base', 'link', 'meta', 'script', 'style', 'title'], :startTagFromHead]
+ ]
+
+ def processEOF
+ anythingElse
+ @parser.phase.processEOF
+ end
+
+ def processCharacters(data)
+ anythingElse
+ @parser.phase.processCharacters(data)
+ end
+
+ def startTagBody(name, attributes)
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inBody]
+ end
+
+ def startTagFrameset(name, attributes)
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inFrameset]
+ end
+
+ def startTagFromHead(name, attributes)
+ @parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved."))
+ @parser.phase = @parser.phases[:inHead]
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def startTagOther(name, attributes)
+ anythingElse
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def processEndTag(name)
+ anythingElse
+ @parser.phase.processEndTag(name)
+ end
+
+ def anythingElse
+ @tree.insertElement('body', {})
+ @parser.phase = @parser.phases[:inBody]
+ end
+end
+
+
+class InBodyPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-body
+ # the crazy mode
+
+ handle_start [
+ ['html', :startTagHtml],
+ [['script', 'style'], :startTagScriptStyle],
+ [['base', 'link', 'meta', 'title'], :startTagFromHead],
+ ['body', :startTagBody],
+ [['address', 'blockquote', 'center', 'dir', 'div', 'dl',
+ 'fieldset', 'listing', 'menu', 'ol', 'p', 'pre', 'ul'],
+ :startTagCloseP],
+ ['form', :startTagForm],
+ [['li', 'dd', 'dt'], :startTagListItem],
+ ['plaintext',:startTagPlaintext],
+ [HEADING_ELEMENTS, :startTagHeading],
+ ['a', :startTagA],
+ [['b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
+ 'strong', 'tt', 'u'],:startTagFormatting],
+ ['button', :startTagButton],
+ [['marquee', 'object'], :startTagMarqueeObject],
+ ['xmp', :startTagXmp],
+ ['table', :startTagTable],
+ [['area', 'basefont', 'bgsound', 'br', 'embed', 'img', 'param',
+ 'spacer', 'wbr'], :startTagVoidFormatting],
+ ['hr', :startTagHr],
+ ['image', :startTagImage],
+ ['input', :startTagInput],
+ ['isindex', :startTagIsIndex],
+ ['textarea', :startTagTextarea],
+ [['iframe', 'noembed', 'noframes', 'noscript'], :startTagCdata],
+ ['select', :startTagSelect],
+ [['caption', 'col', 'colgroup', 'frame', 'frameset', 'head',
+ 'option', 'optgroup', 'tbody', 'td', 'tfoot', 'th', 'thead',
+ 'tr'], :startTagMisplaced],
+ [['event-source', 'section', 'nav', 'article', 'aside', 'header',
+ 'footer', 'datagrid', 'command'], :startTagNew]
+ ]
+
+ handle_end [
+ ['p',:endTagP],
+ ['body',:endTagBody],
+ ['html',:endTagHtml],
+ [['address', 'blockquote', 'center', 'div', 'dl', 'fieldset',
+ 'listing', 'menu', 'ol', 'pre', 'ul'], :endTagBlock],
+ ['form', :endTagForm],
+ [['dd', 'dt', 'li'], :endTagListItem],
+ [HEADING_ELEMENTS, :endTagHeading],
+ [['a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small',
+ 'strike', 'strong', 'tt', 'u'], :endTagFormatting],
+ [['marquee', 'object', 'button'], :endTagButtonMarqueeObject],
+ [['head', 'frameset', 'select', 'optgroup', 'option', 'table',
+ 'caption', 'colgroup', 'col', 'thead', 'tfoot', 'tbody', 'tr',
+ 'td', 'th'], :endTagMisplaced],
+ [['area', 'basefont', 'bgsound', 'br', 'embed', 'hr', 'image',
+ 'img', 'input', 'isindex', 'param', 'spacer', 'wbr', 'frame'],
+ :endTagNone],
+ [['noframes', 'noscript', 'noembed', 'textarea', 'xmp', 'iframe'],
+ :endTagCdataTextAreaXmp],
+ [['event-source', 'section', 'nav', 'article', 'aside', 'header',
+ 'footer', 'datagrid', 'command'], :endTagNew]
+ ]
+
+ # Sets up the phase with the leading-newline handling for <pre> turned off.
+ def initialize(parser, tree)
+ super(parser, tree)
+
+ # for special handling of whitespace in <pre>
+ @processSpaceCharactersPre = false
+ end
+
+ # helper
+ # Inserts the element and pushes the last open element onto the list of
+ # active formatting elements.
+ def addFormattingElement(name, attributes)
+ @tree.insertElement(name, attributes)
+ @tree.activeFormattingElements.push(@tree.openElements[-1])
+ end
+
+ # the real deal
+ def processSpaceCharactersPre(data)
+ #Sometimes (start of
blocks) we want to drop leading newlines
+ @processSpaceCharactersPre = false
+ if (data.length > 0 and data[0] == ?\n and
+ @tree.openElements[-1].name == 'pre' and
+ not @tree.openElements[-1].hasContent)
+ data = data[1..-1]
+ end
+ @tree.insertText(data) if data.length > 0
+ end
+
+ def processSpaceCharacters(data)
+ if @processSpaceCharactersPre
+ processSpaceCharactersPre(data)
+ else
+ super(data)
+ end
+ end
+
+ # Reconstructs the active formatting elements, then inserts the text.
+ def processCharacters(data)
+ # XXX The specification says to do this for every character at the
+ # moment, but apparently that doesn't match the real world so we don't
+ # do it for space characters.
+ @tree.reconstructActiveFormattingElements
+ @tree.insertText(data)
+ end
+
+ # Hands <script>/<style> start tags to the in-head phase object; this
+ # method itself does not change the parser's current phase.
+ def startTagScriptStyle(name, attributes)
+ @parser.phases[:inHead].processStartTag(name, attributes)
+ end
+
+ # base/link/meta/title inside the body: record a parse error, then let
+ # the in-head phase object process the start tag.
+ def startTagFromHead(name, attributes)
+ @parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
+ @parser.phases[:inHead].processStartTag(name, attributes)
+ end
+
+ def startTagBody(name, attributes)
+ @parser.parseError(_('Unexpected start tag (body).'))
+
+ if (@tree.openElements.length == 1 or
+ @tree.openElements[1].name != 'body')
+ assert @parser.innerHTML
+ else
+ attributes.each do |attr, value|
+ unless @tree.openElements[1].attributes.has_key?(attr)
+ @tree.openElements[1].attributes[attr] = value
+ end
+ end
+ end
+ end
+
+ # Block-level start tags: close a <p> that is in scope, insert the
+ # element, and for <pre> request the special whitespace handling for the
+ # next space token.
+ def startTagCloseP(name, attributes)
+ endTagP('p') if in_scope?('p')
+ @tree.insertElement(name, attributes)
+ @processSpaceCharactersPre = true if name == 'pre'
+ end
+
+ def startTagForm(name, attributes)
+ if @tree.formPointer
+ @parser.parseError('Unexpected start tag (form). Ignored.')
+ else
+ endTagP('p') if in_scope?('p')
+ @tree.insertElement(name, attributes)
+ @tree.formPointer = @tree.openElements[-1]
+ end
+ end
+
+ def startTagListItem(name, attributes)
+ endTagP('p') if in_scope?('p')
+ stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
+ stopName = stopNames[name]
+
+ @tree.openElements.reverse.each_with_index do |node,i|
+ if stopName.include?(node.name)
+ (i+1).times { @tree.openElements.pop }
+ break
+ end
+
+ # Phrasing elements are all non special, non scoping, non
+ # formatting elements
+ break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
+ not ['address', 'div'].include?(node.name))
+ end
+
+ # Always insert an
element.
+ @tree.insertElement(name, attributes)
+ end
+
+ # <plaintext>: close a <p> in scope, insert the element and switch the
+ # tokenizer's content model to :PLAINTEXT.
+ def startTagPlaintext(name, attributes)
+ endTagP('p') if in_scope?('p')
+ @tree.insertElement(name, attributes)
+ @parser.tokenizer.contentModelFlag = :PLAINTEXT
+ end
+
+ def startTagHeading(name, attributes)
+ endTagP('p') if in_scope?('p')
+ HEADING_ELEMENTS.each do |element|
+ if in_scope?(element)
+ @parser.parseError(_("Unexpected start tag (#{name})."))
+
+ remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
+
+ break
+ end
+ end
+ @tree.insertElement(name, attributes)
+ end
+
+ # <a>: when an <a> is already among the active formatting elements,
+ # report it, run the end tag handling for it and make sure it is gone
+ # from both the open elements and the active formatting elements. Then
+ # the new <a> is added as a formatting element.
+ def startTagA(name, attributes)
+ if afeAElement = @tree.elementInActiveFormattingElements('a')
+ @parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
+ endTagFormatting('a')
+ @tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement)
+ @tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
+ end
+ @tree.reconstructActiveFormattingElements
+ addFormattingElement(name, attributes)
+ end
+
+ # Formatting start tags (b, i, em, ...): reconstruct the active
+ # formatting elements, then add this one to them.
+ def startTagFormatting(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ addFormattingElement(name, attributes)
+ end
+
+ def startTagButton(name, attributes)
+ if in_scope?('button')
+ @parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
+ processEndTag('button')
+ @parser.phase.processStartTag(name, attributes)
+ else
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ @tree.activeFormattingElements.push(Marker)
+ end
+ end
+
+ # <marquee>/<object>: reconstruct the active formatting elements, insert
+ # the element and push a Marker onto the active formatting elements.
+ def startTagMarqueeObject(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ @tree.activeFormattingElements.push(Marker)
+ end
+
+ # <xmp>: insert the element and switch the tokenizer's content model to
+ # :CDATA.
+ def startTagXmp(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ @parser.tokenizer.contentModelFlag = :CDATA
+ end
+
+ # <table>: close a <p> in scope (through the end tag dispatch), insert
+ # the table and switch the parser to the in-table phase.
+ def startTagTable(name, attributes)
+ processEndTag('p') if in_scope?('p')
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inTable]
+ end
+
+ # Void elements such as <br> and <img>: insert the element and pop the
+ # open element stack straight away, so the element is not left open.
+ def startTagVoidFormatting(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ @tree.openElements.pop
+ end
+
+ # <hr>: close a <p> in scope, insert the element and pop the open
+ # element stack straight away.
+ def startTagHr(name, attributes)
+ endTagP('p') if in_scope?('p')
+ @tree.insertElement(name, attributes)
+ @tree.openElements.pop
+ end
+
+ # <image> is reported as a parse error and processed as an <img> start
+ # tag with the same attributes.
+ def startTagImage(name, attributes)
+ # No really...
+ @parser.parseError(_('Unexpected start tag (image). Treated as img.'))
+ processStartTag('img', attributes)
+ end
+
+ # <input>: insert the element and pop the open element stack straight
+ # away. Associating the input with the current form is not implemented
+ # (see the XXX note).
+ def startTagInput(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ if @tree.formPointer
+ # XXX Not exactly sure what to do here
+ # @tree.openElements[-1].form = @tree.formPointer
+ end
+ @tree.openElements.pop
+ end
+
+ def startTagIsIndex(name, attributes)
+ @parser.parseError("Unexpected start tag isindex. Don't use it!")
+ return if @tree.formPointer
+ processStartTag('form', {})
+ processStartTag('hr', {})
+ processStartTag('p', {})
+ processStartTag('label', {})
+ # XXX Localization ...
+ processCharacters('This is a searchable index. Insert your search keywords here:')
+ attributes['name'] = 'isindex'
+ attrs = attributes.to_a
+ processStartTag('input', attributes)
+ processEndTag('label')
+ processEndTag('p')
+ processStartTag('hr', {})
+ processEndTag('form')
+ end
+
+ # <textarea>: insert the element and switch the tokenizer's content
+ # model to :RCDATA.
+ def startTagTextarea(name, attributes)
+ # XXX Form element pointer checking here as well...
+ @tree.insertElement(name, attributes)
+ @parser.tokenizer.contentModelFlag = :RCDATA
+ end
+
+ # iframe, noembed, noframes, noscript (if scripting enabled):
+ # insert the element and switch the tokenizer's content model to :CDATA.
+ def startTagCdata(name, attributes)
+ @tree.insertElement(name, attributes)
+ @parser.tokenizer.contentModelFlag = :CDATA
+ end
+
+ # <select>: insert the element and switch the parser to the in-select
+ # phase.
+ def startTagSelect(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inSelect]
+ end
+
+ # Start tags that only make sense in another insertion mode: recorded as
+ # a parse error and otherwise ignored.
+ def startTagMisplaced(name, attributes)
+ # Elements that should be children of other elements that have a
+ # different insertion mode; here they are ignored
+ # "caption", "col", "colgroup", "frame", "frameset", "head",
+ # "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
+ # "tr", "noscript"
+ @parser.parseError(_("Unexpected start tag (#{name}). Ignored."))
+ end
+
+ def startTagNew(name, attributes)
+ # New HTML5 elements, "event-source", "section", "nav",
+ # "article", "aside", "header", "footer", "datagrid", "command"
+ sys.stderr.write("Warning: Undefined behaviour for start tag #{name}")
+ startTagOther(name, attributes)
+ #raise NotImplementedError
+ end
+
+ # Any other start tag: reconstruct the active formatting elements and
+ # insert the element.
+ def startTagOther(name, attributes)
+ @tree.reconstructActiveFormattingElements
+ @tree.insertElement(name, attributes)
+ end
+
+ def endTagP(name)
+ @tree.generateImpliedEndTags('p') if in_scope?('p')
+ @parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
+ @tree.openElements.pop while in_scope?('p')
+ end
+
+ def endTagBody(name)
+ # XXX Need to take open
tags into account here. We shouldn't imply
+ #
but we should not throw a parse error either. Specification is
+ # likely to be updated.
+ unless @tree.openElements[1].name == 'body'
+ # innerHTML case
+ @parser.parseError
+ return
+ end
+ unless @tree.openElements[-1].name == 'body'
+ @parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
+ end
+ @parser.phase = @parser.phases[:afterBody]
+ end
+
+ # </html>: handled as </body>; outside of innerHTML parsing the tag is
+ # then passed on to whatever phase is current afterwards.
+ def endTagHtml(name)
+ endTagBody(name)
+ @parser.phase.processEndTag(name) unless @parser.innerHTML
+ end
+
+ def endTagBlock(name)
+ #Put us back in the right whitespace handling mode
+ @processSpaceCharactersPre = false if name == 'pre'
+
+ @tree.generateImpliedEndTags if in_scope?(name)
+
+ unless @tree.openElements[-1].name == name
+ @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
+ end
+
+ if in_scope?(name)
+ remove_open_elements_until(name)
+ end
+ end
+
+ # </form>: handled like any block end tag, after which the tree's form
+ # pointer is cleared.
+ def endTagForm(name)
+ endTagBlock(name)
+ @tree.formPointer = nil
+ end
+
+ def endTagListItem(name)
+ # AT Could merge this with the Block case
+ if in_scope?(name)
+ @tree.generateImpliedEndTags(name)
+
+ unless @tree.openElements[-1].name == name
+ @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
+ end
+ end
+
+ remove_open_elements_until(name) if in_scope?(name)
+ end
+
+ def endTagHeading(name)
+ HEADING_ELEMENTS.each do |element|
+ if in_scope?(element)
+ @tree.generateImpliedEndTags
+ break
+ end
+ end
+
+ unless @tree.openElements[-1].name == name
+ @parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
+ end
+
+ HEADING_ELEMENTS.each do |element|
+ if in_scope?(element)
+ remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
+ break
+ end
+ end
+ end
+
+ # The much-feared adoption agency algorithm
+ #
+ # Handles the end tag of a formatting element. The outer loop repeats
+ # until the element can simply be popped (step 3) or one of the step 1
+ # error cases returns; every other pass re-parents the contents of the
+ # furthest block under a clone of the formatting element.
+ def endTagFormatting(name)
+ # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
+ # XXX Better parseError messages appreciated.
+ while true
+ # Step 1 paragraph 1
+ afeElement = @tree.elementInActiveFormattingElements(name)
+ if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
+ @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
+ return
+ # Step 1 paragraph 2
+ elsif not @tree.openElements.include?(afeElement)
+ @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
+ @tree.activeFormattingElements.delete(afeElement)
+ return
+ end
+
+ # Step 1 paragraph 3
+ if afeElement != @tree.openElements[-1]
+ @parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
+ end
+
+ # Step 2
+ # Start of the adoption agency algorithm proper
+ # The furthest block is the first special or scoping element found at
+ # or after the formatting element in the open elements.
+ afeIndex = @tree.openElements.index(afeElement)
+ furthestBlock = nil
+ @tree.openElements[afeIndex..-1].each do |element|
+ if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
+ furthestBlock = element
+ break
+ end
+ end
+
+ # Step 3
+ # No furthest block: pop up to and including the formatting element
+ # and drop it from the active formatting elements.
+ if furthestBlock.nil?
+ element = remove_open_elements_until { |element| element == afeElement }
+ @tree.activeFormattingElements.delete(element)
+ return
+ end
+ # The common ancestor is the open element just before the formatting
+ # element.
+ commonAncestor = @tree.openElements[afeIndex-1]
+
+ # Step 5
+ furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
+
+ # Step 6
+ # The bookmark is supposed to help us identify where to reinsert
+ # nodes in step 12. We have to ensure that we reinsert nodes after
+ # the node before the active formatting element. Note the bookmark
+ # can move in step 7.4
+ bookmark = @tree.activeFormattingElements.index(afeElement)
+
+ # Step 7
+ lastNode = node = furthestBlock
+ while true
+ # AT replace this with a function and recursion?
+ # Node is element before node in open elements
+ node = @tree.openElements[@tree.openElements.index(node)-1]
+ # Open elements that are not active formatting elements are removed
+ # from the open elements on the way.
+ until @tree.activeFormattingElements.include?(node)
+ tmpNode = node
+ node = @tree.openElements[@tree.openElements.index(node)-1]
+ @tree.openElements.delete(tmpNode)
+ end
+ # Step 7.3
+ break if node == afeElement
+ # Step 7.4
+ if lastNode == furthestBlock
+ # XXX should this be index(node) or index(node)+1
+ # Anne: I think +1 is ok. Given x = [2,3,4,5]
+ # x.index(3) gives 1 and then x[1 +1] gives 4...
+ bookmark = @tree.activeFormattingElements.index(node) + 1
+ end
+ # Step 7.5
+ cite = node.parent
+ if node.hasContent
+ clone = node.cloneNode
+ # Replace node with clone
+ @tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
+ @tree.openElements[@tree.openElements.index(node)] = clone
+ node = clone
+ end
+ # Step 7.6
+ # Remove lastNode from its parents, if any
+ lastNode.parent.removeChild(lastNode) if lastNode.parent
+ node.appendChild(lastNode)
+ # Step 7.7
+ lastNode = node
+ # End of inner loop
+ end
+
+ # Step 8
+ lastNode.parent.removeChild(lastNode) if lastNode.parent
+ commonAncestor.appendChild(lastNode)
+
+ # Step 9
+ clone = afeElement.cloneNode
+
+ # Step 10
+ furthestBlock.reparentChildren(clone)
+
+ # Step 11
+ furthestBlock.appendChild(clone)
+
+ # Step 12
+ # The clone takes the bookmarked position, capped at the end of the
+ # list.
+ @tree.activeFormattingElements.delete(afeElement)
+ @tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
+
+ # Step 13
+ @tree.openElements.delete(afeElement)
+ @tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
+ end
+ end
+
+ def endTagButtonMarqueeObject(name)
+ @tree.generateImpliedEndTags if in_scope?(name)
+
+ unless @tree.openElements[-1].name == name
+ @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
+ end
+
+ if in_scope?(name)
+ remove_open_elements_until(name)
+
+ @tree.clearActiveFormattingElements
+ end
+ end
+
+ def endTagMisplaced(name)
+ # This handles elements with end tags in other insertion modes.
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+
+ def endTagNone(name)
+ # This handles elements with no end tag.
+ @parser.parseError(_("This tag (#{name}) has no end tag"))
+ end
+
+ def endTagCdataTextAreaXmp(name)
+ if @tree.openElements[-1].name == name
+ @tree.openElements.pop
+ else
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+ end
+
+ def endTagNew(name)
+ # New HTML5 elements, "event-source", "section", "nav",
+ # "article", "aside", "header", "footer", "datagrid", "command"
+ STDERR.puts "Warning: Undefined behaviour for end tag #{name}"
+ endTagOther(name)
+ #raise NotImplementedError
+ end
+
+ def endTagOther(name)
+ # XXX This logic should be moved into the treebuilder
+ @tree.openElements.reverse.each do |node|
+ if node.name == name
+ @tree.generateImpliedEndTags
+
+ unless @tree.openElements[-1].name == name
+ @parser.parseError(_("Unexpected end tag (#{name})."))
+ end
+
+ remove_open_elements_until { |element| element == node }
+
+ break
+ else
+ if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ break
+ end
+ end
+ end
+ end
+end
+
+class InTablePhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-table
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['caption', :startTagCaption],
+ ['colgroup', :startTagColgroup],
+ ['col', :startTagCol],
+ [['tbody', 'tfoot', 'thead'], :startTagRowGroup],
+ [['td', 'th', 'tr'], :startTagImplyTbody],
+ ['table', :startTagTable]
+ ]
+
+ handle_end [
+ ['table', :endTagTable],
+ [['body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :endTagIgnore]
+ ]
+
+ # helper methods
+ def clearStackToTableContext
+ # "clear the stack back to a table context"
+ until ['table', 'html'].include?(name = @tree.openElements[-1].name)
+ @parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
+ @tree.openElements.pop
+ end
+ # When the current node is it's an innerHTML case
+ end
+
+ # processing methods
+ def processCharacters(data)
+ @parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
+ # Make all the special element rearranging voodoo kick in
+ @tree.insertFromTable = true
+ # Process the character in the "in body" mode
+ @parser.phases[:inBody].processCharacters(data)
+ @tree.insertFromTable = false
+ end
+
+ def startTagCaption(name, attributes)
+ clearStackToTableContext
+ @tree.activeFormattingElements.push(Marker)
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inCaption]
+ end
+
+ def startTagColgroup(name, attributes)
+ clearStackToTableContext
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inColumnGroup]
+ end
+
+ def startTagCol(name, attributes)
+ startTagColgroup('colgroup', {})
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def startTagRowGroup(name, attributes)
+ clearStackToTableContext
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inTableBody]
+ end
+
+ def startTagImplyTbody(name, attributes)
+ startTagRowGroup('tbody', {})
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def startTagTable(name, attributes)
+ @parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
+ @parser.phase.processEndTag('table')
+ @parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
+ end
+
+ def startTagOther(name, attributes)
+ @parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
+ # Make all the special element rearranging voodoo kick in
+ @tree.insertFromTable = true
+ # Process the start tag in the "in body" mode
+ @parser.phases[:inBody].processStartTag(name, attributes)
+ @tree.insertFromTable = false
+ end
+
+ def endTagTable(name)
+ if in_scope?('table', true)
+ @tree.generateImpliedEndTags
+
+ unless @tree.openElements[-1].name == 'table'
+ @parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
+ end
+
+ remove_open_elements_until('table')
+
+ @parser.resetInsertionMode
+ else
+ # innerHTML case
+ assert @parser.innerHTML
+ @parser.parseError
+ end
+ end
+
+ def endTagIgnore(name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+
+ def endTagOther(name)
+ @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
+ # Make all the special element rearranging voodoo kick in
+ @parser.insertFromTable = true
+ # Process the end tag in the "in body" mode
+ @parser.phases[:inBody].processEndTag(name)
+ @parser.insertFromTable = false
+ end
+end
+
+
+class InCaptionPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
+
+ handle_start [
+ ['html', :startTagHtml],
+ [['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :startTagTableElement]
+ ]
+
+ handle_end [
+ ['caption', :endTagCaption],
+ ['table', :endTagTable],
+ [['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :endTagIgnore]
+ ]
+
+ def ignoreEndTagCaption
+ not in_scope?('caption', true)
+ end
+
+ def processCharacters(data)
+ @parser.phases[:inBody].processCharacters(data)
+ end
+
+ def startTagTableElement(name, attributes)
+ @parser.parseError
+ #XXX Have to duplicate logic here to find out if the tag is ignored
+ ignoreEndTag = ignoreEndTagCaption
+ @parser.phase.processEndTag('caption')
+ @parser.phase.processStartTag(name, attributes) unless ignoreEndTag
+ end
+
+ def startTagOther(name, attributes)
+ @parser.phases[:inBody].processStartTag(name, attributes)
+ end
+
+ def endTagCaption(name)
+ if ignoreEndTagCaption
+ # innerHTML case
+ assert @parser.innerHTML
+ @parser.parseError
+ else
+ # AT this code is quite similar to endTagTable in "InTable"
+ @tree.generateImpliedEndTags
+
+ unless @tree.openElements[-1].name == 'caption'
+ @parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
+ end
+
+ remove_open_elements_until('caption')
+
+ @tree.clearActiveFormattingElements
+ @parser.phase = @parser.phases[:inTable]
+ end
+ end
+
+ def endTagTable(name)
+ @parser.parseError
+ ignoreEndTag = ignoreEndTagCaption
+ @parser.phase.processEndTag('caption')
+ @parser.phase.processEndTag(name) unless ignoreEndTag
+ end
+
+ def endTagIgnore(name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+
+ def endTagOther(name)
+ @parser.phases[:inBody].processEndTag(name)
+ end
+end
+
+
+class InColumnGroupPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-column
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['col', :startTagCol]
+ ]
+
+ handle_end [
+ ['colgroup', :endTagColgroup],
+ ['col', :endTagCol]
+ ]
+
+ def ignoreEndTagColgroup
+ @tree.openElements[-1].name == 'html'
+ end
+
+ def processCharacters(data)
+ ignoreEndTag = ignoreEndTagColgroup
+ endTagColgroup("colgroup")
+ @parser.phase.processCharacters(data) unless ignoreEndTag
+ end
+
+ def startTagCol(name, attributes)
+ @tree.insertElement(name, attributes)
+ @tree.openElements.pop
+ end
+
+ def startTagOther(name, attributes)
+ ignoreEndTag = ignoreEndTagColgroup
+ endTagColgroup('colgroup')
+ @parser.phase.processStartTag(name, attributes) unless ignoreEndTag
+ end
+
+ def endTagColgroup(name)
+ if ignoreEndTagColgroup
+ # innerHTML case
+ assert @parser.innerHTML
+ @parser.parseError
+ else
+ @tree.openElements.pop
+ @parser.phase = @parser.phases[:inTable]
+ end
+ end
+
+ def endTagCol(name)
+ @parser.parseError(_('Unexpected end tag (col). col has no end tag.'))
+ end
+
+ def endTagOther(name)
+ ignoreEndTag = ignoreEndTagColgroup
+ endTagColgroup('colgroup')
+ @parser.phase.processEndTag(name) unless ignoreEndTag
+ end
+end
+
+
+class InTableBodyPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['tr', :startTagTr],
+ [['td', 'th'], :startTagTableCell],
+ [['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'], :startTagTableOther]
+ ]
+
+ handle_end [
+ [['tbody', 'tfoot', 'thead'], :endTagTableRowGroup],
+ ['table', :endTagTable],
+ [['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'], :endTagIgnore]
+ ]
+
+ # helper methods
+ def clearStackToTableBodyContext
+ until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
+ @parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
+ @tree.openElements.pop
+ end
+ end
+
+ # the rest
+ def processCharacters(data)
+ @parser.phases[:inTable].processCharacters(data)
+ end
+
+ def startTagTr(name, attributes)
+ clearStackToTableBodyContext
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inRow]
+ end
+
+ def startTagTableCell(name, attributes)
+ @parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
+ startTagTr('tr', {})
+ @parser.phase.processStartTag(name, attributes)
+ end
+
+ def startTagTableOther(name, attributes)
+ # XXX AT Any ideas on how to share this with endTagTable?
+ if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
+ clearStackToTableBodyContext
+ endTagTableRowGroup(@tree.openElements[-1].name)
+ @parser.phase.processStartTag(name, attributes)
+ else
+ # innerHTML case
+ @parser.parseError
+ end
+ end
+
+ def startTagOther(name, attributes)
+ @parser.phases[:inTable].processStartTag(name, attributes)
+ end
+
+ def endTagTableRowGroup(name)
+ if in_scope?(name, true)
+ clearStackToTableBodyContext
+ @tree.openElements.pop
+ @parser.phase = @parser.phases[:inTable]
+ else
+ @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
+ end
+ end
+
+ def endTagTable(name)
+ if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
+ clearStackToTableBodyContext
+ endTagTableRowGroup(@tree.openElements[-1].name)
+ @parser.phase.processEndTag(name)
+ else
+ # innerHTML case
+ @parser.parseError
+ end
+ end
+
+ def endTagIgnore(name)
+ @parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
+ end
+
+ def endTagOther(name)
+ @parser.phases[:inTable].processEndTag(name)
+ end
+end
+
+
+class InRowPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-row
+
+ handle_start [
+ ['html', :startTagHtml],
+ [['td', 'th'], :startTagTableCell],
+ [['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'], :startTagTableOther]
+ ]
+
+ handle_end [
+ ['tr', :endTagTr],
+ ['table', :endTagTable],
+ [['tbody', 'tfoot', 'thead'], :endTagTableRowGroup],
+ [['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'], :endTagIgnore]
+ ]
+
+ # helper methods (XXX unify this with other table helper methods)
+ def clearStackToTableRowContext
+ until ['tr', 'html'].include?(name = @tree.openElements[-1].name)
+ @parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
+ @tree.openElements.pop
+ end
+ end
+
+ def ignoreEndTagTr
+ not in_scope?('tr', :tableVariant => true)
+ end
+
+ # the rest
+ def processCharacters(data)
+ @parser.phases[:inTable].processCharacters(data)
+ end
+
+ def startTagTableCell(name, attributes)
+ clearStackToTableRowContext
+ @tree.insertElement(name, attributes)
+ @parser.phase = @parser.phases[:inCell]
+ @tree.activeFormattingElements.push(Marker)
+ end
+
+ def startTagTableOther(name, attributes)
+ ignoreEndTag = ignoreEndTagTr
+ endTagTr('tr')
+ # XXX how are we sure it's always ignored in the innerHTML case?
+ @parser.phase.processStartTag(name, attributes) unless ignoreEndTag
+ end
+
+ def startTagOther(name, attributes)
+ @parser.phases[:inTable].processStartTag(name, attributes)
+ end
+
+ def endTagTr(name)
+ if ignoreEndTagTr
+ # innerHTML case
+ assert @parser.innerHTML
+ @parser.parseError
+ else
+ clearStackToTableRowContext
+ @tree.openElements.pop
+ @parser.phase = @parser.phases[:inTableBody]
+ end
+ end
+
+ def endTagTable(name)
+ ignoreEndTag = ignoreEndTagTr
+ endTagTr('tr')
+ # Reprocess the current tag if the tr end tag was not ignored
+ # XXX how are we sure it's always ignored in the innerHTML case?
+ @parser.phase.processEndTag(name) unless ignoreEndTag
+ end
+
+ def endTagTableRowGroup(name)
+ if in_scope?(name, true)
+ endTagTr('tr')
+ @parser.phase.processEndTag(name)
+ else
+ # innerHTML case
+ @parser.parseError
+ end
+ end
+
+ def endTagIgnore(name)
+ @parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
+ end
+
+ def endTagOther(name)
+ @parser.phases[:inTable].processEndTag(name)
+ end
+end
+
+class InCellPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
+
+ handle_start [
+ ['html', :startTagHtml],
+ [['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'], :startTagTableOther]
+ ]
+
+ handle_end [
+ [['td', 'th'], :endTagTableCell],
+ [['body', 'caption', 'col', 'colgroup', 'html'], :endTagIgnore],
+ [['table', 'tbody', 'tfoot', 'thead', 'tr'], :endTagImply]
+ ]
+
+ # helper
+ def closeCell
+ if in_scope?('td', true)
+ endTagTableCell('td')
+ elsif in_scope?('th', true)
+ endTagTableCell('th')
+ end
+ end
+
+ # the rest
+ def processCharacters(data)
+ @parser.phases[:inBody].processCharacters(data)
+ end
+
+ def startTagTableOther(name, attributes)
+ if in_scope?('td', true) or in_scope?('th', true)
+ closeCell
+ @parser.phase.processStartTag(name, attributes)
+ else
+ # innerHTML case
+ @parser.parseError
+ end
+ end
+
+ def startTagOther(name, attributes)
+ @parser.phases[:inBody].processStartTag(name, attributes)
+ end
+
+ def endTagTableCell(name)
+ if in_scope?(name, true)
+ @tree.generateImpliedEndTags(name)
+ if @tree.openElements[-1].name != name
+ @parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.")
+
+ remove_open_elements_until(name)
+ else
+ @tree.openElements.pop
+ end
+ @tree.clearActiveFormattingElements
+ @parser.phase = @parser.phases[:inRow]
+ else
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+ end
+
+ def endTagIgnore(name)
+ @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
+ end
+
+ def endTagImply(name)
+ if in_scope?(name, true)
+ closeCell
+ @parser.phase.processEndTag(name)
+ else
+ # sometimes innerHTML case
+ @parser.parseError
+ end
+ end
+
+ def endTagOther(name)
+ @parser.phases[:inBody].processEndTag(name)
+ end
+end
+
+
+class InSelectPhase < Phase
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-select
+
+ handle_start [
+ ['html', :startTagHtml],
+ ['option', :startTagOption],
+ ['optgroup', :startTagOptgroup],
+ ['select', :startTagSelect]
+ ]
+
+ handle_end [
+ ['option', :endTagOption],
+ ['optgroup', :endTagOptgroup],
+ ['select', :endTagSelect],
+ [['caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'], :endTagTableElements]
+ ]
+
+ def processCharacters(data)
+ @tree.insertText(data)
+ end
+
+ def startTagOption(name, attributes)
+ # We need to imply if