HTML5lib Sanitizer
Replaced native Sanitizer with HTML5lib version. Synced with latest Maruku.
This commit is contained in:
parent
457ec8627c
commit
6b21ac484f
36 changed files with 6534 additions and 215 deletions
|
@ -294,13 +294,13 @@ class WikiController < ApplicationController
|
||||||
|
|
||||||
def s5
|
def s5
|
||||||
if @web.markup == :markdownMML
|
if @web.markup == :markdownMML
|
||||||
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
@s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||||
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
|
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
|
||||||
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
|
:author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
|
||||||
elsif @web.markup == :markdown
|
elsif @web.markup == :markdown
|
||||||
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
@s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||||
{:math_enabled => false, :content_only => true,
|
{:math_enabled => false, :content_only => true,
|
||||||
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
|
:author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
|
||||||
else
|
else
|
||||||
@s5_content = "S5 not supported with this text filter"
|
@s5_content = "S5 not supported with this text filter"
|
||||||
end
|
end
|
||||||
|
|
207
attic/lib/sanitize.rb
Normal file
207
attic/lib/sanitize.rb
Normal file
|
@ -0,0 +1,207 @@
|
||||||
|
module Sanitize
|
||||||
|
|
||||||
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
|
# and of inline style attributes.
|
||||||
|
#
|
||||||
|
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||||
|
|
||||||
|
require 'html/tokenizer'
|
||||||
|
require 'node'
|
||||||
|
|
||||||
|
# Whitelists used by sanitize_html and sanitize_css.
#
# Each ALLOWED_* constant is only assigned when it is not already defined, so an
# application can install its own list before this module is loaded.

# XHTML elements that are passed through.
acceptable_elements = %w[
  a abbr acronym address area b
  big blockquote br button caption center cite
  code col colgroup dd del dfn dir div dl dt
  em fieldset font form h1 h2 h3 h4 h5 h6
  hr i img input ins kbd label legend li map
  menu ol optgroup option p pre q s samp
  select small span strike strong sub sup table
  tbody td textarea tfoot th thead tr tt u
  ul var
]

# MathML elements that are passed through.
mathml_elements = %w[
  maction math merror mfrac mi
  mmultiscripts mn mo mover mpadded mphantom
  mprescripts mroot mrow mspace msqrt mstyle msub
  msubsup msup mtable mtd mtext mtr munder
  munderover none
]

# SVG elements that are passed through.
svg_elements = %w[
  a animate animateColor animateMotion
  animateTransform circle defs desc ellipse font-face
  font-face-name font-face-src g glyph hkern image
  linearGradient line marker metadata missing-glyph
  mpath path polygon polyline radialGradient rect
  set stop svg switch text title tspan use
]

# XHTML attributes that are kept on allowed elements.
acceptable_attributes = %w[
  abbr accept accept-charset accesskey
  action align alt axis border cellpadding
  cellspacing char charoff charset checked cite class
  clear cols colspan color compact coords datetime
  dir disabled enctype for frame headers height
  href hreflang hspace id ismap label lang
  longdesc maxlength media method multiple name
  nohref noshade nowrap prompt readonly rel rev
  rows rowspan rules scope selected shape size
  span src start style summary tabindex target title
  type usemap valign value vspace width xml:lang
]

# MathML attributes that are kept (each name listed once).
mathml_attributes = %w[
  actiontype align columnalign columnlines columnspacing columnspan depth
  display displaystyle equalcolumns equalrows fence
  fontstyle fontweight frame height linethickness lspace
  mathbackground mathcolor mathvariant maxsize
  minsize other rowalign rowlines
  rowspacing rowspan rspace scriptlevel selection
  separator stretchy width xlink:href xlink:show
  xlink:type xmlns xmlns:xlink
]

# SVG attributes that are kept.
svg_attributes = %w[
  accent-height accumulate additive alphabetic
  arabic-form ascent attributeName attributeType
  baseProfile bbox begin by calcMode cap-height
  class color color-rendering content cx cy d dx
  dy descent display dur end fill fill-rule
  font-family font-size font-stretch font-style font-variant
  font-weight from fx fy g1 g2 glyph-name
  gradientUnits hanging height horiz-adv-x horiz-origin-x
  id ideographic k keyPoints keySplines keyTimes
  lang marker-end marker-mid marker-start markerHeight
  markerUnits markerWidth mathematical max min name
  offset opacity orient origin overline-position
  overline-thickness panose-1 path pathLength points
  preserveAspectRatio r refX refY repeatCount repeatDur
  requiredExtensions requiredFeatures restart rotate rx
  ry slope stemh stemv stop-color stop-opacity
  strikethrough-position strikethrough-thickness stroke
  stroke-dasharray stroke-dashoffset stroke-linecap
  stroke-linejoin stroke-miterlimit stroke-opacity
  stroke-width systemLanguage target
  text-anchor to transform type u1 u2
  underline-position underline-thickness unicode
  unicode-range units-per-em values version viewBox
  visibility width widths x x-height x1 x2
  xlink:actuate xlink:arcrole xlink:href xlink:role
  xlink:show xlink:title xlink:type xml:base xml:lang
  xml:space xmlns xmlns:xlink y y1 y2 zoomAndPan
]

# Attributes whose value is a URI and is therefore checked against the protocol whitelist.
attr_val_is_uri = %w[href src cite action longdesc xlink:href]

# CSS properties that may appear in a style attribute with any (gauntlet-approved) value.
acceptable_css_properties = %w[
  azimuth background-color
  border-bottom-color border-collapse border-color
  border-left-color border-right-color border-top-color clear
  color cursor direction display elevation float font
  font-family font-size font-style font-variant font-weight
  height letter-spacing line-height overflow pause
  pause-after pause-before pitch pitch-range richness
  speak speak-header speak-numeral speak-punctuation
  speech-rate stress text-align text-decoration text-indent
  unicode-bidi vertical-align voice-family volume
  white-space width
]

# Keywords accepted in the value of background/border/margin/padding properties.
acceptable_css_keywords = %w[
  auto aqua black block blue
  bold both bottom brown center collapse dashed
  dotted fuchsia gray green !important italic left
  lime maroon medium none navy normal nowrap olive
  pointer purple red right solid silver teal top
  transparent underline white yellow
]

# SVG presentation properties accepted in a style attribute.
acceptable_svg_properties = %w[
  fill fill-opacity fill-rule
  stroke stroke-width stroke-linecap stroke-linejoin
  stroke-opacity
]

# URI schemes accepted in the attributes listed in attr_val_is_uri.
acceptable_protocols = %w[
  ed2k ftp http https irc
  mailto news gopher nntp telnet webcal
  xmpp callto feed urn aim rsync tag
  ssh sftp rtsp afs
]

# The combined lists are de-duplicated: several names ('a', 'align', 'height', 'width', ...)
# occur in more than one of the HTML, MathML and SVG lists.
ALLOWED_ELEMENTS = (acceptable_elements + mathml_elements + svg_elements).uniq unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = (acceptable_attributes + mathml_attributes + svg_attributes).uniq unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
||||||
|
|
||||||
|
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and only the restricted set
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS is allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized by defining these constant arrays before this module is loaded.
#
#   sanitize_html('<script> do_nasty_stuff() </script>')
#    => &lt;script> do_nasty_stuff() &lt;/script>
#   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
#
# Returns +html+ itself when it contains no "<"; otherwise a newly built string.
def sanitize_html(html)
  if html.index("<")
    tokenizer = HTML::Tokenizer.new(html)
    new_text = ""

    while token = tokenizer.next
      node = XHTML::Node.parse(nil, 0, 0, token, false)
      new_text << case node.tag?
        when true
          if ALLOWED_ELEMENTS.include?(node.name)
            # Closing tags are emitted as they are; only opening tags get attribute filtering.
            if node.closing != :close
              node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
              ATTR_VAL_IS_URI.each do |attr|
                # Decode entities and drop whitespace/control bytes before looking at the
                # scheme, so that obfuscated "jav&#x09;ascript:" style values are recognised.
                val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
                if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                  node.attributes.delete attr
                end
              end
              if node.attributes['style']
                node.attributes['style'] = sanitize_css(node.attributes['style'])
              end
            end
            node.to_s
          else
            # Element is not whitelisted: neutralise it by escaping its "<".
            node.to_s.gsub(/</, "&lt;")
          end
        else
          # Not a tag: escape any stray "<" left in the text.
          node.to_s.gsub(/</, "&lt;")
      end
    end

    html = new_text
  end
  html
end
|
||||||
|
|
||||||
|
# Sanitize the value of an inline +style+ attribute and return the cleaned value.
#
# url(...) references are removed first. The whole value is then rejected (an empty
# string is returned) unless it is made up only of harmless tokens and is laid out as
# a sequence of "property: value;" declarations. Of the remaining declarations, those
# whose property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES are kept as is;
# background*/border*/margin*/padding* declarations are kept when every word of the
# value is in ALLOWED_CSS_KEYWORDS or looks like a colour or a length. Everything else
# is dropped. +style+ may be nil, which yields an empty string.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored to the whole string (\A ... \Z). With the line anchors
  # ^ and $ a value passes as soon as any single line of it is harmless, so a newline
  # would be enough to smuggle e.g. "expression(...)" past the checks.
  return '' if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\Z/
  return '' if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\Z))*\Z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
      clean << prop + ': ' + val + ';'
    elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
      # Shorthand-style properties: every whitespace-separated word must be a known
      # keyword, a hex/rgb colour, or a short number with an optional unit.
      goodval = val.split.all? do |keyword|
        ALLOWED_CSS_KEYWORDS.include?(keyword) ||
          keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << prop + ': ' + val + ';' if goodval
    elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
      clean << prop + ': ' + val + ';'
    end
  end

  clean.join(' ')
end
|
||||||
|
end
|
187
attic/test/unit/sanitize_test.rb
Normal file
187
attic/test/unit/sanitize_test.rb
Normal file
|
@ -0,0 +1,187 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
|
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
|
||||||
|
require 'sanitize'
|
||||||
|
|
||||||
|
# Unit tests for Sanitize#sanitize_html (and, through style attributes, sanitize_css).
#
# Disallowed markup is expected to come back with its "<" escaped as "&lt;";
# disallowed attributes and URIs with non-whitelisted schemes are expected to be removed.
class SanitizeTest < Test::Unit::TestCase
  include Sanitize

  def setup

  end

  # Every whitelisted element passes through; the unknown <bad> element inside it is escaped.
  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      assert_equal "<#{tag_name} title=\"1\">foo &lt;bad>bar&lt;/bad> baz</#{tag_name}>",
        sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
    end
  end

  # Element names are matched case-sensitively, so the upper-case form is escaped.
  Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      assert_equal "&lt;#{tag_name.upcase} title=\"1\">foo &lt;bad>bar&lt;/bad> baz&lt;/#{tag_name.upcase}>",
        sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
    end
  end

  # 'style' is skipped here because its value is rewritten by sanitize_css.
  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
    if attribute_name != 'style'
      define_method "test_should_allow_#{attribute_name}_attribute" do
        assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad>bar&lt;/bad> baz</p>",
          sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
      end
    end
  end

  # Attribute names are matched case-sensitively, so the upper-case form is stripped.
  Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      assert_equal "<p>foo &lt;bad>bar&lt;/bad> baz</p>",
        sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
    end
  end

  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      assert_equal "<a href=\"#{protocol}\">foo</a>",
        sanitize_html(%(<a href="#{protocol}">foo</a>))
    end
  end

  Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
        sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
    end
  end

  def test_should_allow_anchors
    assert_equal "<a href=\"foo\">&lt;script>baz&lt;/script></a>",
      sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
  end

  # RFC 3986, sec 4.2
  def test_allow_colons_in_path_component
    assert_equal "<a href=\"./this:that\">foo</a>",
      sanitize_html("<a href=\"./this:that\">foo</a>")
  end

  %w(src width height alt).each do |img_attr|
    define_method "test_should_allow_image_#{img_attr}_attribute" do
      assert_equal "<img #{img_attr}=\"foo\" />",
        sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
    end
  end

  def test_should_handle_non_html
    assert_equal 'abc', sanitize_html("abc")
  end

  def test_should_handle_blank_text
    assert_equal '', sanitize_html('')
  end

  [%w(img src), %w(a href)].each do |(tag, attr)|
    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
    end
  end

  [%w(img src), %w(a href)].each do |(tag, attr)|
    define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
      assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
    end
  end

  # javascript: URIs in various disguises: mixed case, character references
  # (named, decimal, zero-padded decimal, hexadecimal) and embedded or leading
  # whitespace/control characters.
  [%(<img src="javascript:alert('XSS');" />),
   %(<img src=javascript:alert('XSS') />),
   %(<img src="JaVaScRiPt:alert('XSS')" />),
   %(<img src='javascript:alert(&quot;XSS&quot;)' />),
   %(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
   %(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
   %(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041;' />),
   %(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29;' />),
   %(<img src="jav\tascript:alert('XSS');" />),
   %(<img src="jav&#x09;ascript:alert('XSS');" />),
   %(<img src="jav&#x0A;ascript:alert('XSS');" />),
   %(<img src="jav&#x0D;ascript:alert('XSS');" />),
   %(<img src=" &#14;  javascript:alert('XSS');" />),
   %(<img src="&#x20;javascript:alert('XSS');" />),
   %(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
    define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
      assert_equal "<img />", sanitize_html(img_hack)
    end
  end

  def test_should_sanitize_tag_broken_up_by_null
    assert_equal "&lt;scr>alert(\"XSS\")&lt;/scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
  end

  def test_should_sanitize_invalid_script_tag
    assert_equal "&lt;script />&lt;/script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
  end

  def test_should_sanitize_script_tag_with_multiple_open_brackets
    assert_equal "&lt;&lt;script>alert(\"XSS\");//&lt;&lt;/script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
    assert_equal %(&lt;iframe src="http:" />&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
  end

  def test_should_sanitize_unclosed_script
    assert_equal "&lt;script src=\"http:\" />&lt;b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
  end

  def test_should_sanitize_half_open_scripts
    assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
  end

  def test_should_not_fall_for_ridiculous_hack
    img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
    assert_equal "<img />", sanitize_html(img_hack)
  end

  def test_platypus
    assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
      sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
  end

  def test_xul
    assert_equal %(<p style="">fubar</p>),
      sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
  end

  def test_input_image
    assert_equal %(<input type="image" />),
      sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
  end

  def test_non_alpha_non_digit
    assert_equal "&lt;script />&lt;/script>",
      sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
    assert_equal "<a>foo</a>",
      sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
    assert_equal "<img />",
      sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
  end

  def test_img_dynsrc_lowsrc
    assert_equal "<img />",
      sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
    assert_equal "<img />",
      sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
  end

  def test_div_background_image_unicode_encoded
    assert_equal '<div style="">foo</div>',
      sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
  end

  def test_div_expression
    assert_equal '<div style="">foo</div>',
      sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
  end

  def test_img_vbscript
    assert_equal '<img />',
      sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
  end

end
|
|
@ -32,7 +32,7 @@ module Engines
|
||||||
redcloth.filter_html = false
|
redcloth.filter_html = false
|
||||||
redcloth.no_span_caps = false
|
redcloth.no_span_caps = false
|
||||||
html = redcloth.to_html(:textile)
|
html = redcloth.to_html(:textile)
|
||||||
sanitize_html(html)
|
sanitize_xhtml(html)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ module Engines
|
||||||
require_dependency 'maruku'
|
require_dependency 'maruku'
|
||||||
require_dependency 'maruku/ext/math'
|
require_dependency 'maruku/ext/math'
|
||||||
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
|
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
|
||||||
sanitize_html(html).to_ncr
|
sanitize_xhtml(html.to_ncr)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ module Engines
|
||||||
require_dependency 'maruku/ext/math'
|
require_dependency 'maruku/ext/math'
|
||||||
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||||
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
|
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
|
||||||
sanitize_html(html).to_ncr
|
sanitize_xhtml(html.to_ncr)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ module Engines
|
||||||
redcloth.filter_html = false
|
redcloth.filter_html = false
|
||||||
redcloth.no_span_caps = false
|
redcloth.no_span_caps = false
|
||||||
html = redcloth.to_html
|
html = redcloth.to_html
|
||||||
sanitize_html(html)
|
sanitize_xhtml(html)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ module Engines
|
||||||
def mask
|
def mask
|
||||||
require_dependency 'rdocsupport'
|
require_dependency 'rdocsupport'
|
||||||
html = RDocSupport::RDocFormatter.new(@content).to_html
|
html = RDocSupport::RDocFormatter.new(@content).to_html
|
||||||
sanitize_html(html)
|
sanitize_xhtml(html)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
215
lib/sanitize.rb
215
lib/sanitize.rb
|
@ -1,207 +1,26 @@
|
||||||
module Sanitize
|
module Sanitize
|
||||||
|
|
||||||
# This module provides sanitization of XHTML+MathML+SVG
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
# and of inline style attributes.
|
# and of inline style attributes.
|
||||||
#
|
#
|
||||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
# Uses the HTML5lib parser, so that the parsing behaviour should
|
||||||
|
# resemble that of browsers.
|
||||||
require 'html/tokenizer'
|
#
|
||||||
require 'node'
|
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||||
|
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
|
||||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
|
||||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
|
||||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
|
||||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
|
||||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
|
||||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
|
||||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
|
||||||
'ul', 'var']
|
|
||||||
|
|
||||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
|
||||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
|
||||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
|
||||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
|
||||||
'munderover', 'none']
|
|
||||||
|
|
||||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
|
||||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
|
||||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
|
||||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
|
||||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
|
||||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
|
||||||
|
|
||||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
|
||||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
|
||||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
|
||||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
|
||||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
|
||||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
|
||||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
|
||||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
|
||||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
|
||||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
|
|
||||||
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
|
|
||||||
|
|
||||||
|
|
||||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
require 'html5lib/sanitizer'
|
||||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
require 'html5lib/html5parser'
|
||||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
require 'html5lib/liberalxmlparser'
|
||||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
include HTML5lib
|
||||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
|
||||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
|
||||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
|
||||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
|
||||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
|
||||||
|
|
||||||
|
def sanitize_xhtml(html)
|
||||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
||||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
end
|
||||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
|
||||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
|
||||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
|
||||||
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
|
|
||||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
|
||||||
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
|
||||||
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
|
||||||
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
|
|
||||||
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
|
|
||||||
'offset', 'opacity', 'orient', 'origin', 'overline-position',
|
|
||||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
|
||||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
|
|
||||||
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
|
|
||||||
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
|
||||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
|
||||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
|
||||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
|
||||||
'stroke-width', 'systemLanguage', 'target',
|
|
||||||
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
|
||||||
'underline-position', 'underline-thickness', 'unicode',
|
|
||||||
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
|
||||||
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
|
||||||
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
|
||||||
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
|
||||||
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
|
||||||
|
|
||||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
|
def sanitize_html(html)
|
||||||
|
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
||||||
acceptable_css_properties = ['azimuth', 'background-color',
|
end
|
||||||
'border-bottom-color', 'border-collapse', 'border-color',
|
|
||||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
|
||||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
|
||||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
|
||||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
|
||||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
|
||||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
|
||||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
|
||||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
|
||||||
'white-space', 'width']
|
|
||||||
|
|
||||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
end
|
||||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
|
||||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
|
||||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
|
||||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
|
||||||
'transparent', 'underline', 'white', 'yellow']
|
|
||||||
|
|
||||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
|
||||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
|
||||||
'stroke-opacity']
|
|
||||||
|
|
||||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
|
||||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
|
||||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
|
||||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
|
||||||
|
|
||||||
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
|
|
||||||
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
|
|
||||||
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
|
|
||||||
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
|
|
||||||
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
|
|
||||||
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
|
|
||||||
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
|
||||||
|
|
||||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
#
#   sanitize_html('<script> do_nasty_stuff() </script>')
#    => &lt;script> do_nasty_stuff() &lt;/script>
#   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
#
# Returns +html+ itself when it contains no "<" at all, otherwise a newly built string.
def sanitize_html(html)
  # No markup, nothing to tokenize.
  return html unless html.index("<")

  tokenizer = HTML::Tokenizer.new(html)
  new_text = ""

  while token = tokenizer.next
    node = XHTML::Node.parse(nil, 0, 0, token, false)

    # Text nodes and tags that are not whitelisted are neutralised by escaping
    # every "<" so they can never be parsed as markup by the browser.
    unless node.tag? == true && ALLOWED_ELEMENTS.include?(node.name)
      new_text << node.to_s.gsub(/</, "&lt;")
      next
    end

    # Closing tags carry no attributes worth inspecting.
    if node.closing != :close
      node.attributes.delete_if { |attr, _value| !ALLOWED_ATTRIBUTES.include?(attr) }

      ATTR_VAL_IS_URI.each do |attr|
        # Undo entity encoding and strip control/space characters before looking
        # at the scheme, so obfuscated schemes such as "java&#10;script:" are caught.
        val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
        if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
          node.attributes.delete attr
        end
      end

      if node.attributes['style']
        node.attributes['style'] = sanitize_css(node.attributes['style'])
      end
    end

    new_text << node.to_s
  end

  new_text
end
|
|
||||||
|
|
||||||
# Sanitize the value of an inline +style+ attribute.
#
# +style+ is converted with +to_s+, so +nil+ is accepted. Returns a string made
# of the declarations that survived, each rendered as "prop: value;" and joined
# by a single space; returns '' when the input fails either sanity check.
#
# A declaration is kept when its property is in ALLOWED_CSS_PROPERTIES or
# ALLOWED_SVG_PROPERTIES, or when it is a background/border/margin/padding
# (sub)property whose every whitespace-separated value is an allowed keyword,
# a hex colour, an rgb() triple or a short length.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A / \z (whole string). With ^ / $ a value
  # containing an empty line satisfied the checks on that line alone, letting
  # arbitrary text on the other lines through.
  return '' if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    name = prop.downcase

    if ALLOWED_CSS_PROPERTIES.include?(name)
      clean << prop + ': ' + val + ';'
    elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
      # Shorthand families: every token of the value must be individually harmless.
      goodval = val.split.all? do |keyword|
        ALLOWED_CSS_KEYWORDS.include?(keyword) ||
          keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << prop + ': ' + val + ';' if goodval
    elsif ALLOWED_SVG_PROPERTIES.include?(name)
      clean << prop + ': ' + val + ';'
    end
  end

  clean.join(' ')
end
|
|
||||||
end
|
|
||||||
|
|
9
vendor/plugins/HTML5lib/README
vendored
Normal file
9
vendor/plugins/HTML5lib/README
vendored
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
= HTML5lib
|
||||||
|
|
||||||
|
== Basic Usage
|
||||||
|
|
||||||
|
require 'html5lib'
|
||||||
|
|
||||||
|
doc = HTML5lib.parse('<html>...</html>')
|
||||||
|
|
||||||
|
doc.class # REXML::Document
|
7
vendor/plugins/HTML5lib/Rakefile.rb
vendored
Normal file
7
vendor/plugins/HTML5lib/Rakefile.rb
vendored
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
require 'rake'
|
||||||
|
require 'rake/testtask'
|
||||||
|
|
||||||
|
# Register a Rake test task that picks up every tests/test_*.rb file
# and runs with verbose output enabled.
Rake::TestTask.new do |t|
  t.verbose = true
  t.pattern = 'tests/test_*.rb'
end
|
11
vendor/plugins/HTML5lib/lib/html5lib.rb
vendored
Normal file
11
vendor/plugins/HTML5lib/lib/html5lib.rb
vendored
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
|
||||||
|
# Convenience entry points that forward to HTMLParser.
module HTML5lib
  # Parse a complete document read from +stream+.
  # +options+ is passed to HTMLParser.parse unchanged.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse a document fragment read from +stream+.
  # +options+ is passed to HTMLParser.parseFragment unchanged.
  # (Previously this forwarded to HTMLParser.parse, i.e. it parsed the input
  # as a full document.)
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end
|
676
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
Executable file
676
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
Executable file
|
@ -0,0 +1,676 @@
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
class EOF < Exception; end
|
||||||
|
|
||||||
|
CONTENT_MODEL_FLAGS = [
|
||||||
|
:PCDATA,
|
||||||
|
:RCDATA,
|
||||||
|
:CDATA,
|
||||||
|
:PLAINTEXT
|
||||||
|
]
|
||||||
|
|
||||||
|
SCOPING_ELEMENTS = %w[
|
||||||
|
button
|
||||||
|
caption
|
||||||
|
html
|
||||||
|
marquee
|
||||||
|
object
|
||||||
|
table
|
||||||
|
td
|
||||||
|
th
|
||||||
|
]
|
||||||
|
|
||||||
|
FORMATTING_ELEMENTS = %w[
|
||||||
|
a
|
||||||
|
b
|
||||||
|
big
|
||||||
|
em
|
||||||
|
font
|
||||||
|
i
|
||||||
|
nobr
|
||||||
|
s
|
||||||
|
small
|
||||||
|
strike
|
||||||
|
strong
|
||||||
|
tt
|
||||||
|
u
|
||||||
|
]
|
||||||
|
|
||||||
|
SPECIAL_ELEMENTS = %w[
|
||||||
|
address
|
||||||
|
area
|
||||||
|
base
|
||||||
|
basefont
|
||||||
|
bgsound
|
||||||
|
blockquote
|
||||||
|
body
|
||||||
|
br
|
||||||
|
center
|
||||||
|
col
|
||||||
|
colgroup
|
||||||
|
dd
|
||||||
|
dir
|
||||||
|
div
|
||||||
|
dl
|
||||||
|
dt
|
||||||
|
embed
|
||||||
|
fieldset
|
||||||
|
form
|
||||||
|
frame
|
||||||
|
frameset
|
||||||
|
h1
|
||||||
|
h2
|
||||||
|
h3
|
||||||
|
h4
|
||||||
|
h5
|
||||||
|
h6
|
||||||
|
head
|
||||||
|
hr
|
||||||
|
iframe
|
||||||
|
image
|
||||||
|
img
|
||||||
|
input
|
||||||
|
isindex
|
||||||
|
li
|
||||||
|
link
|
||||||
|
listing
|
||||||
|
menu
|
||||||
|
meta
|
||||||
|
noembed
|
||||||
|
noframes
|
||||||
|
noscript
|
||||||
|
ol
|
||||||
|
optgroup
|
||||||
|
option
|
||||||
|
p
|
||||||
|
param
|
||||||
|
plaintext
|
||||||
|
pre
|
||||||
|
script
|
||||||
|
select
|
||||||
|
spacer
|
||||||
|
style
|
||||||
|
tbody
|
||||||
|
textarea
|
||||||
|
tfoot
|
||||||
|
thead
|
||||||
|
title
|
||||||
|
tr
|
||||||
|
ul
|
||||||
|
wbr
|
||||||
|
]
|
||||||
|
|
||||||
|
SPACE_CHARACTERS = %W[
|
||||||
|
\t
|
||||||
|
\n
|
||||||
|
\x0B
|
||||||
|
\x0C
|
||||||
|
\x20
|
||||||
|
\r
|
||||||
|
]
|
||||||
|
|
||||||
|
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||||
|
table
|
||||||
|
tbody
|
||||||
|
tfoot
|
||||||
|
thead
|
||||||
|
tr
|
||||||
|
]
|
||||||
|
|
||||||
|
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||||
|
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||||
|
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||||
|
DIGITS = '0'..'9'
|
||||||
|
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||||
|
|
||||||
|
# Heading elements need to be ordered
|
||||||
|
HEADING_ELEMENTS = %w[
|
||||||
|
h1
|
||||||
|
h2
|
||||||
|
h3
|
||||||
|
h4
|
||||||
|
h5
|
||||||
|
h6
|
||||||
|
]
|
||||||
|
|
||||||
|
# XXX What about event-source and command?
|
||||||
|
VOID_ELEMENTS = %w[
|
||||||
|
base
|
||||||
|
link
|
||||||
|
meta
|
||||||
|
hr
|
||||||
|
br
|
||||||
|
img
|
||||||
|
embed
|
||||||
|
param
|
||||||
|
area
|
||||||
|
col
|
||||||
|
input
|
||||||
|
]
|
||||||
|
|
||||||
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||||
|
ENTITIES_WINDOWS1252 = [
|
||||||
|
8364, # 0x80 0x20AC EURO SIGN
|
||||||
|
65533, # 0x81 UNDEFINED
|
||||||
|
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||||
|
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||||
|
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||||
|
8224, # 0x86 0x2020 DAGGER
|
||||||
|
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||||
|
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||||
|
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
65533, # 0x8D UNDEFINED
|
||||||
|
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
65533, # 0x8F UNDEFINED
|
||||||
|
65533, # 0x90 UNDEFINED
|
||||||
|
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||||
|
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||||
|
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||||
|
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||||
|
8226, # 0x95 0x2022 BULLET
|
||||||
|
8211, # 0x96 0x2013 EN DASH
|
||||||
|
8212, # 0x97 0x2014 EM DASH
|
||||||
|
732, # 0x98 0x02DC SMALL TILDE
|
||||||
|
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||||
|
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||||
|
65533, # 0x9D UNDEFINED
|
||||||
|
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
]
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
# Build the one-character string for Unicode code point +codepoint+
# (Array#pack with the 'U' directive). Used to fill the ENTITIES table.
def self.U(codepoint)
  [codepoint].pack('U*')
end
|
||||||
|
|
||||||
|
public
|
||||||
|
|
||||||
|
ENTITIES = {
|
||||||
|
"AElig" => U(0xC6),
|
||||||
|
"Aacute" => U(0xC1),
|
||||||
|
"Acirc" => U(0xC2),
|
||||||
|
"Agrave" => U(0xC0),
|
||||||
|
"Alpha" => U(0x0391),
|
||||||
|
"Aring" => U(0xC5),
|
||||||
|
"Atilde" => U(0xC3),
|
||||||
|
"Auml" => U(0xC4),
|
||||||
|
"Beta" => U(0x0392),
|
||||||
|
"Ccedil" => U(0xC7),
|
||||||
|
"Chi" => U(0x03A7),
|
||||||
|
"Dagger" => U(0x2021),
|
||||||
|
"Delta" => U(0x0394),
|
||||||
|
"ETH" => U(0xD0),
|
||||||
|
"Eacute" => U(0xC9),
|
||||||
|
"Ecirc" => U(0xCA),
|
||||||
|
"Egrave" => U(0xC8),
|
||||||
|
"Epsilon" => U(0x0395),
|
||||||
|
"Eta" => U(0x0397),
|
||||||
|
"Euml" => U(0xCB),
|
||||||
|
"Gamma" => U(0x0393),
|
||||||
|
"Iacute" => U(0xCD),
|
||||||
|
"Icirc" => U(0xCE),
|
||||||
|
"Igrave" => U(0xCC),
|
||||||
|
"Iota" => U(0x0399),
|
||||||
|
"Iuml" => U(0xCF),
|
||||||
|
"Kappa" => U(0x039A),
|
||||||
|
"Lambda" => U(0x039B),
|
||||||
|
"Mu" => U(0x039C),
|
||||||
|
"Ntilde" => U(0xD1),
|
||||||
|
"Nu" => U(0x039D),
|
||||||
|
"OElig" => U(0x0152),
|
||||||
|
"Oacute" => U(0xD3),
|
||||||
|
"Ocirc" => U(0xD4),
|
||||||
|
"Ograve" => U(0xD2),
|
||||||
|
"Omega" => U(0x03A9),
|
||||||
|
"Omicron" => U(0x039F),
|
||||||
|
"Oslash" => U(0xD8),
|
||||||
|
"Otilde" => U(0xD5),
|
||||||
|
"Ouml" => U(0xD6),
|
||||||
|
"Phi" => U(0x03A6),
|
||||||
|
"Pi" => U(0x03A0),
|
||||||
|
"Prime" => U(0x2033),
|
||||||
|
"Psi" => U(0x03A8),
|
||||||
|
"Rho" => U(0x03A1),
|
||||||
|
"Scaron" => U(0x0160),
|
||||||
|
"Sigma" => U(0x03A3),
|
||||||
|
"THORN" => U(0xDE),
|
||||||
|
"Tau" => U(0x03A4),
|
||||||
|
"Theta" => U(0x0398),
|
||||||
|
"Uacute" => U(0xDA),
|
||||||
|
"Ucirc" => U(0xDB),
|
||||||
|
"Ugrave" => U(0xD9),
|
||||||
|
"Upsilon" => U(0x03A5),
|
||||||
|
"Uuml" => U(0xDC),
|
||||||
|
"Xi" => U(0x039E),
|
||||||
|
"Yacute" => U(0xDD),
|
||||||
|
"Yuml" => U(0x0178),
|
||||||
|
"Zeta" => U(0x0396),
|
||||||
|
"aacute" => U(0xE1),
|
||||||
|
"acirc" => U(0xE2),
|
||||||
|
"acute" => U(0xB4),
|
||||||
|
"aelig" => U(0xE6),
|
||||||
|
"agrave" => U(0xE0),
|
||||||
|
"alefsym" => U(0x2135),
|
||||||
|
"alpha" => U(0x03B1),
|
||||||
|
"amp" => U(0x26),
|
||||||
|
"AMP" => U(0x26),
|
||||||
|
"and" => U(0x2227),
|
||||||
|
"ang" => U(0x2220),
|
||||||
|
"apos" => U(0x27),
|
||||||
|
"aring" => U(0xE5),
|
||||||
|
"asymp" => U(0x2248),
|
||||||
|
"atilde" => U(0xE3),
|
||||||
|
"auml" => U(0xE4),
|
||||||
|
"bdquo" => U(0x201E),
|
||||||
|
"beta" => U(0x03B2),
|
||||||
|
"brvbar" => U(0xA6),
|
||||||
|
"bull" => U(0x2022),
|
||||||
|
"cap" => U(0x2229),
|
||||||
|
"ccedil" => U(0xE7),
|
||||||
|
"cedil" => U(0xB8),
|
||||||
|
"cent" => U(0xA2),
|
||||||
|
"chi" => U(0x03C7),
|
||||||
|
"circ" => U(0x02C6),
|
||||||
|
"clubs" => U(0x2663),
|
||||||
|
"cong" => U(0x2245),
|
||||||
|
"copy" => U(0xA9),
|
||||||
|
"COPY" => U(0xA9),
|
||||||
|
"crarr" => U(0x21B5),
|
||||||
|
"cup" => U(0x222A),
|
||||||
|
"curren" => U(0xA4),
|
||||||
|
"dArr" => U(0x21D3),
|
||||||
|
"dagger" => U(0x2020),
|
||||||
|
"darr" => U(0x2193),
|
||||||
|
"deg" => U(0xB0),
|
||||||
|
"delta" => U(0x03B4),
|
||||||
|
"diams" => U(0x2666),
|
||||||
|
"divide" => U(0xF7),
|
||||||
|
"eacute" => U(0xE9),
|
||||||
|
"ecirc" => U(0xEA),
|
||||||
|
"egrave" => U(0xE8),
|
||||||
|
"empty" => U(0x2205),
|
||||||
|
"emsp" => U(0x2003),
|
||||||
|
"ensp" => U(0x2002),
|
||||||
|
"epsilon" => U(0x03B5),
|
||||||
|
"equiv" => U(0x2261),
|
||||||
|
"eta" => U(0x03B7),
|
||||||
|
"eth" => U(0xF0),
|
||||||
|
"euml" => U(0xEB),
|
||||||
|
"euro" => U(0x20AC),
|
||||||
|
"exist" => U(0x2203),
|
||||||
|
"fnof" => U(0x0192),
|
||||||
|
"forall" => U(0x2200),
|
||||||
|
"frac12" => U(0xBD),
|
||||||
|
"frac14" => U(0xBC),
|
||||||
|
"frac34" => U(0xBE),
|
||||||
|
"frasl" => U(0x2044),
|
||||||
|
"gamma" => U(0x03B3),
|
||||||
|
"ge" => U(0x2265),
|
||||||
|
"gt" => U(0x3E),
|
||||||
|
"GT" => U(0x3E),
|
||||||
|
"hArr" => U(0x21D4),
|
||||||
|
"harr" => U(0x2194),
|
||||||
|
"hearts" => U(0x2665),
|
||||||
|
"hellip" => U(0x2026),
|
||||||
|
"iacute" => U(0xED),
|
||||||
|
"icirc" => U(0xEE),
|
||||||
|
"iexcl" => U(0xA1),
|
||||||
|
"igrave" => U(0xEC),
|
||||||
|
"image" => U(0x2111),
|
||||||
|
"infin" => U(0x221E),
|
||||||
|
"int" => U(0x222B),
|
||||||
|
"iota" => U(0x03B9),
|
||||||
|
"iquest" => U(0xBF),
|
||||||
|
"isin" => U(0x2208),
|
||||||
|
"iuml" => U(0xEF),
|
||||||
|
"kappa" => U(0x03BA),
|
||||||
|
"lArr" => U(0x21D0),
|
||||||
|
"lambda" => U(0x03BB),
|
||||||
|
"lang" => U(0x2329),
|
||||||
|
"laquo" => U(0xAB),
|
||||||
|
"larr" => U(0x2190),
|
||||||
|
"lceil" => U(0x2308),
|
||||||
|
"ldquo" => U(0x201C),
|
||||||
|
"le" => U(0x2264),
|
||||||
|
"lfloor" => U(0x230A),
|
||||||
|
"lowast" => U(0x2217),
|
||||||
|
"loz" => U(0x25CA),
|
||||||
|
"lrm" => U(0x200E),
|
||||||
|
"lsaquo" => U(0x2039),
|
||||||
|
"lsquo" => U(0x2018),
|
||||||
|
"lt" => U(0x3C),
|
||||||
|
"LT" => U(0x3C),
|
||||||
|
"macr" => U(0xAF),
|
||||||
|
"mdash" => U(0x2014),
|
||||||
|
"micro" => U(0xB5),
|
||||||
|
"middot" => U(0xB7),
|
||||||
|
"minus" => U(0x2212),
|
||||||
|
"mu" => U(0x03BC),
|
||||||
|
"nabla" => U(0x2207),
|
||||||
|
"nbsp" => U(0xA0),
|
||||||
|
"ndash" => U(0x2013),
|
||||||
|
"ne" => U(0x2260),
|
||||||
|
"ni" => U(0x220B),
|
||||||
|
"not" => U(0xAC),
|
||||||
|
"notin" => U(0x2209),
|
||||||
|
"nsub" => U(0x2284),
|
||||||
|
"ntilde" => U(0xF1),
|
||||||
|
"nu" => U(0x03BD),
|
||||||
|
"oacute" => U(0xF3),
|
||||||
|
"ocirc" => U(0xF4),
|
||||||
|
"oelig" => U(0x0153),
|
||||||
|
"ograve" => U(0xF2),
|
||||||
|
"oline" => U(0x203E),
|
||||||
|
"omega" => U(0x03C9),
|
||||||
|
"omicron" => U(0x03BF),
|
||||||
|
"oplus" => U(0x2295),
|
||||||
|
"or" => U(0x2228),
|
||||||
|
"ordf" => U(0xAA),
|
||||||
|
"ordm" => U(0xBA),
|
||||||
|
"oslash" => U(0xF8),
|
||||||
|
"otilde" => U(0xF5),
|
||||||
|
"otimes" => U(0x2297),
|
||||||
|
"ouml" => U(0xF6),
|
||||||
|
"para" => U(0xB6),
|
||||||
|
"part" => U(0x2202),
|
||||||
|
"permil" => U(0x2030),
|
||||||
|
"perp" => U(0x22A5),
|
||||||
|
"phi" => U(0x03C6),
|
||||||
|
"pi" => U(0x03C0),
|
||||||
|
"piv" => U(0x03D6),
|
||||||
|
"plusmn" => U(0xB1),
|
||||||
|
"pound" => U(0xA3),
|
||||||
|
"prime" => U(0x2032),
|
||||||
|
"prod" => U(0x220F),
|
||||||
|
"prop" => U(0x221D),
|
||||||
|
"psi" => U(0x03C8),
|
||||||
|
"quot" => U(0x22),
|
||||||
|
"QUOT" => U(0x22),
|
||||||
|
"rArr" => U(0x21D2),
|
||||||
|
"radic" => U(0x221A),
|
||||||
|
"rang" => U(0x232A),
|
||||||
|
"raquo" => U(0xBB),
|
||||||
|
"rarr" => U(0x2192),
|
||||||
|
"rceil" => U(0x2309),
|
||||||
|
"rdquo" => U(0x201D),
|
||||||
|
"real" => U(0x211C),
|
||||||
|
"reg" => U(0xAE),
|
||||||
|
"REG" => U(0xAE),
|
||||||
|
"rfloor" => U(0x230B),
|
||||||
|
"rho" => U(0x03C1),
|
||||||
|
"rlm" => U(0x200F),
|
||||||
|
"rsaquo" => U(0x203A),
|
||||||
|
"rsquo" => U(0x2019),
|
||||||
|
"sbquo" => U(0x201A),
|
||||||
|
"scaron" => U(0x0161),
|
||||||
|
"sdot" => U(0x22C5),
|
||||||
|
"sect" => U(0xA7),
|
||||||
|
"shy" => U(0xAD),
|
||||||
|
"sigma" => U(0x03C3),
|
||||||
|
"sigmaf" => U(0x03C2),
|
||||||
|
"sim" => U(0x223C),
|
||||||
|
"spades" => U(0x2660),
|
||||||
|
"sub" => U(0x2282),
|
||||||
|
"sube" => U(0x2286),
|
||||||
|
"sum" => U(0x2211),
|
||||||
|
"sup" => U(0x2283),
|
||||||
|
"sup1" => U(0xB9),
|
||||||
|
"sup2" => U(0xB2),
|
||||||
|
"sup3" => U(0xB3),
|
||||||
|
"supe" => U(0x2287),
|
||||||
|
"szlig" => U(0xDF),
|
||||||
|
"tau" => U(0x03C4),
|
||||||
|
"there4" => U(0x2234),
|
||||||
|
"theta" => U(0x03B8),
|
||||||
|
"thetasym" => U(0x03D1),
|
||||||
|
"thinsp" => U(0x2009),
|
||||||
|
"thorn" => U(0xFE),
|
||||||
|
"tilde" => U(0x02DC),
|
||||||
|
"times" => U(0xD7),
|
||||||
|
"trade" => U(0x2122),
|
||||||
|
"uArr" => U(0x21D1),
|
||||||
|
"uacute" => U(0xFA),
|
||||||
|
"uarr" => U(0x2191),
|
||||||
|
"ucirc" => U(0xFB),
|
||||||
|
"ugrave" => U(0xF9),
|
||||||
|
"uml" => U(0xA8),
|
||||||
|
"upsih" => U(0x03D2),
|
||||||
|
"upsilon" => U(0x03C5),
|
||||||
|
"uuml" => U(0xFC),
|
||||||
|
"weierp" => U(0x2118),
|
||||||
|
"xi" => U(0x03BE),
|
||||||
|
"yacute" => U(0xFD),
|
||||||
|
"yen" => U(0xA5),
|
||||||
|
"yuml" => U(0xFF),
|
||||||
|
"zeta" => U(0x03B6),
|
||||||
|
"zwj" => U(0x200D),
|
||||||
|
"zwnj" => U(0x200C)
|
||||||
|
}
|
||||||
|
|
||||||
|
ENCODINGS = %w[
|
||||||
|
ansi_x3.4-1968
|
||||||
|
iso-ir-6
|
||||||
|
ansi_x3.4-1986
|
||||||
|
iso_646.irv:1991
|
||||||
|
ascii
|
||||||
|
iso646-us
|
||||||
|
us-ascii
|
||||||
|
us
|
||||||
|
ibm367
|
||||||
|
cp367
|
||||||
|
csascii
|
||||||
|
ks_c_5601-1987
|
||||||
|
korean
|
||||||
|
iso-2022-kr
|
||||||
|
csiso2022kr
|
||||||
|
euc-kr
|
||||||
|
iso-2022-jp
|
||||||
|
csiso2022jp
|
||||||
|
iso-2022-jp-2
|
||||||
|
iso-ir-58
|
||||||
|
chinese
|
||||||
|
csiso58gb231280
|
||||||
|
iso_8859-1:1987
|
||||||
|
iso-ir-100
|
||||||
|
iso_8859-1
|
||||||
|
iso-8859-1
|
||||||
|
latin1
|
||||||
|
l1
|
||||||
|
ibm819
|
||||||
|
cp819
|
||||||
|
csisolatin1
|
||||||
|
iso_8859-2:1987
|
||||||
|
iso-ir-101
|
||||||
|
iso_8859-2
|
||||||
|
iso-8859-2
|
||||||
|
latin2
|
||||||
|
l2
|
||||||
|
csisolatin2
|
||||||
|
iso_8859-3:1988
|
||||||
|
iso-ir-109
|
||||||
|
iso_8859-3
|
||||||
|
iso-8859-3
|
||||||
|
latin3
|
||||||
|
l3
|
||||||
|
csisolatin3
|
||||||
|
iso_8859-4:1988
|
||||||
|
iso-ir-110
|
||||||
|
iso_8859-4
|
||||||
|
iso-8859-4
|
||||||
|
latin4
|
||||||
|
l4
|
||||||
|
csisolatin4
|
||||||
|
iso_8859-6:1987
|
||||||
|
iso-ir-127
|
||||||
|
iso_8859-6
|
||||||
|
iso-8859-6
|
||||||
|
ecma-114
|
||||||
|
asmo-708
|
||||||
|
arabic
|
||||||
|
csisolatinarabic
|
||||||
|
iso_8859-7:1987
|
||||||
|
iso-ir-126
|
||||||
|
iso_8859-7
|
||||||
|
iso-8859-7
|
||||||
|
elot_928
|
||||||
|
ecma-118
|
||||||
|
greek
|
||||||
|
greek8
|
||||||
|
csisolatingreek
|
||||||
|
iso_8859-8:1988
|
||||||
|
iso-ir-138
|
||||||
|
iso_8859-8
|
||||||
|
iso-8859-8
|
||||||
|
hebrew
|
||||||
|
csisolatinhebrew
|
||||||
|
iso_8859-5:1988
|
||||||
|
iso-ir-144
|
||||||
|
iso_8859-5
|
||||||
|
iso-8859-5
|
||||||
|
cyrillic
|
||||||
|
csisolatincyrillic
|
||||||
|
iso_8859-9:1989
|
||||||
|
iso-ir-148
|
||||||
|
iso_8859-9
|
||||||
|
iso-8859-9
|
||||||
|
latin5
|
||||||
|
l5
|
||||||
|
csisolatin5
|
||||||
|
iso-8859-10
|
||||||
|
iso-ir-157
|
||||||
|
l6
|
||||||
|
iso_8859-10:1992
|
||||||
|
csisolatin6
|
||||||
|
latin6
|
||||||
|
hp-roman8
|
||||||
|
roman8
|
||||||
|
r8
|
||||||
|
ibm037
|
||||||
|
cp037
|
||||||
|
csibm037
|
||||||
|
ibm424
|
||||||
|
cp424
|
||||||
|
csibm424
|
||||||
|
ibm437
|
||||||
|
cp437
|
||||||
|
437
|
||||||
|
cspc8codepage437
|
||||||
|
ibm500
|
||||||
|
cp500
|
||||||
|
csibm500
|
||||||
|
ibm775
|
||||||
|
cp775
|
||||||
|
cspc775baltic
|
||||||
|
ibm850
|
||||||
|
cp850
|
||||||
|
850
|
||||||
|
cspc850multilingual
|
||||||
|
ibm852
|
||||||
|
cp852
|
||||||
|
852
|
||||||
|
cspcp852
|
||||||
|
ibm855
|
||||||
|
cp855
|
||||||
|
855
|
||||||
|
csibm855
|
||||||
|
ibm857
|
||||||
|
cp857
|
||||||
|
857
|
||||||
|
csibm857
|
||||||
|
ibm860
|
||||||
|
cp860
|
||||||
|
860
|
||||||
|
csibm860
|
||||||
|
ibm861
|
||||||
|
cp861
|
||||||
|
861
|
||||||
|
cp-is
|
||||||
|
csibm861
|
||||||
|
ibm862
|
||||||
|
cp862
|
||||||
|
862
|
||||||
|
cspc862latinhebrew
|
||||||
|
ibm863
|
||||||
|
cp863
|
||||||
|
863
|
||||||
|
csibm863
|
||||||
|
ibm864
|
||||||
|
cp864
|
||||||
|
csibm864
|
||||||
|
ibm865
|
||||||
|
cp865
|
||||||
|
865
|
||||||
|
csibm865
|
||||||
|
ibm866
|
||||||
|
cp866
|
||||||
|
866
|
||||||
|
csibm866
|
||||||
|
ibm869
|
||||||
|
cp869
|
||||||
|
869
|
||||||
|
cp-gr
|
||||||
|
csibm869
|
||||||
|
ibm1026
|
||||||
|
cp1026
|
||||||
|
csibm1026
|
||||||
|
koi8-r
|
||||||
|
cskoi8r
|
||||||
|
koi8-u
|
||||||
|
big5-hkscs
|
||||||
|
ptcp154
|
||||||
|
csptcp154
|
||||||
|
pt154
|
||||||
|
cp154
|
||||||
|
utf-7
|
||||||
|
utf-16be
|
||||||
|
utf-16le
|
||||||
|
utf-16
|
||||||
|
utf-8
|
||||||
|
iso-8859-13
|
||||||
|
iso-8859-14
|
||||||
|
iso-ir-199
|
||||||
|
iso_8859-14:1998
|
||||||
|
iso_8859-14
|
||||||
|
latin8
|
||||||
|
iso-celtic
|
||||||
|
l8
|
||||||
|
iso-8859-15
|
||||||
|
iso_8859-15
|
||||||
|
iso-8859-16
|
||||||
|
iso-ir-226
|
||||||
|
iso_8859-16:2001
|
||||||
|
iso_8859-16
|
||||||
|
latin10
|
||||||
|
l10
|
||||||
|
gbk
|
||||||
|
cp936
|
||||||
|
ms936
|
||||||
|
gb18030
|
||||||
|
shift_jis
|
||||||
|
ms_kanji
|
||||||
|
csshiftjis
|
||||||
|
euc-jp
|
||||||
|
gb2312
|
||||||
|
big5
|
||||||
|
csbig5
|
||||||
|
windows-1250
|
||||||
|
windows-1251
|
||||||
|
windows-1252
|
||||||
|
windows-1253
|
||||||
|
windows-1254
|
||||||
|
windows-1255
|
||||||
|
windows-1256
|
||||||
|
windows-1257
|
||||||
|
windows-1258
|
||||||
|
tis-620
|
||||||
|
hz-gb-2312
|
||||||
|
]
|
||||||
|
|
||||||
|
end
|
2020
vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
vendored
Normal file
2020
vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
vendored
Normal file
File diff suppressed because it is too large
Load diff
549
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
Executable file
549
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
Executable file
|
@ -0,0 +1,549 @@
|
||||||
|
require 'stringio'
|
||||||
|
require 'html5lib/constants'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
|
# This class takes care of character encoding and removing or replacing
|
||||||
|
# incorrect byte-sequences and also provides column and line tracking.
|
||||||
|
|
||||||
|
class HTMLInputStream
|
||||||
|
|
||||||
|
attr_accessor :queue, :charEncoding
|
||||||
|
|
||||||
|
# Initialises the HTMLInputStream.
|
||||||
|
#
|
||||||
|
# HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
|
# for use by the HTML5Lib.
|
||||||
|
#
|
||||||
|
# source can be either a file-object, local filename or a string.
|
||||||
|
#
|
||||||
|
# The optional encoding parameter must be a string that indicates
|
||||||
|
# the encoding. If specified, that encoding will be used,
|
||||||
|
# regardless of any BOM or later declaration (such as in a meta
|
||||||
|
# element)
|
||||||
|
#
|
||||||
|
# parseMeta - Look for a <meta> element containing encoding information
|
||||||
|
|
||||||
|
def initialize(source, options = {})
  # Defaults; any of them can be overridden through +options+, whose keys are
  # turned into instance variables of the same name.
  @encoding = nil
  @parseMeta = true
  @chardet = true
  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # List of where new lines occur (filled lazily by #position)
  @newLines = []

  # Raw Stream
  @rawStream = openStream(source)

  # Number of bytes to use when looking for a meta element with
  # encoding information
  @NUM_BYTES_META = 512
  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # Detect encoding iff no explicit "transport level" encoding is supplied
  explicit = !@encoding.nil? && HTML5lib.isValidEncoding(@encoding)
  @charEncoding = explicit ? @encoding : detectEncoding

  # Read bytes from stream decoding them into Unicode
  uString = @rawStream.read
  if @charEncoding != 'utf-8'
    begin
      require 'iconv'
      # NOTE(review): the source encoding passed here is @encoding, not
      # @charEncoding, so when the encoding was detected (@encoding is nil)
      # the conversion presumably raises and is swallowed by the rescue,
      # leaving the bytes untouched. Confirm whether that is intended before
      # changing it: callers may rely on UTF-8 input passing through as-is.
      uString = Iconv.iconv('utf-8', @encoding, uString)[0]
    rescue
      # Best effort: keep the undecoded bytes.
    end
  end

  # Normalize newlines and null characters
  uString.gsub!(/\r\n?/, "\n")
  uString.gsub!("\x00", [0xFFFD].pack('U'))

  # The normalized string is the data stream the tokenizer reads from
  @dataStream = uString

  # Characters pushed back by #charsUntil are served from here first
  @queue = []

  # Reset position in the data stream to read from
  reset
end
|
||||||
|
|
||||||
|
# Produces a file object from source.
|
||||||
|
#
|
||||||
|
# source can be either a file object, local filename or a string.
|
||||||
|
def openStream(source)
  # Anything that can be #read is used as-is; everything else is treated as a
  # string and wrapped in a StringIO. The result is remembered in @stream.
  @stream = source.respond_to?(:read) ? source : StringIO.new(source)
end
|
||||||
|
|
||||||
|
# Work out the character encoding of @rawStream and return its name.
# Sources are tried in order: byte order mark, <meta> declaration (when
# @parseMeta is set), the chardet gem (when @chardet is set and the gem can be
# loaded), and finally @DEFAULT_ENCODING.
def detectEncoding
  # First look for a BOM; this also reads past the BOM if present
  encoding = detectBOM

  # No BOM: look for meta elements with encoding information
  encoding = detectEncodingMeta if encoding.nil? && @parseMeta

  # Guess with chardet, if available
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      buffer = @rawStream.read
      encoding = UniversalDetector::chardet(buffer)['encoding']
      # The stream was consumed for the guess; re-open it on the buffered data
      @rawStream = openStream(buffer)
    rescue LoadError
      # chardet is optional
    end
  end

  # If all else fails use the default encoding
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute for equivalent encodings
  encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
  encodingSub.fetch(encoding.downcase, encoding)
end
|
||||||
|
|
||||||
|
# Attempts to detect a BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil.
#
# Side effect: @rawStream is left positioned just past the BOM when one was
# found, and at the start of the stream otherwise.
def detectBOM
  # Signatures are kept as byte arrays and compared against the unpacked bytes
  # of the stream, so the check does not depend on how the string read from
  # the stream is tagged (String equality is encoding-sensitive for non-ASCII
  # bytes on Ruby 1.9+).
  # Order matters: UTF-32 must be tested before UTF-16 because the UTF-32-LE
  # BOM begins with the UTF-16-LE BOM.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32-le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32-be'],
    [[0xFF, 0xFE],             'utf-16-le'],
    [[0xFE, 0xFF],             'utf-16-be']
  ]

  # Go to beginning of file and read in 4 bytes
  @rawStream.seek(0)
  head = @rawStream.read(4)
  return nil unless head

  bytes = head.unpack('C*')
  bom, encoding = signatures.find { |signature, _name| bytes[0, signature.length] == signature }

  #AT - move this to the caller?
  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  @rawStream.seek(bom ? bom.length : 0)

  encoding
end
|
||||||
|
|
||||||
|
# Report the encoding declared by the meta element
|
||||||
|
def detectEncodingMeta
  # Only the first @NUM_BYTES_META bytes are handed to the EncodingParser.
  head = @rawStream.read(@NUM_BYTES_META)
  parser = EncodingParser.new(head)
  # Rewind so the stream can be read again from the start.
  @rawStream.seek(0)
  parser.getEncoding
end
|
||||||
|
|
||||||
|
def determineNewLines
  # Records, in @newLines, offset 0 followed by the offset of every "\n" in
  # @dataStream, so that #position can map an offset to a line number.
  @newLines << 0
  (0...@dataStream.length).each do |offset|
    next unless @dataStream[offset] == ?\n
    @newLines << offset
  end
end
|
||||||
|
|
||||||
|
# Returns (line, col) of the current position in the stream.
|
||||||
|
def position
  # Generate list of new lines first time around
  determineNewLines if @newLines.empty?

  offset = @tell
  # The line number is the count of recorded line starts that lie strictly
  # before the current offset (the list is scanned from the front and the
  # scan stops at the first entry that is not before it).
  line = @newLines.take_while { |start| start < offset }.length
  col = offset - @newLines[line - 1] - 1
  [line, col]
end
|
||||||
|
|
||||||
|
# Resets the position in the stream back to the start.
|
||||||
|
# Resets the read position in the stream back to the start (@tell = 0).
def reset
  @tell = 0
end
|
||||||
|
|
||||||
|
# Read one character from the stream or queue if available. Return
|
||||||
|
# EOF when EOF is reached.
|
||||||
|
# Read one character from the queue if it is non-empty, otherwise from
# @dataStream. Returns :EOF once @dataStream is exhausted.
#
# @tell is advanced on every read from @dataStream, including the reads
# that hit the end of the data.
def char
  return @queue.shift unless @queue.empty?

  @tell += 1
  byte = @dataStream[@tell - 1]
  # Indexing past the end yields nil; test for that explicitly instead of
  # rescuing the NoMethodError that nil.chr would raise.
  byte.nil? ? :EOF : byte.chr
end
|
||||||
|
|
||||||
|
# Returns a string of characters from the stream up to but not
|
||||||
|
# including any character in characters or EOF. characters can be
|
||||||
|
# any container that responds to include?.
|
||||||
|
# Returns a string of characters from the stream up to but not including
# the first character found in +characters+, or EOF.
#
# characters - any container that responds to include?
# opposite   - when true the test is inverted: consume while the character
#              IS in +characters+ and stop at the first one that is not.
#
# The character that stopped the scan (possibly :EOF) is put back at the
# front of @queue, so the next read sees it again.
def charsUntil(characters, opposite = false)
  consumed = [char]

  unless consumed.first == :EOF
    while characters.include?(consumed.last) == opposite
      if @queue.empty?
        # Nothing queued: read straight from the data
        byte = @dataStream[@tell]
        if byte.nil?
          # Ran off the end; @tell is deliberately left untouched here
          consumed.push(:EOF)
          break
        end
        consumed.push(byte.chr)
        @tell += 1
      else
        # Queued characters take priority over the data
        consumed.push(@queue.shift)
        break if consumed.last == :EOF
      end
    end
  end

  # Put the character stopped on back to the front of the queue
  # from where it came.
  @queue.unshift(consumed.pop)
  consumed.join('')
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# String-like object with an associated position and various extra methods
|
||||||
|
# If the position is ever greater than the string length then an exception is raised
|
||||||
|
class EncodingBytes < String

  # Index of the byte currently being examined; starts at -1, i.e. before
  # the first byte.
  attr_accessor :position

  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the position one byte at a time, yielding the byte at each new
  # position. Stops at the last byte, or as soon as EOF is raised (either
  # here or from inside the block).
  def each
    # Stop at the final byte: advancing once more would yield a position
    # that currentByte itself treats as EOF.
    while @position < length - 1
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the byte at the current position as a one-character string.
  # Raises EOF when the position is at or past the end of the data.
  def currentByte
    raise EOF if @position >= length
    return self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars = SPACE_CHARACTERS)
    @position += 1 while chars.include?(currentByte)
  end

  # Look for a sequence of bytes at the current position. If the bytes are
  # found return true and advance the position to the byte after the match.
  # Otherwise return false and leave the position alone.
  #
  # lower - when true the data is lower-cased before comparing, so +bytes+
  #         should be given in lower case.
  def matchBytes(bytes, lower = false)
    data = self[position ... position + bytes.length]
    # Slicing from beyond the end yields nil: nothing there to match
    return false if data.nil?
    data = data.downcase if lower
    matched = (data == bytes)
    @position += bytes.length if matched
    return matched
  end

  # Look for the next sequence of bytes matching a given sequence. If
  # a match is found advance the position to the last byte of the match
  # and return true. Raises EOF when there is no match.
  def jumpTo(bytes)
    remainder = self[position .. -1]
    offset = remainder && remainder.index(bytes)
    raise EOF unless offset
    @position += (offset + bytes.length - 1)
    return true
  end

  # Move the pointer so it points to the next byte in a set of possible
  # bytes. Raises EOF (via currentByte) if none is found.
  def findNext(byteList)
    @position += 1 until byteList.include?(currentByte)
  end
end
|
||||||
|
|
||||||
|
# Mini parser for detecting character encoding from meta elements
|
||||||
|
class EncodingParser

  # data - the data to work on for encoding detection (converted with to_s)
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Byte prefixes looked for at the current position, tried in this order,
  # with the handler to call for each; longer prefixes come before the
  # shorter ones they start with.
  @@method_dispatch = [
    ['<!--', :handleComment],
    ['<meta', :handleMeta],
    ['</', :handlePossibleEndTag],
    ['<!', :handleOther],
    ['<?', :handleOther],
    ['<', :handlePossibleStartTag]
  ]

  # Scans the data and returns the first valid encoding declared by a meta
  # element (whitespace-stripped), or nil if none is found.
  def getEncoding
    @data.each do |byte|
      # matchBytes advances past the prefix on a hit, so the first matching
      # entry is the only one whose handler runs for this position.
      entry = @@method_dispatch.find { |key, handler| @data.matchBytes(key, true) }
      next if entry.nil?
      # A handler returns false once an encoding has been found
      break unless send(entry[1])
    end
    @encoding = @encoding.strip unless @encoding.nil?
    return @encoding
  end

  # Skip over comments
  def handleComment
    return @data.jumpTo('-->')
  end

  # Examines the attributes of a meta element. Returns false (stop parsing)
  # once a valid encoding was found, true otherwise.
  def handleMeta
    # "<meta" not followed by a space is not a meta element: keep going
    return true unless SPACE_CHARACTERS.include?(@data.currentByte)

    # We have a valid meta element we want to search for attributes
    while true
      # Try to find the next attribute after the current position
      attr = getAttribute

      return true if attr.nil?

      if attr[0] == 'charset'
        tentativeEncoding = attr[1]
        if HTML5lib.isValidEncoding(tentativeEncoding)
          @encoding = tentativeEncoding
          return false
        end
      elsif attr[0] == 'content'
        contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentativeEncoding = contentParser.parse
        if HTML5lib.isValidEncoding(tentativeEncoding)
          @encoding = tentativeEncoding
          return false
        end
      end
    end
  end

  def handlePossibleStartTag
    return handlePossibleTag(false)
  end

  def handlePossibleEndTag
    @data.position += 1
    return handlePossibleTag(true)
  end

  # Skips over a tag and its attributes. Always returns true.
  def handlePossibleTag(endTag)
    unless ASCII_LETTERS.include?(@data.currentByte)
      # If the next byte is not an ascii letter either ignore this
      # fragment (possible start tag case) or treat it according to
      # handleOther
      if endTag
        @data.position -= 1
        handleOther
      end
      return true
    end

    @data.findNext(SPACE_CHARACTERS + ['<', '>'])

    if @data.currentByte == '<'
      # return to the first step in the overall "two step" algorithm
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      loop { break if getAttribute.nil? }
    end
    return true
  end

  def handleOther
    return @data.jumpTo('>')
  end

  # Return a [name, value] pair for the next attribute in the stream,
  # if one is found, or nil. Names and values are lower-cased (ASCII only).
  # EOF raised by the underlying data propagates to the caller.
  def getAttribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.currentByte == '<'
      @data.position -= 1
      return nil
    elsif @data.currentByte == '>'
      return nil
    end

    attrName = []
    attrValue = []
    spaceFound = false
    # Step 5 attribute name
    while true
      # "=" ends the name only once the name is non-empty; a leading "="
      # is part of the name. (An Array is always truthy in Ruby, so the
      # emptiness test has to be explicit.)
      if @data.currentByte == '=' and not attrName.empty?
        break
      elsif SPACE_CHARACTERS.include?(@data.currentByte)
        spaceFound = true
        break
      elsif ['/', '<', '>'].include?(@data.currentByte)
        return [attrName.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrName.push(@data.currentByte.downcase)
      else
        attrName.push(@data.currentByte)
      end
      # Step 6
      @data.position += 1
    end
    # Step 7
    if spaceFound
      @data.skip
      # Step 8
      unless @data.currentByte == '='
        @data.position -= 1
        return [attrName.join(''), '']
      end
    end
    # XXX need to advance position in both spaces and value case
    # Step 9
    @data.position += 1
    # Step 10
    @data.skip
    # Step 11
    if ["'", '"'].include?(@data.currentByte)
      # 11.1
      quoteChar = @data.currentByte
      while true
        @data.position += 1
        # 11.3
        if @data.currentByte == quoteChar
          @data.position += 1
          return [attrName.join(''), attrValue.join('')]
        # 11.4
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrValue.push(@data.currentByte.downcase)
        # 11.5
        else
          attrValue.push(@data.currentByte)
        end
      end
    elsif ['>', '<'].include?(@data.currentByte)
      return [attrName.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.currentByte)
      attrValue.push(@data.currentByte.downcase)
    else
      attrValue.push(@data.currentByte)
    end
    # Unquoted value: runs until whitespace, "<" or ">"
    while true
      @data.position += 1
      if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
        return [attrName.join(''), attrValue.join('')]
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrValue.push(@data.currentByte.downcase)
      else
        attrValue.push(@data.currentByte)
      end
    end
  end
end
|
||||||
|
|
||||||
|
# Extracts the charset parameter from the value of a meta element's
# content attribute.
class ContentAttrParser
  # data - object exposing position/position=, jumpTo, skip, currentByte,
  #        findNext and range indexing (EncodingParser passes an
  #        EncodingBytes)
  def initialize(data)
    @data = data
  end

  # Returns the charset value (quotes removed) or nil when the data has no
  # ";", no following "charset", or no "=" after it.
  def parse
    # Skip to the first ";"
    @data.position = 0
    @data.jumpTo(';')
    @data.position += 1
    @data.skip
    # Check if the attr name is charset
    # otherwise return
    @data.jumpTo('charset')
    @data.position += 1
    @data.skip
    # If there is no = sign there is nothing to report
    return nil unless @data.currentByte == '='
    @data.position += 1
    @data.skip

    valueStart = @data.position
    if ['"', "'"].include?(@data.currentByte)
      # Look for an encoding between matching quote marks
      quoteMark = @data.currentByte
      @data.position += 1
      valueStart = @data.position
      @data.jumpTo(quoteMark)
      return @data[valueStart ... @data.position]
    end

    # Unquoted value: runs up to the next space character
    begin
      @data.findNext(SPACE_CHARACTERS)
      return @data[valueStart ... @data.position]
    rescue EOF
      # Return the whole remaining value
      return @data[valueStart .. -1]
    end
  rescue EOF
    # Ran out of data before a complete charset parameter was seen
    return nil
  end
end
|
||||||
|
|
||||||
|
# Determine if a string is a supported encoding
|
||||||
|
# Determine if a string is a supported encoding.
#
# Returns true only when +encoding+ is a String whose lower-cased,
# whitespace-stripped form is listed in ENCODINGS; nil and non-String
# values give false.
def self.isValidEncoding(encoding)
  encoding.kind_of?(String) && ENCODINGS.include?(encoding.downcase.strip)
end
|
||||||
|
|
||||||
|
end
|
141
vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
vendored
Executable file
141
vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
vendored
Executable file
|
@ -0,0 +1,141 @@
|
||||||
|
# Warning: this module is experimental and subject to change and even removal
|
||||||
|
# at any time.
|
||||||
|
#
|
||||||
|
# For background/rationale, see:
|
||||||
|
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||||
|
# * http://tinyurl.com/ylfj8k (and follow-ups)
|
||||||
|
#
|
||||||
|
# References:
|
||||||
|
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||||
|
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||||
|
#
|
||||||
|
# @@TODO:
|
||||||
|
# * Selectively lowercase only XHTML, but not foreign markup
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
require 'html5lib/constants'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# liberal XML parser
|
||||||
|
class XMLParser < HTMLParser

  # Same options as HTMLParser; the :initial phase is replaced with an
  # XmlRootPhase.
  def initialize(options={})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes +token+ in place and returns it:
  #
  # * :StartTag / :EmptyTag - the attribute pairs become a Hash
  # * :EmptyTag             - processed as a start tag immediately, then
  #                           turned into the matching :EndTag
  # * :EndTag               - a parse error is reported if it has attributes
  # * :Comment              - "[CDATA[...]]" comments become :Characters
  def normalizeToken(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # We need to remove the duplicate attributes and convert attributes
      # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
      # (the pairs are reversed first so the earliest occurrence wins).
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        @phase.processStartTag(token[:name], token[:data])
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :EndTag
      # An empty attribute collection is truthy in Ruby, so check for
      # actual content rather than mere presence.
      attributes = token[:data]
      if attributes and not attributes.empty?
        parseError(_("End tag contains unexpected attributes."))
      end

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
|
||||||
|
|
||||||
|
# liberal XHTML parser
|
||||||
|
class XHTMLParser < XMLParser

  # Same options as XMLParser. The :initial phase is an InitialPhase and
  # the :rootElement phase an XhmlRootPhase.
  def initialize(options={})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalizeToken, then makes sure an end tag that
  # closes the current, still empty, non-void element gets an empty text
  # node first. Returns the token.
  def normalizeToken(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    closesEmptyElement =
      token[:type] == :EndTag &&
      !VOID_ELEMENTS.include?(token[:name]) &&
      token[:name] == @tree.openElements[-1].name &&
      !@tree.openElements[-1].hasContent

    if closesEmptyElement
      # Leave the tree alone when any open element declares an xmlns other
      # than the XHTML namespace
      insideForeignMarkup = @tree.openElements.any? { |element|
        element.attributes.keys.include?('xmlns') &&
          element.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
      }
      @tree.insertText('') unless insideForeignMarkup
    end

    return token
  end
end
|
||||||
|
|
||||||
|
# Root element phase that creates the html element with the XHTML xmlns
# attribute.
class XhmlRootPhase < RootElementPhase
  # Creates the html element, pushes it onto the open elements stack,
  # appends it to the document and switches the parser to :beforeHead.
  def insertHtmlElement
    element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
    @tree.openElements.push(element)
    @tree.document.appendChild(element)
    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
||||||
|
|
||||||
|
class XmlRootPhase < Phase
  # Prime the Xml parser

  # Every tag name maps to the catch-all handlers below (presumably read by
  # Phase's dispatch code, which is not visible here - TODO confirm).
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # First start tag of the document: put the document on the open elements
  # stack, append the new element to it, and hand over to a fresh
  # XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.openElements.push(@tree.document)
    element = @tree.createElement(name, attributes)
    @tree.openElements[-1].appendChild(element)
    @tree.openElements.push(element)
    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited handling, then pops one entry off the open elements
  # stack.
  def endTagOther(name)
    super
    @tree.openElements.pop
  end
end
|
||||||
|
|
||||||
|
class XmlElementPhase < Phase
  # Generic handling for all XML elements

  # Every tag name maps to the catch-all handlers below (presumably read by
  # Phase's dispatch code, which is not visible here - TODO confirm).
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Appends a new element to the current node and makes it the current node.
  def startTagOther(name, attributes)
    element = @tree.createElement(name, attributes)
    @tree.openElements[-1].appendChild(element)
    @tree.openElements.push(element)
  end

  # Closes the innermost open element called +name+ together with
  # everything opened after it. A parse error is reported for each open
  # element passed over while searching.
  def endTagOther(name)
    @tree.openElements.reverse.each do |node|
      if node.name == name
        # Pop up to and including the matching element
        loop { break if @tree.openElements.pop == node }
        break
      else
        @parser.parseError
      end
    end
  end

  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
||||||
|
|
||||||
|
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
vendored
Normal file
178
vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
vendored
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
require 'html5lib/tokenizer'
|
||||||
|
require 'cgi'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
|
# and of inline style attributes.
|
||||||
|
|
||||||
|
class HTMLSanitizer < HTMLTokenizer

  ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
    button caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
    legend li map menu ol optgroup option p pre q s samp select small span
    strike strong sub sup table tbody td textarea tfoot th thead tr tt u
    ul var]

  MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
    mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
    msubsup msup mtable mtd mtext mtr munder munderover none]

  SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
    circle defs desc ellipse font-face font-face-name font-face-src g
    glyph hkern image linearGradient line marker metadata missing-glyph
    mpath path polygon polyline radialGradient rect set stop svg switch
    text title tspan use]

  ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
    align alt axis border cellpadding cellspacing char charoff charset
    checked cite class clear cols colspan color compact coords datetime
    dir disabled enctype for frame headers height href hreflang hspace id
    ismap label lang longdesc maxlength media method multiple name nohref
    noshade nowrap prompt readonly rel rev rows rowspan rules scope
    selected shape size span src start style summary tabindex target title
    type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
    columnalign columnlines columnspacing columnspan depth display
    displaystyle equalcolumns equalrows fence fontstyle fontweight frame
    height linethickness lspace mathbackground mathcolor mathvariant
    mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
    rowspacing rowspan rspace scriptlevel selection separator stretchy
    width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
    arabic-form ascent attributeName attributeType baseProfile bbox begin
    by calcMode cap-height class color color-rendering content cx cy d dx
    dy descent display dur end fill fill-rule font-family font-size
    font-stretch font-style font-variant font-weight from fx fy g1 g2
    glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
    ideographic k keyPoints keySplines keyTimes lang marker-end
    marker-mid marker-start markerHeight markerUnits markerWidth
    mathematical max min name offset opacity orient origin
    overline-position overline-thickness panose-1 path pathLength points
    preserveAspectRatio r refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures restart rotate rx ry slope stemh
    stemv stop-color stop-opacity strikethrough-position
    strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
    stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
    stroke-width systemLanguage target text-anchor to transform type u1
    u2 underline-position underline-thickness unicode unicode-range
    units-per-em values version viewBox visibility width widths x
    x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
    xmlns:xlink y y1 y2 zoomAndPan]

  ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]

  ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
    border-bottom-color border-collapse border-color border-left-color
    border-right-color border-top-color clear color cursor direction
    display elevation float font font-family font-size font-style
    font-variant font-weight height letter-spacing line-height overflow
    pause pause-after pause-before pitch pitch-range richness speak
    speak-header speak-numeral speak-punctuation speech-rate stress
    text-align text-decoration text-indent unicode-bidi vertical-align
    voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
    brown center collapse dashed dotted fuchsia gray green !important
    italic left lime maroon medium none navy normal nowrap olive pointer
    purple red right solid silver teal top transparent underline white
    yellow]

  ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
    stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
    telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  # NOTE(review): the methods below refer to these constants by bare name,
  # which Ruby resolves lexically to HTMLSanitizer's own values even when
  # the method runs on a subclass instance - confirm how overriding is
  # meant to work.
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Yields the tokenizer's tokens, sanitized.
  #
  # Tags whose name is not in ALLOWED_ELEMENTS are turned into :Characters
  # tokens holding the tag's markup as text. For allowed tags:
  #
  # * attributes not in ALLOWED_ATTRIBUTES are dropped
  # * attributes in ATTR_VAL_IS_URI are dropped when their value starts
  #   with a URI scheme that is not in ALLOWED_PROTOCOLS
  # * style attributes are passed through sanitize_css
  #
  # All other token types are yielded unchanged.
  def each
    super do |token|
      case token[:type]
      when :StartTag, :EndTag, :EmptyTag
        if ALLOWED_ELEMENTS.include?(token[:name])
          if token.has_key? :data
            attrs = Hash[*token[:data].flatten]
            attrs.delete_if { |attr, v| !ALLOWED_ATTRIBUTES.include?(attr) }
            ATTR_VAL_IS_URI.each do |attr|
              raw = CGI.unescapeHTML(attrs[attr].to_s)
              # The pattern below is byte-oriented (\302 followed by
              # \200-\240 is the UTF-8 encoding of U+0080..U+00A0). On
              # Rubies with string encodings, match against the raw bytes
              # so it is valid whatever the input contains.
              raw = raw.dup.force_encoding('BINARY') if raw.respond_to?(:force_encoding)
              # Drop control characters and whitespace that could be used
              # to disguise the scheme
              val_unescaped = raw.gsub(/[\000-\040\177\s]+|\302[\200-\240]/n, '').downcase
              if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ and
                 !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                attrs.delete attr
              end
            end
            if attrs['style']
              attrs['style'] = sanitize_css(attrs['style'])
            end
            token[:data] = attrs.map { |k, v| [k, v] }
          end
          yield token
        else
          # Not allowed: hand the tag on as literal text
          if token[:type] == :EndTag
            token[:data] = "</#{token[:name]}>"
          elsif token[:data]
            attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v)}\"" }.join('')
            token[:data] = "<#{token[:name]}#{attrs}>"
          else
            token[:data] = "<#{token[:name]}>"
          end
          # "<br>" becomes "<br/>"
          token[:data].insert(-2, '/') if token[:type] == :EmptyTag
          token[:type] = :Characters
          token.delete(:name)
          yield token
        end
      else
        yield token
      end
    end
  end

  # Returns +style+ reduced to the declarations that are allowed, joined
  # with single spaces, or '' when the value as a whole looks suspicious.
  #
  # A declaration is kept when its property is in ALLOWED_CSS_PROPERTIES
  # or ALLOWED_SVG_PROPERTIES, or when it is a background/border/margin/
  # padding property whose every value is an allowed keyword, a colour or
  # a short length.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet
    # Anchored with \A and \z so the WHOLE value is validated: ^ and $
    # match at every line break, which would let everything after the
    # first line of a multi-line value through unchecked.
    return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\z/

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop.downcase!
      if ALLOWED_CSS_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        suspicious = val.split.any? { |keyword|
          !ALLOWED_CSS_KEYWORDS.include?(keyword) and
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        }
        clean << "#{prop}: #{val};" unless suspicious
      elsif ALLOWED_SVG_PROPERTIES.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    style = clean.join(' ')
  end
end
|
||||||
|
end
|
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,854 @@
|
||||||
|
require 'html5lib/constants'
|
||||||
|
require 'html5lib/inputstream'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# This class takes care of tokenizing HTML.
|
||||||
|
#
|
||||||
|
# * @currentToken
|
||||||
|
# Holds the token that is currently being processed.
|
||||||
|
#
|
||||||
|
# * @state
|
||||||
|
# Holds a reference to the method to be invoked... XXX
|
||||||
|
#
|
||||||
|
# * @states
|
||||||
|
# Holds a mapping between states and methods that implement the state.
|
||||||
|
#
|
||||||
|
# * @stream
|
||||||
|
# Points to HTMLInputStream object.
|
||||||
|
|
||||||
|
class HTMLTokenizer
|
||||||
|
attr_accessor :contentModelFlag, :currentToken
|
||||||
|
attr_reader :stream
|
||||||
|
|
||||||
|
# XXX need to fix documentation
|
||||||
|
|
||||||
|
# Sets up a tokenizer reading from +stream+.
#
# stream  - the input, handed to HTMLInputStream.new
# options - handed to HTMLInputStream.new unchanged
def initialize(stream, options={})
  @stream = HTMLInputStream.new(stream, options)

  # State name => name of the method that implements the state
  @states = {
    :data => :dataState,
    :entityData => :entityDataState,
    :tagOpen => :tagOpenState,
    :closeTagOpen => :closeTagOpenState,
    :tagName => :tagNameState,
    :beforeAttributeName => :beforeAttributeNameState,
    :attributeName => :attributeNameState,
    :afterAttributeName => :afterAttributeNameState,
    :beforeAttributeValue => :beforeAttributeValueState,
    :attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
    :attributeValueSingleQuoted => :attributeValueSingleQuotedState,
    :attributeValueUnQuoted => :attributeValueUnQuotedState,
    :bogusComment => :bogusCommentState,
    :markupDeclarationOpen => :markupDeclarationOpenState,
    :comment => :commentState,
    :commentDash => :commentDashState,
    :commentEnd => :commentEndState,
    :doctype => :doctypeState,
    :beforeDoctypeName => :beforeDoctypeNameState,
    :doctypeName => :doctypeNameState,
    :afterDoctypeName => :afterDoctypeNameState,
    :bogusDoctype => :bogusDoctypeState
  }

  # Setup the initial tokenizer state
  @contentModelFlag = :PCDATA
  @state = @states[:data]

  # The current token being created
  @currentToken = nil

  # Tokens to be processed.
  @tokenQueue = []
end
|
||||||
|
|
||||||
|
# This is where the magic happens.
|
||||||
|
#
|
||||||
|
# We do our usual processing through the states and when we have a token
|
||||||
|
# to return we yield the token which pauses processing until the next token
|
||||||
|
# is requested.
|
||||||
|
# Tokenizes the stream from the start, yielding each token in turn.
#
# The stream is reset and the token queue emptied first. The method named
# by @state is then invoked repeatedly; after every call that returns a
# true value the queued tokens are yielded in order. The loop ends at the
# first call that returns false or nil; tokens queued by that final call
# are not yielded.
def each
  @stream.reset
  @tokenQueue = []
  # Start processing. When EOF is reached @state will return false
  # instead of true and the loop will terminate.
  while send @state
    yield @tokenQueue.shift until @tokenQueue.empty?
  end
end
|
||||||
|
|
||||||
|
# Below are various helper functions used by the tokenizer states.
|
||||||
|
|
||||||
|
# If the next character is a '>', convert the currentToken into
|
||||||
|
# an EmptyTag
|
||||||
|
|
||||||
|
# Handles a "/" inside a tag: if the next character is a ">" and the
# current token is a :StartTag, the token becomes an :EmptyTag; otherwise
# a :ParseError token is queued. The character looked at is pushed onto
# the stream's queue either way.
def processSolidusInTag
  # We need to consume another character to make sure it's a ">"
  data = @stream.char

  if @currentToken[:type] == :StartTag and data == ">"
    @currentToken[:type] = :EmptyTag
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Solidus (/) incorrectly placed in tag.")})
  end

  # The character we just consumed needs to be put back on the queue so it
  # doesn't get lost...
  @stream.queue.push(data)
end
|
||||||
|
|
||||||
|
# This function returns either U+FFFD or the character based on the
|
||||||
|
# decimal or hexadecimal representation. It also discards ";" if present.
|
||||||
|
# If not present, a :ParseError token is pushed onto @tokenQueue.
|
||||||
|
|
||||||
|
# Consumes the digits of a numeric entity from the stream and returns the
# character they stand for as a UTF-8 string.
#
# isHex - true to read hexadecimal digits, false for decimal
#
# Returns U+FFFD when there are no digits, the number is 0, or the number
# lies beyond U+10FFFF (a :ParseError token is queued in the last case).
# Numbers 128..159 are mapped through ENTITIES_WINDOWS1252. A :ParseError
# token is also queued when the entity is not terminated by ";"; the
# terminating character is then pushed onto the stream's queue.
def consumeNumberEntity(isHex)
  # XXX More need to be done here. For instance, #13 should prolly be
  # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
  # such. Thoughts on this appreciated.
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  char = [0xFFFD].pack('U')
  charStack = []

  # Consume all the characters that are in range while making sure we
  # don't hit an EOF. EOF is tested first so the :EOF symbol is never
  # handed to include?.
  c = @stream.char
  while c != :EOF and allowed.include?(c)
    charStack.push(c)
    c = @stream.char
  end

  # Convert the set of characters consumed to an int.
  charAsInt = charStack.join('').to_i(radix)

  # If the integer is between 127 and 160 (so 128 and bigger and 159 and
  # smaller) we need to do the "windows trick".
  if (128...160).include? charAsInt
    #XXX - removed parse error from windows 1252 entity for now
    #we may want to reenable this later
    #@tokenQueue.push({:type => :ParseError, :data =>
    #  _("Entity used with illegal number (windows-1252 reference).")})

    charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
  end

  # 0 is not a good number.
  charAsInt = 65533 if charAsInt == 0

  # U+10FFFF is the last Unicode code point
  if charAsInt <= 0x10FFFF
    char = [charAsInt].pack('U')
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity couldn't be converted to character.")})
  end

  # Discard the ; if present. Otherwise, put it back on the queue and
  # invoke parseError on parser.
  if c != ";"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity didn't end with ';'.")})
    @stream.queue.push(c)
  end

  return char
end
|
||||||
|
|
||||||
|
# Tries to consume a character reference; the "&" has already been read.
#
# Returns the replacement text for a numeric ("#...") or named entity, or nil
# when no entity could be recognised. In the nil case the characters that
# were read are pushed back onto @stream.queue and a ParseError is queued.
def consumeEntity
  char = nil
  charStack = [@stream.char]
  if charStack[0] == "#"
    # We might have a number entity here.
    charStack += [@stream.char, @stream.char]
    if charStack.include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      charStack = charStack[0...charStack.index(:EOF)]
      @stream.queue += charStack
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Numeric entity expected. Got end of file instead.")})
    else
      if charStack[1].downcase == "x" \
        and HEX_DIGITS.include? charStack[2]
        # Hexadecimal entity detected.
        @stream.queue.push(charStack[2])
        char = consumeNumberEntity(true)
      elsif DIGITS.include? charStack[1]
        # Decimal entity detected.
        @stream.queue += charStack[1..-1]
        char = consumeNumberEntity(false)
      else
        # No number entity detected.
        @stream.queue += charStack
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Numeric entity expected but none found.")})
      end
    end
  # Break out if we reach the end of the file
  elsif charStack[0] == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Entity expected. Got end of file instead.")})
  else
    # At this point in the process might have named entity. Entity names
    # are the keys of ENTITIES.
    #
    # Consume characters and compare them to a prefix of the entity names
    # in the list until the prefix no longer matches.
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
    entityName = nil

    while charStack[-1] != :EOF
      name = charStack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        charStack.push(@stream.char)
      else
        break
      end

      # Remember the longest complete entity name seen so far.
      if ENTITIES.include? name
        entityName = name
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]

      # Everything read past the matched name is look-ahead. Only a ";"
      # that directly follows the name belongs to the entity; all other
      # look-ahead goes back on the queue. Testing the last character read
      # instead would drop the characters between the name and a later ";"
      # (e.g. the "i" of "&noti;").
      lookahead = charStack[entityName.length..-1]
      if lookahead[0] == ";"
        lookahead.shift
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Named entity didn't end with ';'.")})
      end
      @stream.queue += lookahead
    else
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Named entity expected. Got none.")})
      @stream.queue += charStack
    end
  end
  return char
end
|
||||||
|
|
||||||
|
# This method replaces the need for "entityInAttributeValueState".
|
||||||
|
# Handles an "&" met inside an attribute value: whatever consumeEntity
# resolves is appended to the value of the attribute currently being built,
# or a literal "&" when no entity was recognised.
def processEntityInAttribute
  replacement = consumeEntity
  replacement = "&" unless replacement
  @currentToken[:data][-1][1] += replacement
end
|
||||||
|
|
||||||
|
# This method is a generic handler for emitting the tags. It also sets
|
||||||
|
# the state to "data" because that's what's needed after a token has been
|
||||||
|
# emitted.
|
||||||
|
# Generic tag emitter: queues @currentToken so it gets yielded and switches
# the tokenizer back to the data state.
def emitCurrentToken
  finished = @currentToken
  @tokenQueue.push(finished)
  @state = @states[:data]
end
|
||||||
|
|
||||||
|
|
||||||
|
# Below are the various tokenizer states worked out.
|
||||||
|
|
||||||
|
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||||
|
# documents to figure out what the order of the various if and elsif
|
||||||
|
# statements should be.
|
||||||
|
|
||||||
|
# Data state: reads one character and either switches state ("&" in
# PCDATA/RCDATA, "<" outside PLAINTEXT) or queues a run of text.
#
# Returns false at :EOF (tokenization ends), true otherwise.
def dataState
  ch = @stream.char
  case
  when (ch == "&" && (@contentModelFlag == :PCDATA ||
                      @contentModelFlag == :RCDATA))
    @state = @states[:entityData]
  when (ch == "<" && @contentModelFlag != :PLAINTEXT)
    @state = @states[:tagOpen]
  when ch == :EOF
    # Tokenization ends.
    return false
  when SPACE_CHARACTERS.include?(ch)
    # Directly after emitting a token you switch back to the "data
    # state". At that point SPACE_CHARACTERS are important so they are
    # emitted separately.
    # XXX need to check if we don't need a special "spaces" flag on
    # characters.
    spaces = ch + @stream.charsUntil(SPACE_CHARACTERS, true)
    @tokenQueue.push({:type => :SpaceCharacters, :data => spaces})
  else
    text = ch + @stream.charsUntil(["&", "<"])
    @tokenQueue.push({:type => :Characters, :data => text})
  end
  true
end
|
||||||
|
|
||||||
|
# Entity data state: queues the text of the entity that follows an "&" in
# content, or a literal "&" when consumeEntity finds none, then returns to
# the data state. Always returns true.
def entityDataState
  text = consumeEntity || "&"
  @tokenQueue.push({:type => :Characters, :data => text})
  @state = @states[:data]
  true
end
|
||||||
|
|
||||||
|
# Tag open state: decides what the character following "<" starts.
# Always returns true.
def tagOpenState
  ch = @stream.char

  if @contentModelFlag != :PCDATA
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag.
    if ch == "/"
      @state = @states[:closeTagOpen]
    else
      @tokenQueue.push({:type => :Characters, :data => "<"})
      @stream.queue.insert(0, ch)
      @state = @states[:data]
    end
    return true
  end

  case
  when ch == "!"
    @state = @states[:markupDeclarationOpen]
  when ch == "/"
    @state = @states[:closeTagOpen]
  when (ch != :EOF && ASCII_LETTERS.include?(ch))
    # First letter of a start tag name.
    @currentToken = {:type => :StartTag, :name => ch, :data => []}
    @state = @states[:tagName]
  when ch == ">"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected tag name. Got '>' instead.")})
    @tokenQueue.push({:type => :Characters, :data => "<>"})
    @state = @states[:data]
  when ch == "?"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected tag name. Got '?' instead (HTML doesn't " +
        "support processing instructions).")})
    @stream.queue.push(ch)
    @state = @states[:bogusComment]
  else
    # XXX
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected tag name. Got something else instead")})
    @tokenQueue.push({:type => :Characters, :data => "<"})
    @stream.queue.push(ch)
    @state = @states[:data]
  end
  true
end
|
||||||
|
|
||||||
|
# Close tag open state: entered after "</". In RCDATA/CDATA the end tag only
# counts when it names the current start tag; otherwise "</" is queued as
# text. In PCDATA the next character decides between an end tag, an ignored
# "</>", or a bogus comment. Always returns true.
def closeTagOpenState
  if @contentModelFlag == :RCDATA || @contentModelFlag == :CDATA
    lookahead = nil
    if @currentToken
      lookahead = []

      # So far we know that "</" has been consumed. We now need to know
      # whether the next few characters match the name of last emitted
      # start tag which also happens to be the currentToken. We also need
      # to have the character directly after the characters that could
      # match the start tag name.
      (@currentToken[:name].length + 1).times do
        lookahead.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if lookahead[-1] == :EOF
      end

      # Since this is just for checking. We put the characters back on
      # the stack.
      @stream.queue += lookahead
    end

    closesCurrentTag = (@currentToken &&
      @currentToken[:name].downcase == lookahead[0...-1].join('').downcase &&
      (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(lookahead[-1]))

    unless closesCurrentTag
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected closing tag after seeing '</'. None found.")})
      @tokenQueue.push({:type => :Characters, :data => "</"})
      @state = @states[:data]

      # Need to return here since we don't want the rest of the
      # method to be walked through.
      return true
    end

    # Because the characters are correct we can safely switch to
    # PCDATA mode now. This also means we don't have to do it when
    # emitting the end tag token.
    @contentModelFlag = :PCDATA
  end

  if @contentModelFlag == :PCDATA
    ch = @stream.char
    case
    when ch == :EOF
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected closing tag. Unexpected end of file.")})
      @tokenQueue.push({:type => :Characters, :data => "</"})
      @state = @states[:data]
    when ASCII_LETTERS.include?(ch)
      @currentToken = {:type => :EndTag, :name => ch, :data => []}
      @state = @states[:tagName]
    when ch == ">"
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
      @state = @states[:data]
    else
      # XXX data can be _'_...
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected closing tag. Unexpected character '" + ch + "' found.")})
      @stream.queue.push(ch)
      @state = @states[:bogusComment]
    end
  end
  true
end
|
||||||
|
|
||||||
|
# Tag name state: extends the name of @currentToken until whitespace, ">",
# "<", "/" or :EOF ends it. Always returns true.
def tagNameState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:beforeAttributeName]
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in the tag name.")})
    emitCurrentToken
  when ASCII_LETTERS.include?(ch)
    # Take the whole run of letters in one go.
    @currentToken[:name] += ch + @stream.charsUntil(ASCII_LETTERS, true)
  when ch == ">"
    emitCurrentToken
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character when getting the tag name.")})
    emitCurrentToken
  when ch == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  else
    @currentToken[:name] += ch
  end
  true
end
|
||||||
|
|
||||||
|
# Before attribute name state: skips whitespace, ends the tag on ">", "<" or
# :EOF, and otherwise starts a new [name, value] pair on @currentToken.
# Always returns true.
def beforeAttributeNameState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    # Swallow the rest of the whitespace run.
    @stream.charsUntil(SPACE_CHARACTERS, true)
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected attribute name instead.")})
    emitCurrentToken
  when ASCII_LETTERS.include?(ch)
    @currentToken[:data].push([ch, ""])
    @state = @states[:attributeName]
  when ch == ">"
    emitCurrentToken
  when ch == "/"
    processSolidusInTag
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected attribute name instead.")})
    emitCurrentToken
  else
    @currentToken[:data].push([ch, ""])
    @state = @states[:attributeName]
  end
  true
end
|
||||||
|
|
||||||
|
# Attribute name state: extends the name of the attribute being built. When
# the name is complete it queues a ParseError for every earlier attribute of
# the tag with the same name, and emits the tag if ">" ended the name.
# Always returns true.
def attributeNameState
  ch = @stream.char
  nameComplete = true
  case
  when ch == "="
    @state = @states[:beforeAttributeValue]
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute name.")})
    emitCurrentToken
    nameComplete = false
  when ASCII_LETTERS.include?(ch)
    @currentToken[:data][-1][0] += ch + @stream.charsUntil(ASCII_LETTERS, true)
    nameComplete = false
  when ch == ">"
    # XXX If we emit here the attributes are converted to a dict
    # without being checked and when the code below runs we error
    # because data is a dict not a list. The token is emitted after the
    # duplicate check instead.
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:afterAttributeName]
  when ch == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character in attribute name.")})
    emitCurrentToken
    nameComplete = false
  else
    @currentToken[:data][-1][0] += ch
    nameComplete = false
  end

  if nameComplete
    # Attributes are not dropped at this stage. That happens when the
    # start tag token is emitted so values can still be safely appended
    # to attributes, but we do want to report the parse error in time.
    @currentToken[:data][0...-1].each do |name, value|
      next unless @currentToken[:data][-1][0] == name
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Dropped duplicate attribute on tag.")})
    end
    # XXX Fix for above XXX
    emitCurrentToken if ch == ">"
  end
  true
end
|
||||||
|
|
||||||
|
# After attribute name state: whitespace followed an attribute name. Waits
# for "=", the end of the tag, or the start of another attribute.
# Always returns true.
def afterAttributeNameState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @stream.charsUntil(SPACE_CHARACTERS, true)
  when ch == "="
    @state = @states[:beforeAttributeValue]
  when ch == ">"
    emitCurrentToken
  when ASCII_LETTERS.include?(ch)
    @currentToken[:data].push([ch, ""])
    @state = @states[:attributeName]
  when ch == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected = or end of tag.")})
    emitCurrentToken
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected = or end of tag.")})
    emitCurrentToken
  else
    @currentToken[:data].push([ch, ""])
    @state = @states[:attributeName]
  end
  true
end
|
||||||
|
|
||||||
|
# Before attribute value state: "=" has been read. Picks the quoted or
# unquoted value state, or ends the tag early. Always returns true.
def beforeAttributeValueState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @stream.charsUntil(SPACE_CHARACTERS, true)
  when ch == "\""
    @state = @states[:attributeValueDoubleQuoted]
  when ch == "&"
    # Let the unquoted value state deal with the entity.
    @state = @states[:attributeValueUnQuoted]
    @stream.queue.push(ch)
  when ch == "'"
    @state = @states[:attributeValueSingleQuoted]
  when ch == ">"
    emitCurrentToken
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected attribute value.")})
    emitCurrentToken
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected attribute value.")})
    emitCurrentToken
  else
    @currentToken[:data][-1][1] += ch
    @state = @states[:attributeValueUnQuoted]
  end
  true
end
|
||||||
|
|
||||||
|
# Attribute value (double-quoted) state: collects the value up to the
# closing '"', resolving entities on the way. Always returns true.
def attributeValueDoubleQuotedState
  ch = @stream.char
  case ch
  when "\""
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (\").")})
    emitCurrentToken
  else
    # Take everything up to the next quote or entity in one go.
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(["\"", "&"])
  end
  true
end
|
||||||
|
|
||||||
|
# Attribute value (single-quoted) state: collects the value up to the
# closing "'", resolving entities on the way. Always returns true.
def attributeValueSingleQuotedState
  ch = @stream.char
  case ch
  when "'"
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (').")})
    emitCurrentToken
  else
    # Take everything up to the next quote or entity in one go.
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(["'", "&"])
  end
  true
end
|
||||||
|
|
||||||
|
# Attribute value (unquoted) state: collects the value until whitespace,
# ">", "<" or :EOF, resolving entities on the way. Always returns true.
def attributeValueUnQuotedState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:beforeAttributeName]
  when ch == "&"
    processEntityInAttribute
  when ch == ">"
    emitCurrentToken
  when ch == "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character in attribute value.")})
    emitCurrentToken
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value.")})
    emitCurrentToken
  else
    terminators = ["&", ">","<"] + SPACE_CHARACTERS
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(terminators)
  end
  true
end
|
||||||
|
|
||||||
|
# Bogus comment state: queues everything up to the next ">" (or :EOF, which
# charsUntil is said to check for automatically) as a Comment token, then
# returns to the data state. Always returns true.
def bogusCommentState
  text = @stream.charsUntil((">"))
  @tokenQueue.push({:type => :Comment, :data => text})

  # Eat the character directly after the bogus comment which is either a
  # ">" or an :EOF.
  @stream.char
  @state = @states[:data]
  true
end
|
||||||
|
|
||||||
|
# Markup declaration open state: "<!" has been read. "--" starts a comment,
# a case-insensitive "DOCTYPE" starts a doctype, and anything else is put
# back on the queue and handled as a bogus comment. Always returns true.
def markupDeclarationOpenState
  lookahead = [@stream.char, @stream.char]
  if lookahead == ["-", "-"]
    @currentToken = {:type => :Comment, :data => ""}
    @state = @states[:comment]
    return true
  end

  # Read on until seven characters, the length of "DOCTYPE", are available.
  5.times { lookahead.push(@stream.char) }

  # Put in explicit :EOF check
  if !lookahead.include?(:EOF) && lookahead.join("").upcase == "DOCTYPE"
    @currentToken = {:type => :Doctype, :name => "", :data => true}
    @state = @states[:doctype]
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected '--' or 'DOCTYPE'. Not found.")})
    @stream.queue += lookahead
    @state = @states[:bogusComment]
  end
  true
end
|
||||||
|
|
||||||
|
# Comment state: gathers comment text up to the next "-". At :EOF the
# unfinished comment is queued after a ParseError. Always returns true.
def commentState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentDash]
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:data] += ch + @stream.charsUntil("-")
  end
  true
end
|
||||||
|
|
||||||
|
# Comment dash state: one "-" has been seen inside a comment. A second "-"
# moves on to the comment end state; other text is added to the comment
# together with the "-" already seen. Always returns true.
def commentDashState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentEnd]
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment (-)")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:data] += "-" + ch + @stream.charsUntil("-")
    # Consume the next character which is either a "-" or an :EOF as
    # well so if there's a "-" directly after the "-" we go nicely to
    # the "comment end state" without emitting a ParseError there.
    @stream.char
  end
  true
end
|
||||||
|
|
||||||
|
# Comment end state: "--" has been seen inside a comment. ">" finishes the
# comment; anything else is a parse error and the comment continues (or is
# queued as it stands at :EOF). Always returns true.
def commentEndState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when "-"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected '-' after '--' found in comment.")})
    @currentToken[:data] += ch
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment (--).")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # XXX
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected character in comment found.")})
    @currentToken[:data] += "--" + ch
    @state = @states[:comment]
  end
  true
end
|
||||||
|
|
||||||
|
# Doctype state: expects whitespace after "DOCTYPE". Any other character is
# reported and pushed back; either way the before-doctype-name state
# follows. Always returns true.
def doctypeState
  ch = @stream.char
  unless SPACE_CHARACTERS.include?(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("No space after literal string 'DOCTYPE'.")})
    @stream.queue.push(ch)
  end
  @state = @states[:beforeDoctypeName]
  true
end
|
||||||
|
|
||||||
|
# Before doctype name state: skips whitespace and starts the doctype name
# with the first other character (ASCII lowercase letters are upcased). ">"
# and :EOF are parse errors that emit the doctype token as it stands.
# Always returns true.
def beforeDoctypeNameState
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Whitespace before the name is skipped.
  elsif data == :EOF
    # Tested before the ASCII_LOWERCASE lookup: if that constant is a
    # String, String#include? raises TypeError when given the :EOF symbol.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif ASCII_LOWERCASE.include? data
    @currentToken[:name] = data.upcase
    @state = @states[:doctypeName]
  elsif data == ">"
    # Character needs to be consumed per the specification so don't
    # invoke emitCurrentTokenWithParseError with :data as argument.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected > character. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @currentToken[:name] = data
    @state = @states[:doctypeName]
  end
  return true
end
|
||||||
|
|
||||||
|
# Doctype name state: extends the doctype name (upcasing ASCII lowercase
# letters) until whitespace, ">" or :EOF. Once the name reads "HTML" the
# token's :data flag is set to false. Always returns true.
def doctypeNameState
  ch = @stream.char
  recheckName = false
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:afterDoctypeName]
    recheckName = true
  when ch == ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # We can't just uppercase everything that arrives here. For
    # instance, non-ASCII characters.
    ch = ch.upcase if ASCII_LOWERCASE.include?(ch)
    @currentToken[:name] += ch
    recheckName = true
  end

  # After some iterations through this state it should eventually say
  # "HTML". Otherwise there's an error.
  @currentToken[:data] = false if recheckName && @currentToken[:name] == "HTML"
  true
end
|
||||||
|
|
||||||
|
# After doctype name state: only whitespace and ">" are expected once the
# name is complete. Anything else sets the token's :data flag to true; at
# :EOF the token is queued, otherwise the bogus doctype state takes over.
# Always returns true.
def afterDoctypeNameState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    # Whitespace is skipped.
  when ch == ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when ch == :EOF
    @currentToken[:data] = true
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected space or '>'. Got '" + ch + "'")})
    @currentToken[:data] = true
    @state = @states[:bogusDoctype]
  end
  true
end
|
||||||
|
|
||||||
|
# Bogus doctype state: discards characters until ">" or :EOF, then queues
# the doctype token and returns to the data state. Always returns true.
def bogusDoctypeState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when :EOF
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in bogus doctype.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  end
  true
end
|
||||||
|
|
||||||
|
# Hook wrapped around every message string in this file; it currently
# returns its argument unchanged.
def _(string)
  string
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
module HTML5lib
  module TreeBuilders

    # Loads the tree builder implementation called +name+ on demand and
    # returns its TreeBuilder class.
    #
    # name:: 'simpletree', 'rexml' or 'hpricot'; compared case-insensitively
    #        after to_s, so symbols work too.
    #
    # Raises RuntimeError for any other name.
    def self.getTreeBuilder(name)
      requested = name.to_s.downcase

      if requested == 'simpletree'
        require 'html5lib/treebuilders/simpletree'
        return SimpleTree::TreeBuilder
      end

      if requested == 'rexml'
        require 'html5lib/treebuilders/rexml'
        return REXMLTree::TreeBuilder
      end

      if requested == 'hpricot'
        require 'html5lib/treebuilders/hpricot'
        return Hpricot::TreeBuilder
      end

      raise "Unknown TreeBuilder #{name}"
    end

  end
end
|
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
|
@ -0,0 +1,330 @@
|
||||||
|
require 'html5lib/constants'
|
||||||
|
|
||||||
|
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
|
||||||
|
# The scope markers are inserted when entering buttons, object elements,
|
||||||
|
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||||
|
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||||
|
Marker = nil
|
||||||
|
|
||||||
|
module TreeBuilders
|
||||||
|
module Base
|
||||||
|
|
||||||
|
# Abstract tree node. Only the bookkeeping attributes and reparentChildren
# are implemented here; every other method raises NotImplementedError.
class Node
  # The parent of the current node (or nil for the document node)
  attr_accessor :parent

  # a list of child nodes of the current node. This must
  # include all elements but not necessarily other node types
  attr_accessor :childNodes

  # A list of miscellaneous flags that can be set on the node
  attr_accessor :_flags

  # +name+ is accepted but not stored by this base class.
  def initialize(name)
    @parent = nil
    @childNodes, @_flags = [], []
  end

  # Insert node as a child of the current node
  def appendChild(node)
    raise NotImplementedError
  end

  # Insert data as text in the current node, positioned before the
  # start of node insertBefore or to the end of the node's text.
  def insertText(data, insertBefore = nil)
    raise NotImplementedError
  end

  # Insert node as a child of the current node, before refNode in the
  # list of child nodes. Raises ValueError if refNode is not a child of
  # the current node
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Remove node from the children of the current node
  def removeChild(node)
    raise NotImplementedError
  end

  # Move all the children of the current node to newParent.
  # This is needed so that trees that don't store text as nodes move the
  # text in the correct way
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    @childNodes.each do |child|
      newParent.appendChild(child)
    end
    @childNodes = []
  end

  # Return a shallow copy of the current node i.e. a node with the same
  # name and attributes but with no parent or child nodes
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise
  def hasContent
    raise NotImplementedError
  end
end
|
||||||
|
|
||||||
|
# Base treebuilder implementation
|
||||||
|
class TreeBuilder
|
||||||
|
|
||||||
|
attr_accessor :openElements
|
||||||
|
|
||||||
|
attr_accessor :activeFormattingElements
|
||||||
|
|
||||||
|
attr_accessor :document
|
||||||
|
|
||||||
|
attr_accessor :headPointer
|
||||||
|
|
||||||
|
attr_accessor :formPointer
|
||||||
|
|
||||||
|
# Class to use for document root
|
||||||
|
documentClass = nil
|
||||||
|
|
||||||
|
# Class to use for HTML elements
|
||||||
|
elementClass = nil
|
||||||
|
|
||||||
|
# Class to use for comments
|
||||||
|
commentClass = nil
|
||||||
|
|
||||||
|
# Class to use for doctypes
|
||||||
|
doctypeClass = nil
|
||||||
|
|
||||||
|
# Fragment class
|
||||||
|
fragmentClass = nil
|
||||||
|
|
||||||
|
# A new builder starts out in the state produced by #reset.
def initialize
  reset()
end
|
||||||
|
|
||||||
|
# Returns the builder to its initial state: empty element stacks, no head
# or form pointer, insertFromTable switched off, and a new document created
# from @documentClass.
def reset
  @openElements, @activeFormattingElements = [], []

  #XXX - rename these to headElement, formElement
  @headPointer = @formPointer = nil

  # NOTE(review): relies on an insertFromTable= writer that is not among the
  # attr_accessors visible above this method - confirm where it is defined.
  self.insertFromTable = false

  @document = @documentClass.new
end
|
||||||
|
|
||||||
|
# Tells whether an element named +target+ is in scope on the stack of open
# elements.
#
# target::       element name to look for.
# tableVariant:: when true, SCOPING_ELEMENTS do not end the search; only
#                'table' and 'html' do.
#
# Walks @openElements from the most recently opened element backwards and
# returns true on the first element named +target+, false on the first
# boundary element.
def elementInScope(target, tableVariant = false)
  # Exit early when possible.
  return true if @openElements[-1].name == target

  # AT How about while true and simply set node to [-1] and set it to
  # [-2] at the end...
  @openElements.reverse_each do |node|
    return true if node.name == target
    return false if node.name == 'table'
    return false if !tableVariant && SCOPING_ELEMENTS.include?(node.name)
    return false if node.name == 'html'
  end
  assert false # We should never reach this point
end
|
||||||
|
|
||||||
|
# Re-open formatting elements that are still listed in
# @activeFormattingElements but are no longer on the stack of open
# elements.
#
# The list is scanned backwards from its end until an entry is found that
# is either a Marker or still open (or the front of the list is passed).
# Every entry after that point is then cloned, inserted through
# insertElement, and replaced in the list by the newly inserted element.
#
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
def reconstructActiveFormattingElements
  # Step 1: stop the algorithm when there's nothing to do.
  # An empty Array is truthy in Ruby, so emptiness is tested explicitly.
  return if @activeFormattingElements.empty?

  # Step 2 and step 3: we start with the last element. So i is -1.
  i = -1
  entry = @activeFormattingElements[i]
  return if entry == Marker || @openElements.include?(entry)

  # Step 6
  until entry == Marker || @openElements.include?(entry)
    # Step 5: let entry be one earlier in the list.
    i -= 1

    # Step 4: Array#[] returns nil instead of raising when the index runs
    # off the front of the list, so the bound is checked by hand. Leaving
    # i one position before the first entry means the i += 1 of step 7
    # lands on the first entry, which is the jump to step 8.
    break if -i > @activeFormattingElements.length

    entry = @activeFormattingElements[i]
  end

  while true
    # Step 7
    i += 1

    # Step 8
    clone = @activeFormattingElements[i].cloneNode

    # Step 9
    element = insertElement(clone.name, clone.attributes)

    # Step 10
    @activeFormattingElements[i] = element

    # Step 11
    break if element == @activeFormattingElements[-1]
  end
end
|
||||||
|
|
||||||
|
def clearActiveFormattingElements
  # Pop entries off the end of the list until it is empty or the entry
  # just popped was a Marker (the Marker itself is removed as well).
  loop do
    break if @activeFormattingElements.empty?
    break if @activeFormattingElements.pop == Marker
  end
end
|
||||||
|
|
||||||
|
# Check if an element exists between the end of the active
|
||||||
|
# formatting elements and the last marker. If it does, return it, else
|
||||||
|
# return false
|
||||||
|
def elementInActiveFormattingElements(name)
  @activeFormattingElements.reverse.each do |candidate|
    # A Marker ends the search; it is tested before anything else because
    # a Marker has no name to compare against.
    return false if candidate == Marker
    return candidate if candidate.name == name
  end
  false
end
|
||||||
|
|
||||||
|
def insertDoctype(name)
|
||||||
|
@document.appendChild(@doctypeClass.new(name))
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertComment(data, parent = nil)
|
||||||
|
parent = @openElements[-1] if parent.nil?
|
||||||
|
parent.appendChild(@commentClass.new(data))
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create an element but don't insert it anywhere
|
||||||
|
def createElement(name, attributes)
|
||||||
|
element = @elementClass.new(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
return element
|
||||||
|
end
|
||||||
|
|
||||||
|
# Switch the function used to insert an element from the
|
||||||
|
# normal one to the misnested table one and back again
|
||||||
|
def insertFromTable=(value)
|
||||||
|
@insertFromTable = value
|
||||||
|
@insertElement = value ? :insertElementTable : :insertElementNormal
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertElement(name, attributes)
|
||||||
|
send(@insertElement, name, attributes)
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertElementNormal(name, attributes)
|
||||||
|
element = @elementClass.new(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
@openElements[-1].appendChild(element)
|
||||||
|
@openElements.push(element)
|
||||||
|
return element
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create an element and insert it into the tree
|
||||||
|
def insertElementTable(name, attributes)
  # Outside the table-related contexts this is an ordinary insertion.
  # Delegate before building anything: insertElementNormal creates its own
  # element, so constructing one here first would only be thrown away.
  unless TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
    return insertElementNormal(name, attributes)
  end

  element = @elementClass.new(name)
  element.attributes = attributes

  # We should be in the InTable mode. This means we want to do
  # special magic element rearranging: the element goes to the position
  # reported by getTableMisnestedNodePosition instead of the current node.
  parent, sibling = getTableMisnestedNodePosition
  if sibling.nil?
    parent.appendChild(element)
  else
    parent.insertBefore(element, sibling)
  end
  @openElements.push(element)
  element
end
|
||||||
|
|
||||||
|
def insertText(data, parent = nil)
  parent = @openElements[-1] if parent.nil?

  # Text is redirected only while table insertion is switched on and the
  # current node is one of the table-related elements.
  redirect = @insertFromTable && TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
  return parent.insertText(data) unless redirect

  # We should be in the InTable mode. This means we want to do
  # special magic element rearranging
  target, sibling = getTableMisnestedNodePosition
  target.insertText(data, sibling)
end
|
||||||
|
|
||||||
|
# Get the foster parent element, and sibling to insert before
|
||||||
|
# (or nil) when inserting a misnested table node
|
||||||
|
def getTableMisnestedNodePosition
  # The foster parent element is the one which comes before the most
  # recently opened table element
  table = @openElements.reverse.find { |candidate| candidate.name == "table" }

  # No table on the stack: use the first open element and append.
  return [@openElements[0], nil] unless table

  # XXX - we should really check that this parent is actually a
  # node here
  return [table.parent, table] if table.parent

  # The table has no parent: use the element opened just before it.
  [@openElements[@openElements.index(table) - 1], nil]
end
|
||||||
|
|
||||||
|
def generateImpliedEndTags(exclude = nil)
  # Keep popping the current node while it is one of the listed elements
  # and is not the excluded name.
  # XXX This is not entirely what the specification says. We should
  # investigate it more closely.
  loop do
    current = @openElements[-1].name
    closable = ['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(current)
    break unless closable && current != exclude
    @openElements.pop
  end
end
|
||||||
|
|
||||||
|
def getDocument
|
||||||
|
@document
|
||||||
|
end
|
||||||
|
|
||||||
|
def getFragment
|
||||||
|
#assert @innerHTML
|
||||||
|
fragment = @fragmentClass.new
|
||||||
|
@openElements[0].reparentChildren(fragment)
|
||||||
|
return fragment
|
||||||
|
end
|
||||||
|
|
||||||
|
# Serialize the subtree of node in the format required by unit tests
|
||||||
|
# node - the node from which to start serializing
|
||||||
|
def testSerializer(node)
|
||||||
|
raise NotImplementedError
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
|
@ -0,0 +1,211 @@
|
||||||
|
require 'html5lib/treebuilders/base'
|
||||||
|
require 'hpricot'
|
||||||
|
require 'forwardable'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
module TreeBuilders
|
||||||
|
module Hpricot
|
||||||
|
|
||||||
|
class Node < Base::Node
|
||||||
|
|
||||||
|
extend Forwardable
|
||||||
|
|
||||||
|
def_delegators :@hpricot, :name
|
||||||
|
|
||||||
|
attr_accessor :hpricot
|
||||||
|
|
||||||
|
def initialize(name)
|
||||||
|
super(name)
|
||||||
|
@hpricot = self.class.hpricot_class.new name
|
||||||
|
end
|
||||||
|
|
||||||
|
def appendChild(node)
|
||||||
|
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||||
|
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||||
|
else
|
||||||
|
childNodes << node
|
||||||
|
hpricot.children << node.hpricot
|
||||||
|
end
|
||||||
|
node.parent = self
|
||||||
|
end
|
||||||
|
|
||||||
|
def removeChild(node)
|
||||||
|
childNodes.delete(node)
|
||||||
|
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||||
|
node.parent = nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertText(data, before = nil)
|
||||||
|
if before
|
||||||
|
insertBefore(TextNode.new(data), before)
|
||||||
|
else
|
||||||
|
appendChild(TextNode.new(data))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Insert +node+ among this node's children, immediately before +refNode+.
#
# A TextNode that would end up directly after an existing TextNode is
# merged into that node's content instead of being inserted. Otherwise
# the node is inserted both into childNodes and into the underlying
# Hpricot child list, so the two stay in step the same way appendChild
# keeps them. In either case +node+ gets this node as its parent.
def insertBefore(node, refNode)
  index = childNodes.index(refNode)
  if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
    childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
  else
    childNodes.insert(index, node)
    # Locate the reference node in the Hpricot list itself (as removeChild
    # does); fall back to the wrapper index if it is not listed there.
    raw_index = hpricot.children.index(refNode.hpricot) || index
    hpricot.children.insert(raw_index, node.hpricot)
  end
  node.parent = self
end
|
||||||
|
|
||||||
|
def hasContent
|
||||||
|
childNodes.any?
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Element < Node
|
||||||
|
def self.hpricot_class
|
||||||
|
::Hpricot::Elem
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize(name)
|
||||||
|
super(name)
|
||||||
|
|
||||||
|
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||||
|
end
|
||||||
|
|
||||||
|
def name
|
||||||
|
@hpricot.stag.name
|
||||||
|
end
|
||||||
|
|
||||||
|
def cloneNode
|
||||||
|
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||||
|
node.hpricot[name] = value
|
||||||
|
node
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||||
|
# so alterations to the returned value (a hash) will be lost.
|
||||||
|
#
|
||||||
|
# AttributeProxy works around this by forwarding :[]= calls
|
||||||
|
# to the raw_attributes accessor on the element start tag.
|
||||||
|
#
|
||||||
|
class AttributeProxy
|
||||||
|
def initialize(hpricot)
|
||||||
|
@hpricot = hpricot
|
||||||
|
end
|
||||||
|
def []=(k, v)
|
||||||
|
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||||
|
end
|
||||||
|
def stag_attributes_method
|
||||||
|
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||||
|
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||||
|
end
|
||||||
|
def method_missing(*a, &b)
|
||||||
|
@hpricot.attributes.send(*a, &b)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def attributes
|
||||||
|
AttributeProxy.new(@hpricot)
|
||||||
|
end
|
||||||
|
|
||||||
|
def attributes=(attrs)
|
||||||
|
attrs.each { |name, value| @hpricot[name] = value }
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
tree = "\n|#{' ' * indent}<#{name}>"
|
||||||
|
indent += 2
|
||||||
|
attributes.each do |name, value|
|
||||||
|
next if name == 'xmlns'
|
||||||
|
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||||
|
end
|
||||||
|
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Document < Node
|
||||||
|
def self.hpricot_class
|
||||||
|
::Hpricot::Doc
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize
|
||||||
|
super(nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentType < Node
|
||||||
|
def self.hpricot_class
|
||||||
|
::Hpricot::DocType
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize(name)
|
||||||
|
begin
|
||||||
|
super(name)
|
||||||
|
rescue ArgumentError # needs 3...
|
||||||
|
end
|
||||||
|
|
||||||
|
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentFragment < Element
|
||||||
|
def initialize
|
||||||
|
super('')
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class TextNode < Node
|
||||||
|
def initialize(data)
|
||||||
|
@hpricot = ::Hpricot::Text.new(data)
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class CommentNode < Node
|
||||||
|
def self.hpricot_class
|
||||||
|
::Hpricot::Comment
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree(indent = 0)
|
||||||
|
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class TreeBuilder < Base::TreeBuilder
|
||||||
|
def initialize
|
||||||
|
@documentClass = Document
|
||||||
|
@doctypeClass = DocumentType
|
||||||
|
@elementClass = Element
|
||||||
|
@commentClass = CommentNode
|
||||||
|
@fragmentClass = DocumentFragment
|
||||||
|
end
|
||||||
|
|
||||||
|
def testSerializer(node)
|
||||||
|
node.printTree
|
||||||
|
end
|
||||||
|
|
||||||
|
def getDocument
|
||||||
|
@document.hpricot
|
||||||
|
end
|
||||||
|
|
||||||
|
def getFragment
|
||||||
|
@document = super
|
||||||
|
return @document.hpricot.children
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
require 'html5lib/treebuilders/base'
|
||||||
|
require 'rexml/document'
|
||||||
|
require 'forwardable'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
module TreeBuilders
|
||||||
|
module REXMLTree
|
||||||
|
|
||||||
|
class Node < Base::Node
|
||||||
|
extend Forwardable
|
||||||
|
def_delegators :@rxobj, :name, :attributes
|
||||||
|
attr_accessor :rxobj
|
||||||
|
|
||||||
|
def initialize name
|
||||||
|
super name
|
||||||
|
@rxobj = self.class.rxclass.new name
|
||||||
|
end
|
||||||
|
|
||||||
|
def appendChild node
|
||||||
|
if node.kind_of? TextNode and
|
||||||
|
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||||
|
childNodes[-1].rxobj.value =
|
||||||
|
childNodes[-1].rxobj.to_s + node.rxobj.to_s
|
||||||
|
childNodes[-1].rxobj.raw = true
|
||||||
|
else
|
||||||
|
childNodes.push node
|
||||||
|
rxobj.add node.rxobj
|
||||||
|
end
|
||||||
|
node.parent = self
|
||||||
|
end
|
||||||
|
|
||||||
|
def removeChild node
|
||||||
|
childNodes.delete node
|
||||||
|
rxobj.delete node.rxobj
|
||||||
|
node.parent = nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertText data, before=nil
|
||||||
|
if before
|
||||||
|
insertBefore TextNode.new(data), before
|
||||||
|
else
|
||||||
|
appendChild TextNode.new(data)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Insert +node+ among this node's children, immediately before +refNode+.
#
# A TextNode that would end up directly after an existing TextNode is
# merged into that node's REXML text (which is then flagged raw) instead
# of being inserted. Otherwise the node is inserted both into childNodes
# and into the underlying REXML object, mirroring what appendChild does
# with rxobj.add. In either case +node+ gets this node as its parent.
def insertBefore node, refNode
  index = childNodes.index(refNode)
  if node.kind_of?(TextNode) and index > 0 and
    childNodes[index-1].kind_of?(TextNode)
    childNodes[index-1].rxobj.value =
      childNodes[index-1].rxobj.to_s + node.rxobj.to_s
    childNodes[index-1].rxobj.raw = true
  else
    childNodes.insert index, node
    # REXML::Parent#insert_before(child1, child2) places child2 before
    # child1 in the parent's child list.
    rxobj.insert_before refNode.rxobj, node.rxobj
  end
  node.parent = self
end
|
||||||
|
|
||||||
|
def hasContent
|
||||||
|
return (childNodes.length > 0)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Element < Node
|
||||||
|
def self.rxclass
|
||||||
|
REXML::Element
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize name
|
||||||
|
super name
|
||||||
|
end
|
||||||
|
|
||||||
|
def cloneNode
|
||||||
|
newNode = self.class.new name
|
||||||
|
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||||
|
newNode
|
||||||
|
end
|
||||||
|
|
||||||
|
def attributes= value
|
||||||
|
value.each {|name,value| rxobj.attributes[name]=value}
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = "\n|#{' ' * indent}<#{name}>"
|
||||||
|
indent += 2
|
||||||
|
for name, value in attributes
|
||||||
|
next if name == 'xmlns'
|
||||||
|
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||||
|
end
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Document < Node
|
||||||
|
def self.rxclass
|
||||||
|
REXML::Document
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize
|
||||||
|
super nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def appendChild node
|
||||||
|
if node.kind_of? Element and node.name == 'html'
|
||||||
|
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
|
||||||
|
end
|
||||||
|
super node
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = "#document"
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent + 2)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentType < Node
|
||||||
|
def self.rxclass
|
||||||
|
REXML::DocType
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentFragment < Element
|
||||||
|
def initialize
|
||||||
|
super nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = ""
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent+2)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Text leaf of the REXML-backed tree; wraps a REXML::Text in rxobj.
class TextNode < Node
  # data - the unescaped character data for this node.
  #
  # The REXML::Text is created with its raw flag set (fourth argument),
  # and both merge paths in Node also set raw = true, so the string handed
  # over here must already have its markup characters escaped. '&' is
  # replaced first so the ampersands introduced by the other two
  # replacements are not escaped a second time.
  def initialize data
    raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
    @rxobj = REXML::Text.new(raw, true, nil, true)
  end

  # Serialize this node in the format used by the tree-construction tests.
  def printTree indent=0
    "\n|#{' ' * indent}\"#{rxobj.value}\""
  end
end
|
||||||
|
|
||||||
|
class CommentNode < Node
|
||||||
|
def self.rxclass
|
||||||
|
REXML::Comment
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class TreeBuilder < Base::TreeBuilder
|
||||||
|
def initialize
|
||||||
|
@documentClass = Document
|
||||||
|
@doctypeClass = DocumentType
|
||||||
|
@elementClass = Element
|
||||||
|
@commentClass = CommentNode
|
||||||
|
@fragmentClass = DocumentFragment
|
||||||
|
end
|
||||||
|
|
||||||
|
def testSerializer node
|
||||||
|
node.printTree()
|
||||||
|
end
|
||||||
|
|
||||||
|
def getDocument
|
||||||
|
@document.rxobj
|
||||||
|
end
|
||||||
|
|
||||||
|
def getFragment
|
||||||
|
@document = super
|
||||||
|
return @document.rxobj.children
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
require 'html5lib/treebuilders/base'
|
||||||
|
|
||||||
|
module HTML5lib
|
||||||
|
module TreeBuilders
|
||||||
|
module SimpleTree
|
||||||
|
|
||||||
|
class Node < Base::Node
|
||||||
|
# Node representing an item in the tree.
|
||||||
|
# name - The tag name associated with the node
|
||||||
|
attr_accessor :name
|
||||||
|
|
||||||
|
# The value of the current node (applies to text nodes and
|
||||||
|
# comments
|
||||||
|
attr_accessor :value
|
||||||
|
|
||||||
|
# a dict holding name, value pairs for attributes of the node
|
||||||
|
attr_accessor :attributes
|
||||||
|
|
||||||
|
def initialize name
|
||||||
|
super
|
||||||
|
@name = name
|
||||||
|
@value = nil
|
||||||
|
@attributes = {}
|
||||||
|
end
|
||||||
|
|
||||||
|
def appendChild node
|
||||||
|
if node.kind_of? TextNode and
|
||||||
|
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||||
|
childNodes[-1].value += node.value
|
||||||
|
else
|
||||||
|
childNodes.push node
|
||||||
|
end
|
||||||
|
node.parent = self
|
||||||
|
end
|
||||||
|
|
||||||
|
def removeChild node
|
||||||
|
childNodes.delete node
|
||||||
|
node.parent = nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def cloneNode
|
||||||
|
newNode = self.class.new name
|
||||||
|
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||||
|
newNode.value = value
|
||||||
|
newNode
|
||||||
|
end
|
||||||
|
|
||||||
|
def insertText data, before=nil
|
||||||
|
if before
|
||||||
|
insertBefore TextNode.new(data), before
|
||||||
|
else
|
||||||
|
appendChild TextNode.new(data)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Insert +node+ among this node's children, immediately before +refNode+.
#
# A TextNode that would end up directly after an existing TextNode has
# its value appended to that node instead of being inserted. In either
# case +node+ gets this node as its parent, matching appendChild.
def insertBefore node, refNode
  index = childNodes.index(refNode)
  if node.kind_of?(TextNode) and index > 0 and
    childNodes[index-1].kind_of?(TextNode)
    childNodes[index-1].value += node.value
  else
    childNodes.insert index, node
  end
  node.parent = self
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent + 2)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
|
||||||
|
def hasContent
|
||||||
|
return (childNodes.length > 0)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Element < Node
|
||||||
|
def to_s
|
||||||
|
"<%s>" % name
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||||
|
indent += 2
|
||||||
|
for name, value in attributes
|
||||||
|
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
|
||||||
|
end
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Document < Node
|
||||||
|
def to_s
|
||||||
|
"#document"
|
||||||
|
end
|
||||||
|
|
||||||
|
def initialize
|
||||||
|
super nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = to_s
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent + 2)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentType < Node
|
||||||
|
def to_s
|
||||||
|
"<!DOCTYPE %s>" % name
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class DocumentFragment < Element
|
||||||
|
def initialize
|
||||||
|
super nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def printTree indent=0
|
||||||
|
tree = ""
|
||||||
|
for child in childNodes
|
||||||
|
tree += child.printTree(indent+2)
|
||||||
|
end
|
||||||
|
return tree
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class TextNode < Node
|
||||||
|
def initialize value
|
||||||
|
super nil
|
||||||
|
@value = value
|
||||||
|
end
|
||||||
|
|
||||||
|
def to_s
|
||||||
|
'"%s"' % value
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class CommentNode < Node
|
||||||
|
def initialize value
|
||||||
|
super nil
|
||||||
|
@value = value
|
||||||
|
end
|
||||||
|
|
||||||
|
def to_s
|
||||||
|
"<!-- %s -->" % value
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class TreeBuilder < Base::TreeBuilder
|
||||||
|
def initialize
|
||||||
|
@documentClass = Document
|
||||||
|
@doctypeClass = DocumentType
|
||||||
|
@elementClass = Element
|
||||||
|
@commentClass = CommentNode
|
||||||
|
@fragmentClass = DocumentFragment
|
||||||
|
end
|
||||||
|
|
||||||
|
def testSerializer node
|
||||||
|
node.printTree()
|
||||||
|
end
|
||||||
|
|
||||||
|
def getFragment
|
||||||
|
@document = super
|
||||||
|
return @document.childNodes
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
11
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
Normal file
11
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
require 'test/unit'
|
||||||
|
|
||||||
|
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
|
||||||
|
|
||||||
|
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
|
||||||
|
|
||||||
|
$:.unshift File.dirname(__FILE__)
|
||||||
|
|
||||||
|
def html5lib_test_files(subdirectory)
|
||||||
|
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
|
||||||
|
end
|
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
Executable file
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
Executable file
|
@ -0,0 +1,36 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/inputstream'
|
||||||
|
|
||||||
|
class Html5EncodingTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
|
begin
|
||||||
|
require 'rubygems'
|
||||||
|
require 'UniversalDetector'
|
||||||
|
|
||||||
|
def test_chardet
|
||||||
|
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
|
||||||
|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
||||||
|
assert_equal 'big5', stream.charEncoding.downcase
|
||||||
|
end
|
||||||
|
end
|
||||||
|
rescue LoadError
|
||||||
|
puts "chardet not found, skipping chardet tests"
|
||||||
|
end
|
||||||
|
|
||||||
|
html5lib_test_files('encoding').each do |test_file|
|
||||||
|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||||
|
|
||||||
|
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||||
|
next if data.empty?
|
||||||
|
input, encoding = data.split(/\n#encoding\s+/, 2)
|
||||||
|
encoding = encoding.split[0]
|
||||||
|
|
||||||
|
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||||
|
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
||||||
|
assert_equal encoding.downcase, stream.charEncoding.downcase, input
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
212
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
Executable file
212
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
Executable file
|
@ -0,0 +1,212 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/liberalxmlparser'
|
||||||
|
|
||||||
|
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
|
||||||
|
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
|
||||||
|
|
||||||
|
# Parse +input+ with +parser+ and compare the serialized root element
# (single quotes turned into double quotes) with the expected markup.
#
# input    - markup to parse; a trailing newline is removed first.
# expected - exact serialization to compare against. When omitted, the
#            input itself is the expectation: both sides have the tokens
#            of each tag's attribute run sorted so attribute order does
#            not matter, and decimal character references in the
#            expectation are replaced by the characters they name.
# parser   - object whose parse(string) result responds to root.
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
  # The replacement has to be computed per match, so it is a block; a
  # plain replacement string would be substituted literally for every tag.
  sort_attrs = lambda do |xml|
    xml.gsub(XMLELEM) { "<#{$1}#{$2.split.sort.join(' ')}#{$3}>" }
  end

  document = parser.parse(input.chomp).root
  actual = document.to_s.gsub(/'/, '"')

  if expected
    assert_equal(expected, actual)
  else
    expected = sort_attrs.call(input.chomp)
    expected = expected.gsub(/&#(\d+);/) { [$1.to_i].pack('U') }
    assert_equal(expected, sort_attrs.call(actual))
  end
end
|
||||||
|
|
||||||
|
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
|
||||||
|
assert_xml_equal(input, expected, parser)
|
||||||
|
end
|
||||||
|
|
||||||
|
class BasicXhtml5Test < Test::Unit::TestCase
|
||||||
|
|
||||||
|
def test_title_body_mismatched_close
|
||||||
|
assert_xhtml_equal(
|
||||||
|
'<title>Xhtml</title><b><i>content</b></i>',
|
||||||
|
'<html xmlns="http://www.w3.org/1999/xhtml">' +
|
||||||
|
'<head><title>Xhtml</title></head>' +
|
||||||
|
'<body><b><i>content</i></b></body>' +
|
||||||
|
'</html>')
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_title_body_named_charref
|
||||||
|
assert_xhtml_equal(
|
||||||
|
'<title>mdash</title>A &mdash B',
|
||||||
|
'<html xmlns="http://www.w3.org/1999/xhtml">' +
|
||||||
|
'<head><title>mdash</title></head>' +
|
||||||
|
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
|
||||||
|
'</html>')
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class BasicXmlTest < Test::Unit::TestCase
|
||||||
|
|
||||||
|
def test_comment
|
||||||
|
assert_xml_equal("<x><!-- foo --></x>")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_cdata
|
||||||
|
assert_xml_equal("<x><![CDATA[foo]]></x>","<x>foo</x>")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_simple_text
|
||||||
|
assert_xml_equal("<p>foo</p>","<p>foo</p>")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_optional_close
|
||||||
|
assert_xml_equal("<p>foo","<p>foo</p>")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_html_mismatched
|
||||||
|
assert_xml_equal("<b><i>foo</b></i>","<b><i>foo</i></b>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class OpmlTest < Test::Unit::TestCase
|
||||||
|
|
||||||
|
def test_mixedCaseElement
|
||||||
|
assert_xml_equal(
|
||||||
|
'<opml version="1.0">' +
|
||||||
|
'<head><ownerName>Dave Winer</ownerName></head>' +
|
||||||
|
'</opml>')
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_mixedCaseAttribute
|
||||||
|
assert_xml_equal(
|
||||||
|
'<opml version="1.0">' +
|
||||||
|
'<body><outline isComment="true"/></body>' +
|
||||||
|
'</opml>')
|
||||||
|
end
|
||||||
|
|
||||||
|
# A bare ampersand inside an attribute value is malformed XML; the
# expected serialization carries it as an escaped entity.
def test_malformed
  assert_xml_equal(
    '<opml version="1.0">' +
    '<body><outline text="Odds & Ends"/></body>' +
    '</opml>',
    '<opml version="1.0">' +
    '<body><outline text="Odds &amp; Ends"/></body>' +
    '</opml>')
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class XhtmlTest < Test::Unit::TestCase
|
||||||
|
|
||||||
|
def test_mathml
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>MathML</title></head>
|
||||||
|
<body>
|
||||||
|
<math xmlns="http://www.w3.org/1998/Math/MathML">
|
||||||
|
<mrow>
|
||||||
|
<mi>x</mi>
|
||||||
|
<mo>=</mo>
|
||||||
|
|
||||||
|
<mfrac>
|
||||||
|
<mrow>
|
||||||
|
<mrow>
|
||||||
|
<mo>-</mo>
|
||||||
|
<mi>b</mi>
|
||||||
|
</mrow>
|
||||||
|
<mo>±</mo>
|
||||||
|
<msqrt>
|
||||||
|
|
||||||
|
<mrow>
|
||||||
|
<msup>
|
||||||
|
<mi>b</mi>
|
||||||
|
<mn>2</mn>
|
||||||
|
</msup>
|
||||||
|
<mo>-</mo>
|
||||||
|
<mrow>
|
||||||
|
|
||||||
|
<mn>4</mn>
|
||||||
|
<mo>⁢</mo>
|
||||||
|
<mi>a</mi>
|
||||||
|
<mo>⁢</mo>
|
||||||
|
<mi>c</mi>
|
||||||
|
</mrow>
|
||||||
|
</mrow>
|
||||||
|
|
||||||
|
</msqrt>
|
||||||
|
</mrow>
|
||||||
|
<mrow>
|
||||||
|
<mn>2</mn>
|
||||||
|
<mo>⁢</mo>
|
||||||
|
<mi>a</mi>
|
||||||
|
</mrow>
|
||||||
|
</mfrac>
|
||||||
|
|
||||||
|
</mrow>
|
||||||
|
</math>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_svg
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>SVG</title></head>
|
||||||
|
<body>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||||
|
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
|
||||||
|
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
|
||||||
|
</path>
|
||||||
|
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
|
||||||
|
</circle>
|
||||||
|
|
||||||
|
</svg>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_xlink
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>XLINK</title></head>
|
||||||
|
<body>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||||
|
<defs xmlns:l="http://www.w3.org/1999/xlink">
|
||||||
|
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
|
||||||
|
<stop stop-color="#FE8"/>
|
||||||
|
<stop stop-color="#D70" offset="1"/>
|
||||||
|
</radialGradient>
|
||||||
|
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
|
||||||
|
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
|
||||||
|
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
|
||||||
|
</defs>
|
||||||
|
<g stroke="#940">
|
||||||
|
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
|
||||||
|
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
|
||||||
|
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
|
||||||
|
|
||||||
|
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
|
||||||
|
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
|
||||||
|
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_br
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>XLINK</title></head>
|
||||||
|
<body>
|
||||||
|
<br/>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def xtest_strong
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>XLINK</title></head>
|
||||||
|
<body>
|
||||||
|
<strong></strong>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
end
|
108
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
Normal file
108
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/treebuilders'
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
|
||||||
|
|
||||||
|
$tree_types_to_test = ['simpletree', 'rexml']
|
||||||
|
|
||||||
|
begin
|
||||||
|
require 'hpricot'
|
||||||
|
$tree_types_to_test.push('hpricot')
|
||||||
|
rescue LoadError
|
||||||
|
end
|
||||||
|
|
||||||
|
$CHECK_PARSER_ERRORS = false
|
||||||
|
|
||||||
|
puts 'Testing: ' + $tree_types_to_test * ', '
|
||||||
|
|
||||||
|
|
||||||
|
class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
|
# Reports whether +b+ begins with +a+.
#
# Note the argument order: the prefix comes first, the value being
# inspected second.
def self.startswith?(a, b)
  prefix_length = a.length
  leading_part = b[0, prefix_length]
  leading_part == a
end
|
||||||
|
|
||||||
|
# Splits the text of one tree-construction test case into its sections.
#
# Lines are collected into the "current" section, which starts out as the
# parser input and is switched by marker lines:
#
# * "#errors"                  - following lines are expected errors
# * "#document"                - following lines are the expected tree dump
# * "#document-fragment NAME"  - as "#document", and NAME (everything after
#                                the marker and one separator character) is
#                                remembered as the innerHTML context
#
# Empty lines and any other line starting with "#errors", "#document" or
# "#data" are dropped. While collecting the tree dump, lines starting with
# "|" lose their first two characters.
#
# Returns [innerHTML, input, output, errors]: innerHTML is nil unless a
# "#document-fragment" marker was seen, input and output are the collected
# lines joined with "\n", and errors is an Array of lines.
#
# Raises ArgumentError when a "#document-fragment" marker carries no
# context element name.
def self.parseTestcase(data)
  innerHTML = nil
  input = []
  output = []
  errors = []
  currentList = input

  data.split(/\n/).each do |line|
    next if line.empty?

    # "#document" also covers "#document-fragment", so three prefixes suffice.
    unless line.start_with?("#errors", "#document", "#data")
      # Identity check on purpose: `==` would also be true while input and
      # output are both still empty, mangling input lines that begin with "|".
      if currentList.equal?(output) && line.start_with?("|")
        currentList.push(line[2..-1])
      else
        currentList.push(line)
      end
      next
    end

    if line == "#errors"
      currentList = errors
    elsif line == "#document" || line.start_with?("#document-fragment")
      if line.start_with?("#document-fragment")
        # Skip the 18-character marker plus the separator that follows it.
        innerHTML = line[19..-1]
        raise ArgumentError, "#document-fragment without a context element" unless innerHTML
      end
      currentList = output
    end
  end

  return innerHTML, input.join("\n"), output.join("\n"), errors
end
|
||||||
|
|
||||||
|
# convert the output of str(document) to the format used in the testcases
|
||||||
|
# Drops the first line of +treedump+ and, on every remaining line that is
# longer than two characters and starts with "|", removes the first three
# characters. Other lines are kept unchanged. Returns the lines joined
# with "\n"; an empty dump yields an empty string.
def convertTreeDump(treedump)
  # drop(1) rather than [1..-1]: the slice is nil for an empty dump.
  body_lines = treedump.split(/\n/).drop(1)
  body_lines.map do |line|
    (line.length > 2 && line[0] == ?|) ? line[3..-1] : line
  end.join("\n")
end
|
||||||
|
|
||||||
|
# Sorts each run of two or more consecutive, identically indented
# "name=..." lines alphabetically. All other text is returned untouched.
def sortattrs(output)
  attribute_run = /^(\s+)\w+=.*(\n\1\w+=.*)+/
  output.gsub(attribute_run) do |run|
    run_lines = run.split("\n")
    run_lines.sort.join("\n")
  end
end
|
||||||
|
|
||||||
|
# Defines one test method per (test case, tree builder) combination found
# in the 'tree-construction' test files. Each generated method parses the
# case's input (as a fragment when the case names an innerHTML context)
# and compares the attribute-sorted tree dump with the expected one.
html5lib_test_files('tree-construction').each do |test_file|

  test_name = File.basename(test_file).sub('.dat', '')

  File.read(test_file).split("#data\n").each_with_index do |data, index|
    next if data.empty?

    innerHTML, input, expected_output, expected_errors = parseTestcase(data)

    $tree_types_to_test.each do |tree_name|
      define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

        parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))

        if innerHTML
          parser.parseFragment(input, innerHTML)
        else
          parser.parse(input)
        end

        actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

        assert_equal sortattrs(expected_output), sortattrs(actual_output), [
          'Input:', input,
          'Expected:', expected_output,
          'Received:', actual_output
        ].join("\n")

        # Error checking only runs when $CHECK_PARSER_ERRORS is set.
        if $CHECK_PARSER_ERRORS
          actual_errors = parser.errors.map do |(line, col), message|
            'Line: %i Col: %i %s' % [line, col, message]
          end
          # assert_equal takes the expected value first.
          assert_equal expected_errors.length, parser.errors.length, [
            'Expected errors:', expected_errors.join("\n"),
            'Actual errors:', actual_errors.join("\n")
          ].join("\n")
        end

      end
    end
  end
end
|
||||||
|
|
||||||
|
end
|
206
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
Normal file
206
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/sanitizer'
|
||||||
|
require 'html5lib/html5parser'
|
||||||
|
require 'html5lib/liberalxmlparser'
|
||||||
|
|
||||||
|
class SanitizeTest < Test::Unit::TestCase
|
||||||
|
include HTML5lib
|
||||||
|
|
||||||
|
# Parses +stream+ as a fragment with XHTMLParser, using HTMLSanitizer as
# the tokenizer, joins the result into one string and turns every single
# quote into a double quote.
def sanitize_xhtml stream
  fragments = XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  joined = fragments.join('')
  joined.gsub(/'/,'"')
end
|
||||||
|
|
||||||
|
# Parses +stream+ as a fragment with HTMLParser, using HTMLSanitizer as
# the tokenizer, joins the result into one string and turns every single
# quote into a double quote.
def sanitize_html stream
  fragments = HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  joined = fragments.join('')
  joined.gsub(/'/,'"')
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||||
|
next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
|
||||||
|
define_method "test_should_allow_#{tag_name}_tag" do
|
||||||
|
if tag_name == 'image'
|
||||||
|
assert_equal "<img title=\"1\"/>foo <bad>bar</bad> baz",
|
||||||
|
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||||
|
elsif VOID_ELEMENTS.include?(tag_name)
|
||||||
|
assert_equal "<#{tag_name} title=\"1\"/>foo <bad>bar</bad> baz",
|
||||||
|
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||||
|
else
|
||||||
|
assert_equal "<#{tag_name.downcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.downcase}>",
|
||||||
|
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||||
|
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz</#{tag_name}>",
|
||||||
|
sanitize_xhtml("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||||
|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
||||||
|
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
|
||||||
|
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||||
|
next if attribute_name == 'style'
|
||||||
|
define_method "test_should_allow_#{attribute_name}_attribute" do
|
||||||
|
assert_equal "<p #{attribute_name.downcase}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||||
|
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||||
|
assert_equal "<p #{attribute_name}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||||
|
sanitize_xhtml("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||||
|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
||||||
|
assert_equal "<p>foo <bad>bar</bad> baz</p>",
|
||||||
|
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||||
|
define_method "test_should_allow_#{protocol}_uris" do
|
||||||
|
assert_equal "<a href=\"#{protocol}\">foo</a>",
|
||||||
|
sanitize_html(%(<a href="#{protocol}">foo</a>))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||||
|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
||||||
|
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
|
||||||
|
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_allow_anchors
|
||||||
|
assert_equal "<a href=\"foo\"><script>baz</script></a>",
|
||||||
|
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
|
||||||
|
end
|
||||||
|
|
||||||
|
# RFC 3986, sec 4.2
|
||||||
|
def test_allow_colons_in_path_component
|
||||||
|
assert_equal "<a href=\"./this:that\">foo</a>",
|
||||||
|
sanitize_html("<a href=\"./this:that\">foo</a>")
|
||||||
|
end
|
||||||
|
|
||||||
|
%w(src width height alt).each do |img_attr|
|
||||||
|
define_method "test_should_allow_image_#{img_attr}_attribute" do
|
||||||
|
assert_equal "<img #{img_attr}=\"foo\"/>",
|
||||||
|
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_handle_non_html
|
||||||
|
assert_equal 'abc', sanitize_html("abc")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_handle_blank_text
|
||||||
|
assert_equal '', sanitize_html('')
|
||||||
|
end
|
||||||
|
|
||||||
|
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||||
|
close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo</#{tag}>"
|
||||||
|
|
||||||
|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
|
||||||
|
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
|
||||||
|
end
|
||||||
|
|
||||||
|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
|
||||||
|
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
[%(<img src="javascript:alert('XSS');" />),
|
||||||
|
%(<img src=javascript:alert('XSS') />),
|
||||||
|
%(<img src="JaVaScRiPt:alert('XSS')" />),
|
||||||
|
%(<img src='javascript:alert("XSS")' />),
|
||||||
|
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src='javascript:alert('XSS')' />),
|
||||||
|
%(<img src="jav\tascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav	ascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav
ascript:alert('XSS');" />),
|
||||||
|
%(<img src="jav
ascript:alert('XSS');" />),
|
||||||
|
%(<img src="  javascript:alert('XSS');" />),
|
||||||
|
%(<img src=" javascript:alert('XSS');" />),
|
||||||
|
%(<img src=" javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
|
||||||
|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
|
||||||
|
assert_equal "<img/>", sanitize_html(img_hack)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_tag_broken_up_by_null
|
||||||
|
assert_equal "<scr\357\277\275ipt>alert(\"XSS\")</scr\357\277\275ipt>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_invalid_script_tag
|
||||||
|
assert_equal "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_script_tag_with_multiple_open_brackets
|
||||||
|
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
|
||||||
|
assert_equal %(<iframe src=\"http://ha.ckers.org/scriptlet.html\"><), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_unclosed_script
|
||||||
|
assert_equal "<script src=\"http://ha.ckers.org/xss.js?\"><b/>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_sanitize_half_open_scripts
|
||||||
|
assert_equal "<img/>", sanitize_html(%(<img src="javascript:alert('XSS')"))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_should_not_fall_for_ridiculous_hack
|
||||||
|
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
|
||||||
|
assert_equal "<img/>", sanitize_html(img_hack)
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_platypus
|
||||||
|
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
|
||||||
|
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_xul
|
||||||
|
assert_equal %(<p style="">fubar</p>),
|
||||||
|
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_input_image
|
||||||
|
assert_equal %(<input type="image"/>),
|
||||||
|
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_non_alpha_non_digit
|
||||||
|
assert_equal "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||||
|
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
|
||||||
|
assert_equal "<a>foo</a>",
|
||||||
|
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
|
||||||
|
assert_equal "<img src=\"http://ha.ckers.org/xss.js\"/>",
|
||||||
|
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_img_dynsrc_lowsrc
|
||||||
|
assert_equal "<img/>",
|
||||||
|
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
|
||||||
|
assert_equal "<img/>",
|
||||||
|
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_div_background_image_unicode_encoded
|
||||||
|
assert_equal '<div style="">foo</div>',
|
||||||
|
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_div_expression
|
||||||
|
assert_equal '<div style="">foo</div>',
|
||||||
|
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_img_vbscript
|
||||||
|
assert_equal '<img/>',
|
||||||
|
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
78
vendor/plugins/HTML5lib/tests/test_tokenizer.rb
vendored
Normal file
78
vendor/plugins/HTML5lib/tests/test_tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
|
require 'html5lib/tokenizer'
|
||||||
|
|
||||||
|
require 'tokenizer_test_parser'
|
||||||
|
|
||||||
|
begin
|
||||||
|
require 'jsonx'
|
||||||
|
rescue LoadError
|
||||||
|
# Minimal stand-in used when the JSON library cannot be loaded: rewrites
# the JSON text into Ruby literal syntax and evaluates it.
class JSON
  # Converts +json+ to Ruby data.
  #
  # Every `"` followed by optional whitespace and `:` becomes `"=>`, and
  # each \uXXXX escape is replaced by the UTF-8 encoding of that code
  # point; the result is then evaluated as Ruby source. The caller's
  # string is left unmodified, so frozen input is accepted.
  #
  # NOTE(review): eval executes the text as Ruby code, so this must only
  # ever be given trusted fixture files. The textual rewrite also applies
  # inside string values that happen to contain `":`.
  def self.parse(json)
    ruby_source = json.gsub(/"\s*:/, '"=>')
    ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) { |escape| [escape[2..-1].to_i(16)].pack('U') }
    eval ruby_source
  end
end
|
||||||
|
end
|
||||||
|
|
||||||
|
class Html5TokenizerTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
|
# True when +token+ is not the bare 'ParseError' marker and its first
# element equals +token_name+.
def type_of?(token_name, token)
  return false if token == 'ParseError'
  token_name == token.first
end
|
||||||
|
|
||||||
|
# Returns a new Array holding the same token objects as +tokens+. Every
# 'StartTag' token has its third element replaced, in place, by a Hash
# built from the reversed and flattened attribute list, so for a repeated
# attribute name the first occurrence is the one kept.
def convert_attribute_arrays_to_hashes(tokens)
  tokens.map do |token|
    if type_of?('StartTag', token)
      attribute_pairs = token[2].reverse.flatten
      token[2] = Hash[*attribute_pairs]
    end
    token
  end
end
|
||||||
|
|
||||||
|
# Returns a new Array in which each run of adjacent 'Character' tokens is
# collapsed into the first token of the run, whose text becomes the
# concatenation of the run's texts. That first token object is updated in
# place; all other tokens are carried over as they are.
def concatenate_consecutive_characters(tokens)
  tokens.each_with_object([]) do |token, merged|
    if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
      merged.last[1] = merged.last[1] + token[1]
    else
      merged << token
    end
  end
end
|
||||||
|
|
||||||
|
# Runs one tokenizer test description once per content model flag listed
# under 'contentModelFlags' (just :PCDATA when that key is absent).
#
# For each flag the input is tokenized, optionally after seeding the
# tokenizer's current token from 'lastStartTag', and the normalized token
# list is compared with the normalized 'output' entry. Any exception
# raised along the way is reported through assert_nothing_raised.
def tokenizer_test(data)
  flags = data['contentModelFlags'] || [:PCDATA]

  flags.each do |content_model_flag|
    message = [
      'Description:', data['description'],
      'Input:', data['input'],
      'Content Model Flag:', content_model_flag ].join("\n")

    assert_nothing_raised message do
      tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
      tokenizer.contentModelFlag = content_model_flag.to_sym

      if data.has_key?('lastStartTag')
        tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']}
      end

      raw_tokens = TokenizerTestParser.new(tokenizer).parse
      with_hashes = convert_attribute_arrays_to_hashes(raw_tokens)

      actual = concatenate_consecutive_characters(with_hashes)
      expected = concatenate_consecutive_characters(data['output'])

      assert_equal expected, actual, message
    end
  end
end
|
||||||
|
|
||||||
|
# Defines one test method per entry under the 'tests' key of every
# 'tokenizer' test file; methods are named after the file (without its
# '.test' part) and the entry's 1-based position.
html5lib_test_files('tokenizer').each do |test_file|
  test_name = File.basename(test_file).sub('.test', '')
  parsed_file = JSON.parse(File.read(test_file))

  parsed_file['tests'].each_with_index do |data, index|
    case_number = index + 1
    define_method("test_#{test_name}_#{case_number}") { tokenizer_test data }
  end
end
|
||||||
|
|
||||||
|
end
|
||||||
|
|
62
vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
vendored
Normal file
62
vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
vendored
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
require 'html5lib/constants'
|
||||||
|
|
||||||
|
# Turns the token stream produced by a tokenizer into the array format
# used by the tokenizer test expectations.
#
# The tokenizer only needs to respond to +each+ and yield tokens that
# answer to [:type], [:name] and [:data]; each token is dispatched to the
# "process<Type>" method named after its :type.
class TokenizerTestParser
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes every token and returns the collected output tokens.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a start tag, preceded by a parse error
  # when the element is not listed in HTML5lib::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5lib::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag whose :data is non-empty is preceded by a parse error.
  def processEndTag(token)
    processParseError(token) if token[:data].length > 0
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Space characters are reported exactly like other characters.
  alias processSpaceCharacters processCharacters

  # End of input produces no output token.
  def processEOF(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
3
vendor/plugins/maruku/lib/maruku/defaults.rb
vendored
3
vendor/plugins/maruku/lib/maruku/defaults.rb
vendored
|
@ -31,6 +31,9 @@ Globals = {
|
||||||
:maruku_signature => false,
|
:maruku_signature => false,
|
||||||
:code_background_color => '#fef',
|
:code_background_color => '#fef',
|
||||||
:code_show_spaces => false,
|
:code_show_spaces => false,
|
||||||
|
|
||||||
|
:filter_html => false,
|
||||||
|
|
||||||
:html_math_output_mathml => true, # also set :html_math_engine
|
:html_math_output_mathml => true, # also set :html_math_engine
|
||||||
:html_math_engine => 'itex2mml', #ritex, itex2mml, none
|
:html_math_engine => 'itex2mml', #ritex, itex2mml, none
|
||||||
|
|
||||||
|
|
|
@ -477,7 +477,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
end
|
end
|
||||||
|
|
||||||
id = match[1]; url = match[2]; title = match[3];
|
id = match[1]; url = match[2]; title = match[3];
|
||||||
id = id.strip.downcase.gsub(' ','_')
|
id = sanitize_ref_id(id)
|
||||||
|
|
||||||
hash = self.refs[id] = {:url=>url,:title=>title}
|
hash = self.refs[id] = {:url=>url,:title=>title}
|
||||||
|
|
||||||
|
|
|
@ -287,7 +287,7 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
|
||||||
end
|
end
|
||||||
|
|
||||||
def extension_meta(src, con, break_on_chars)
|
def extension_meta(src, con, break_on_chars)
|
||||||
if m = src.read_regexp(/([^\s\:]+):/)
|
if m = src.read_regexp(/([^\s\:\"\']+):/)
|
||||||
name = m[1]
|
name = m[1]
|
||||||
al = read_attribute_list(src, con, break_on_chars)
|
al = read_attribute_list(src, con, break_on_chars)
|
||||||
# puts "#{name}=#{al.inspect}"
|
# puts "#{name}=#{al.inspect}"
|
||||||
|
@ -581,9 +581,9 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
|
||||||
ref_id = read_ref_id(src,con)
|
ref_id = read_ref_id(src,con)
|
||||||
if ref_id
|
if ref_id
|
||||||
if ref_id.size == 0
|
if ref_id.size == 0
|
||||||
ref_id = children.to_s.downcase.gsub(' ','_')
|
ref_id = sanitize_ref_id(children.to_s)
|
||||||
else
|
else
|
||||||
ref_id = ref_id.downcase
|
ref_id = sanitize_ref_id(ref_id)
|
||||||
end
|
end
|
||||||
con.push_element md_link(children, ref_id)
|
con.push_element md_link(children, ref_id)
|
||||||
else
|
else
|
||||||
|
|
|
@ -108,6 +108,7 @@ module MaRuKu
|
||||||
# Input is a LineSource
|
# Input is a LineSource
|
||||||
def t2_parse_blocks(src, output)
|
def t2_parse_blocks(src, output)
|
||||||
while src.cur_line
|
while src.cur_line
|
||||||
|
l = src.shift_line
|
||||||
|
|
||||||
# ignore empty line
|
# ignore empty line
|
||||||
if l.t2_empty? then
|
if l.t2_empty? then
|
||||||
|
@ -115,7 +116,6 @@ module MaRuKu
|
||||||
next
|
next
|
||||||
end
|
end
|
||||||
|
|
||||||
l = src.shift_line
|
|
||||||
# TODO: lists
|
# TODO: lists
|
||||||
# TODO: xml
|
# TODO: xml
|
||||||
# TODO: `==`
|
# TODO: `==`
|
||||||
|
|
|
@ -741,7 +741,17 @@ of the form `#ff00ff`.
|
||||||
return a
|
return a
|
||||||
end
|
end
|
||||||
|
|
||||||
|
=begin maruku_doc
|
||||||
|
Attribute: filter_html
|
||||||
|
Scope: document
|
||||||
|
|
||||||
|
If true, raw HTML is discarded from the output.
|
||||||
|
|
||||||
|
=end
|
||||||
|
|
||||||
def to_html_raw_html
|
def to_html_raw_html
|
||||||
|
return [] if get_setting(:filter_html)
|
||||||
|
|
||||||
raw_html = self.raw_html
|
raw_html = self.raw_html
|
||||||
if rexml_doc = @parsed_html
|
if rexml_doc = @parsed_html
|
||||||
root = rexml_doc.root
|
root = rexml_doc.root
|
||||||
|
|
|
@ -152,7 +152,7 @@ end end
|
||||||
module MaRuKu; module Out; module Latex
|
module MaRuKu; module Out; module Latex
|
||||||
|
|
||||||
def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end
|
def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end
|
||||||
def to_latex_linebreak; "\\linebreak " end
|
def to_latex_linebreak; "\\newline " end
|
||||||
|
|
||||||
def to_latex_paragraph
|
def to_latex_paragraph
|
||||||
children_to_latex+"\n\n"
|
children_to_latex+"\n\n"
|
||||||
|
|
|
@ -146,6 +146,10 @@ module MaRuKu; module Strings
|
||||||
s[0, i+1].strip
|
s[0, i+1].strip
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Normalizes a reference id: lower-cases it, turns spaces into
# underscores, then removes every remaining non-word character.
def sanitize_ref_id(x)
  lowered = x.downcase
  underscored = lowered.gsub(' ','_')
  underscored.gsub(/[^\w]/,'')
end
|
||||||
|
|
||||||
|
|
||||||
# removes initial quote
|
# removes initial quote
|
||||||
def unquote(s)
|
def unquote(s)
|
||||||
|
|
|
@ -155,7 +155,7 @@ module MaRuKu; module Tests
|
||||||
["[a]", [ md_link(["a"],'a')], 'Empty link'],
|
["[a]", [ md_link(["a"],'a')], 'Empty link'],
|
||||||
["[a][]", ],
|
["[a][]", ],
|
||||||
["[a][]b", [ md_link(["a"],'a'),'b'], 'Empty link'],
|
["[a][]b", [ md_link(["a"],'a'),'b'], 'Empty link'],
|
||||||
["[a\\]][]", [ md_link(["a]"],'a]')], 'Escape inside link'],
|
["[a\\]][]", [ md_link(["a]"],'a')], 'Escape inside link (throw ?] away)'],
|
||||||
|
|
||||||
["[a", :throw, 'Link not closed'],
|
["[a", :throw, 'Link not closed'],
|
||||||
["[a][", :throw, 'Ref not closed'],
|
["[a][", :throw, 'Ref not closed'],
|
||||||
|
|
2
vendor/plugins/maruku/lib/maruku/version.rb
vendored
2
vendor/plugins/maruku/lib/maruku/version.rb
vendored
|
@ -19,7 +19,7 @@
|
||||||
#++
|
#++
|
||||||
|
|
||||||
module MaRuKu
|
module MaRuKu
|
||||||
Version = '0.5.5'
|
Version = '0.5.6'
|
||||||
|
|
||||||
MarukuURL = 'http://maruku.rubyforge.org/'
|
MarukuURL = 'http://maruku.rubyforge.org/'
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue