HTML5lib Sanitizer
Replaced native Sanitizer with HTML5lib version. Synced with latest Maruku.
This commit is contained in:
parent
457ec8627c
commit
6b21ac484f
36 changed files with 6534 additions and 215 deletions
|
@ -294,13 +294,13 @@ class WikiController < ApplicationController
|
|||
|
||||
def s5
|
||||
if @web.markup == :markdownMML
|
||||
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||
@s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
|
||||
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
|
||||
:author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
|
||||
elsif @web.markup == :markdown
|
||||
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||
@s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||
{:math_enabled => false, :content_only => true,
|
||||
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr
|
||||
:author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
|
||||
else
|
||||
@s5_content = "S5 not supported with this text filter"
|
||||
end
|
||||
|
|
207
attic/lib/sanitize.rb
Normal file
207
attic/lib/sanitize.rb
Normal file
|
@ -0,0 +1,207 @@
|
|||
module Sanitize
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
#
|
||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||
|
||||
require 'html/tokenizer'
|
||||
require 'node'
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
||||
'ul', 'var']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
|
||||
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
|
||||
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
|
||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
||||
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
||||
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
||||
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
|
||||
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
|
||||
'offset', 'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
|
||||
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
|
||||
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target',
|
||||
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
||||
'underline-position', 'underline-thickness', 'unicode',
|
||||
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
||||
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
||||
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
||||
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
||||
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
# Public whitelists consulted by sanitize_html / sanitize_css.
#
# Each constant is only assigned when it is not already defined, so a caller
# can customise the sanitizer by defining any of them before this module is
# loaded.
#
# The element and attribute whitelists are built by concatenating the HTML,
# MathML and SVG lists, which overlap (e.g. 'a' is both an HTML and an SVG
# element, 'width' is listed several times). Duplicates are removed so that
# membership checks do not scan repeated entries and code that iterates over
# the lists sees each name exactly once.
ALLOWED_ELEMENTS = (acceptable_elements + mathml_elements + svg_elements).uniq unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = (acceptable_attributes + mathml_attributes + svg_attributes).uniq unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
#
#   sanitize_html('<script> do_nasty_stuff() </script>')
#    => &lt;script> do_nasty_stuff() &lt;/script>
#   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
#
# Returns +html+ itself when it contains no "<" at all, otherwise a newly
# built string.
def sanitize_html(html)
  # Nothing that could be markup: hand the input back untouched.
  return html unless html.index("<")

  tokenizer = HTML::Tokenizer.new(html)
  new_text = ""

  while token = tokenizer.next
    node = XHTML::Node.parse(nil, 0, 0, token, false)
    new_text << case node.tag?
      when true
        if ALLOWED_ELEMENTS.include?(node.name)
          # Closing tags carry no attributes worth inspecting.
          if node.closing != :close
            node.attributes.delete_if { |attr, _v| !ALLOWED_ATTRIBUTES.include?(attr) }
            ATTR_VAL_IS_URI.each do |attr|
              # Decode entities and drop whitespace/control bytes before looking
              # at the scheme, so "jav&#x09;ascript:" and friends are recognised.
              val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
              if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                node.attributes.delete attr
              end
            end
            if node.attributes['style']
              node.attributes['style'] = sanitize_css(node.attributes['style'])
            end
          end
          node.to_s
        else
          # Disallowed element: neutralise it by escaping the opening bracket.
          node.to_s.gsub(/</, "&lt;")
        end
      else
        # Non-tag token (text etc.): escape any stray "<".
        node.to_s.gsub(/</, "&lt;")
    end
  end

  new_text
end
|
||||
|
||||
# Reduce an inline +style+ attribute value to a safe subset.
#
# style - the raw attribute value (anything responding to to_s; nil is
#         treated as an empty string).
#
# Returns a String of "prop: value;" declarations joined by single spaces,
# or '' when the value as a whole looks suspicious.
#
# A declaration is kept when
# * its property is in ALLOWED_CSS_PROPERTIES, or
# * its property starts with background/border/margin/padding and every
#   whitespace-separated token of the value is an ALLOWED_CSS_KEYWORDS entry,
#   a hex colour, an rgb() triple or a short length, or
# * its property is in ALLOWED_SVG_PROPERTIES.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A/\z so the WHOLE value has to match.
  # With ^/$ a multi-line value would be accepted as soon as any one of its
  # lines was harmless, letting the remaining lines through unchecked.
  return '' if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\z/

  shorthand_prefixes = %w[background border margin padding]
  safe_value = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    name = prop.downcase
    keep =
      if ALLOWED_CSS_PROPERTIES.include?(name)
        true
      elsif shorthand_prefixes.include?(name.split('-').first)
        # Shorthand-style properties: vet every token of the value.
        val.split.all? { |keyword| ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ safe_value }
      else
        ALLOWED_SVG_PROPERTIES.include?(name)
      end
    clean << "#{prop}: #{val};" if keep
  end

  clean.join(' ')
end
|
||||
end
|
187
attic/test/unit/sanitize_test.rb
Normal file
187
attic/test/unit/sanitize_test.rb
Normal file
|
@ -0,0 +1,187 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
|
||||
require 'sanitize'
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
include Sanitize
|
||||
|
||||
def setup
|
||||
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
define_method "test_should_allow_#{tag_name}_tag" do
|
||||
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz</#{tag_name}>",
|
||||
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
||||
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
|
||||
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
if attribute_name != 'style'
|
||||
define_method "test_should_allow_#{attribute_name}_attribute" do
|
||||
assert_equal "<p #{attribute_name}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
||||
assert_equal "<p>foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol}">foo</a>))
|
||||
end
|
||||
end
|
||||
|
||||
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_allow_anchors
|
||||
assert_equal "<a href=\"foo\"><script>baz</script></a>",
|
||||
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
|
||||
end
|
||||
|
||||
# RFC 3986, sec 4.2
|
||||
def test_allow_colons_in_path_component
|
||||
assert_equal "<a href=\"./this:that\">foo</a>",
|
||||
sanitize_html("<a href=\"./this:that\">foo</a>")
|
||||
end
|
||||
|
||||
%w(src width height alt).each do |img_attr|
|
||||
define_method "test_should_allow_image_#{img_attr}_attribute" do
|
||||
assert_equal "<img #{img_attr}=\"foo\" />",
|
||||
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_handle_non_html
|
||||
assert_equal 'abc', sanitize_html("abc")
|
||||
end
|
||||
|
||||
def test_should_handle_blank_text
|
||||
assert_equal '', sanitize_html('')
|
||||
end
|
||||
|
||||
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
|
||||
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
end
|
||||
|
||||
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
|
||||
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
end
|
||||
|
||||
[%(<img src="javascript:alert('XSS');" />),
|
||||
%(<img src=javascript:alert('XSS') />),
|
||||
%(<img src="JaVaScRiPt:alert('XSS')" />),
|
||||
%(<img src='javascript:alert("XSS")' />),
|
||||
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src="jav\tascript:alert('XSS');" />),
|
||||
%(<img src="jav	ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="  javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
|
||||
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
|
||||
assert_equal "<img />", sanitize_html(img_hack)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_sanitize_tag_broken_up_by_null
|
||||
assert_equal "<scr>alert(\"XSS\")</scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_invalid_script_tag
|
||||
assert_equal "<script /></script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_script_tag_with_multiple_open_brackets
|
||||
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
|
||||
assert_equal %(<iframe src="http:" /><), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
|
||||
end
|
||||
|
||||
def test_should_sanitize_unclosed_script
|
||||
assert_equal "<script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_half_open_scripts
|
||||
assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
|
||||
end
|
||||
|
||||
def test_should_not_fall_for_ridiculous_hack
|
||||
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
|
||||
assert_equal "<img />", sanitize_html(img_hack)
|
||||
end
|
||||
|
||||
def test_platypus
|
||||
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
|
||||
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
|
||||
end
|
||||
|
||||
def test_xul
|
||||
assert_equal %(<p style="">fubar</p>),
|
||||
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
|
||||
end
|
||||
|
||||
def test_input_image
|
||||
assert_equal %(<input type="image" />),
|
||||
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
|
||||
end
|
||||
|
||||
def test_non_alpha_non_digit
|
||||
assert_equal "<script /></script>",
|
||||
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
|
||||
assert_equal "<a>foo</a>",
|
||||
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
|
||||
assert_equal "<img />",
|
||||
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
|
||||
end
|
||||
|
||||
def test_img_dynsrc_lowsrc
|
||||
assert_equal "<img />",
|
||||
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
|
||||
assert_equal "<img />",
|
||||
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
|
||||
end
|
||||
|
||||
def test_div_background_image_unicode_encoded
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
|
||||
end
|
||||
|
||||
def test_div_expression
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
|
||||
end
|
||||
|
||||
def test_img_vbscript
|
||||
assert_equal '<img />',
|
||||
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
|
||||
end
|
||||
|
||||
end
|
|
@ -32,7 +32,7 @@ module Engines
|
|||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
html = redcloth.to_html(:textile)
|
||||
sanitize_html(html)
|
||||
sanitize_xhtml(html)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -43,7 +43,7 @@ module Engines
|
|||
require_dependency 'maruku'
|
||||
require_dependency 'maruku/ext/math'
|
||||
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
|
||||
sanitize_html(html).to_ncr
|
||||
sanitize_xhtml(html.to_ncr)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -55,7 +55,7 @@ module Engines
|
|||
require_dependency 'maruku/ext/math'
|
||||
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
|
||||
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
|
||||
sanitize_html(html).to_ncr
|
||||
sanitize_xhtml(html.to_ncr)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -68,7 +68,7 @@ module Engines
|
|||
redcloth.filter_html = false
|
||||
redcloth.no_span_caps = false
|
||||
html = redcloth.to_html
|
||||
sanitize_html(html)
|
||||
sanitize_xhtml(html)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -78,7 +78,7 @@ module Engines
|
|||
def mask
|
||||
require_dependency 'rdocsupport'
|
||||
html = RDocSupport::RDocFormatter.new(@content).to_html
|
||||
sanitize_html(html)
|
||||
sanitize_xhtml(html)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
207
lib/sanitize.rb
207
lib/sanitize.rb
|
@ -3,205 +3,24 @@ module Sanitize
|
|||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
#
|
||||
# Based heavily on Sam Ruby's code in the Universal FeedParser.
|
||||
|
||||
require 'html/tokenizer'
|
||||
require 'node'
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
||||
'ul', 'var']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
|
||||
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
|
||||
# Uses the HTML5lib parser, so that the parsing behaviour should
|
||||
# resemble that of browsers.
|
||||
#
|
||||
# sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
|
||||
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
|
||||
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/liberalxmlparser'
|
||||
include HTML5lib
|
||||
|
||||
# Parse +html+ as a fragment with XHTMLParser, using HTMLSanitizer as the
# tokenizer, and return the parsed result converted to a string.
def sanitize_xhtml(html)
  fragment = XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer)
  fragment.to_s
end
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
|
||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
||||
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
||||
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
||||
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
|
||||
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
|
||||
'offset', 'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
|
||||
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
|
||||
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target',
|
||||
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
||||
'underline-position', 'underline-thickness', 'unicode',
|
||||
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
||||
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
||||
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
||||
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
||||
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
|
||||
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
|
||||
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
|
||||
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
|
||||
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
|
||||
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
|
||||
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
|
||||
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
|
||||
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
|
||||
# ALLOWED_PROTOCOLS are allowed.
|
||||
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => &lt;script> do_nasty_stuff() &lt;/script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_html(html)
|
||||
if html.index("<")
|
||||
tokenizer = HTML::Tokenizer.new(html)
|
||||
new_text = ""
|
||||
|
||||
while token = tokenizer.next
|
||||
node = XHTML::Node.parse(nil, 0, 0, token, false)
|
||||
new_text << case node.tag?
|
||||
when true
|
||||
if ALLOWED_ELEMENTS.include?(node.name)
|
||||
if node.closing != :close
|
||||
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
||||
ATTR_VAL_IS_URI.each do |attr|
|
||||
val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
|
||||
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
||||
node.attributes.delete attr
|
||||
end
|
||||
end
|
||||
if node.attributes['style']
|
||||
node.attributes['style'] = sanitize_css(node.attributes['style'])
|
||||
end
|
||||
end
|
||||
node.to_s
|
||||
else
|
||||
node.to_s.gsub(/</, "<")
|
||||
end
|
||||
else
|
||||
node.to_s.gsub(/</, "<")
|
||||
end
|
||||
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
|
||||
end
|
||||
|
||||
html = new_text
|
||||
end
|
||||
html
|
||||
end
|
||||
|
||||
# Sanitizes the contents of an inline style attribute.
#
# style - the raw attribute value (anything responding to +to_s+).
#
# Returns a String of "prop: value;" declarations joined by single spaces,
# or '' when the input fails either validation pass. A declaration is kept
# when its property is in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES,
# or when it is a background/border/margin/padding shorthand whose every
# value keyword is in ALLOWED_CSS_KEYWORDS or looks like a colour/length.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A / \z so they validate the WHOLE value.
  # With ^ / $ a single well-formed line was enough to pass, letting
  # arbitrary content on a following line through.
  if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return ''
  end
  # Trailing whitespace (e.g. a final newline after the last ";") is allowed.
  if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\s*\z/
    return ''
  end

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
    if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
      clean << prop + ': ' + val + ';'
    elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
      # Shorthand families: accept only if every keyword is whitelisted or
      # is a hex colour, rgb() fragment, or a short number with a unit.
      goodval = true
      val.split().each do |keyword|
        if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
          goodval = false
        end
      end
      if goodval
        clean << prop + ': ' + val + ';'
      end
    elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
      clean << prop + ': ' + val + ';'
    end
  end

  clean.join(' ')
end
|
||||
end
|
||||
|
|
9
vendor/plugins/HTML5lib/README
vendored
Normal file
9
vendor/plugins/HTML5lib/README
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
= HTML5lib
|
||||
|
||||
== Basic Usage
|
||||
|
||||
require 'html5lib'
|
||||
|
||||
doc = HTML5lib.parse('<html>...</html>')
|
||||
|
||||
doc.class # REXML::Document
|
7
vendor/plugins/HTML5lib/Rakefile.rb
vendored
Normal file
7
vendor/plugins/HTML5lib/Rakefile.rb
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
require 'rake'
|
||||
require 'rake/testtask'
|
||||
|
||||
# Register the test task: verbose output, picking up every file that
# matches tests/test_*.rb.
Rake::TestTask.new do |t|
  t.verbose = true
  t.pattern = 'tests/test_*.rb'
end
|
11
vendor/plugins/HTML5lib/lib/html5lib.rb
vendored
Normal file
11
vendor/plugins/HTML5lib/lib/html5lib.rb
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
require 'html5lib/html5parser'
|
||||
|
||||
# Convenience entry points that forward to HTMLParser.
module HTML5lib
  # Parses +stream+ as a whole document.
  #
  # stream  - the markup to parse, handed to HTMLParser unchanged.
  # options - Hash of parser options, handed to HTMLParser unchanged.
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a document fragment.
  #
  # Takes the same arguments as +parse+ but delegates to
  # HTMLParser.parseFragment (it previously called HTMLParser.parse, which
  # made the two entry points indistinguishable).
  #
  # Returns whatever HTMLParser.parseFragment returns.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end
|
676
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
Executable file
676
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
Executable file
|
@ -0,0 +1,676 @@
|
|||
module HTML5lib
|
||||
|
||||
class EOF < Exception; end
|
||||
|
||||
CONTENT_MODEL_FLAGS = [
|
||||
:PCDATA,
|
||||
:RCDATA,
|
||||
:CDATA,
|
||||
:PLAINTEXT
|
||||
]
|
||||
|
||||
SCOPING_ELEMENTS = %w[
|
||||
button
|
||||
caption
|
||||
html
|
||||
marquee
|
||||
object
|
||||
table
|
||||
td
|
||||
th
|
||||
]
|
||||
|
||||
FORMATTING_ELEMENTS = %w[
|
||||
a
|
||||
b
|
||||
big
|
||||
em
|
||||
font
|
||||
i
|
||||
nobr
|
||||
s
|
||||
small
|
||||
strike
|
||||
strong
|
||||
tt
|
||||
u
|
||||
]
|
||||
|
||||
SPECIAL_ELEMENTS = %w[
|
||||
address
|
||||
area
|
||||
base
|
||||
basefont
|
||||
bgsound
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
center
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
dir
|
||||
div
|
||||
dl
|
||||
dt
|
||||
embed
|
||||
fieldset
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
iframe
|
||||
image
|
||||
img
|
||||
input
|
||||
isindex
|
||||
li
|
||||
link
|
||||
listing
|
||||
menu
|
||||
meta
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
plaintext
|
||||
pre
|
||||
script
|
||||
select
|
||||
spacer
|
||||
style
|
||||
tbody
|
||||
textarea
|
||||
tfoot
|
||||
thead
|
||||
title
|
||||
tr
|
||||
ul
|
||||
wbr
|
||||
]
|
||||
|
||||
SPACE_CHARACTERS = %W[
|
||||
\t
|
||||
\n
|
||||
\x0B
|
||||
\x0C
|
||||
\x20
|
||||
\r
|
||||
]
|
||||
|
||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||
table
|
||||
tbody
|
||||
tfoot
|
||||
thead
|
||||
tr
|
||||
]
|
||||
|
||||
# Letter sets are Strings; DIGITS is a Range; HEX_DIGITS is an Array of
# one-character Strings (digits, then a-f, then A-F).
ASCII_LOWERCASE = [*('a'..'z')].join
ASCII_UPPERCASE = [*('A'..'Z')].join
ASCII_LETTERS = "#{ASCII_LOWERCASE}#{ASCII_UPPERCASE}"
DIGITS = ('0'..'9')
HEX_DIGITS = [DIGITS, ('a'..'f'), ('A'..'F')].map { |range| range.to_a }.flatten
|
||||
|
||||
# Heading elements need to be ordered
|
||||
HEADING_ELEMENTS = %w[
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
]
|
||||
|
||||
# XXX What about event-source and command?
|
||||
VOID_ELEMENTS = %w[
|
||||
base
|
||||
link
|
||||
meta
|
||||
hr
|
||||
br
|
||||
img
|
||||
embed
|
||||
param
|
||||
area
|
||||
col
|
||||
input
|
||||
]
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||
ENTITIES_WINDOWS1252 = [
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
]
|
||||
|
||||
private
|
||||
|
||||
# Returns the UTF-8 encoded String for the Unicode code point +n+.
def self.U(n)
  utf8 = [n].pack('U')
  utf8
end
|
||||
|
||||
public
|
||||
|
||||
ENTITIES = {
|
||||
"AElig" => U(0xC6),
|
||||
"Aacute" => U(0xC1),
|
||||
"Acirc" => U(0xC2),
|
||||
"Agrave" => U(0xC0),
|
||||
"Alpha" => U(0x0391),
|
||||
"Aring" => U(0xC5),
|
||||
"Atilde" => U(0xC3),
|
||||
"Auml" => U(0xC4),
|
||||
"Beta" => U(0x0392),
|
||||
"Ccedil" => U(0xC7),
|
||||
"Chi" => U(0x03A7),
|
||||
"Dagger" => U(0x2021),
|
||||
"Delta" => U(0x0394),
|
||||
"ETH" => U(0xD0),
|
||||
"Eacute" => U(0xC9),
|
||||
"Ecirc" => U(0xCA),
|
||||
"Egrave" => U(0xC8),
|
||||
"Epsilon" => U(0x0395),
|
||||
"Eta" => U(0x0397),
|
||||
"Euml" => U(0xCB),
|
||||
"Gamma" => U(0x0393),
|
||||
"Iacute" => U(0xCD),
|
||||
"Icirc" => U(0xCE),
|
||||
"Igrave" => U(0xCC),
|
||||
"Iota" => U(0x0399),
|
||||
"Iuml" => U(0xCF),
|
||||
"Kappa" => U(0x039A),
|
||||
"Lambda" => U(0x039B),
|
||||
"Mu" => U(0x039C),
|
||||
"Ntilde" => U(0xD1),
|
||||
"Nu" => U(0x039D),
|
||||
"OElig" => U(0x0152),
|
||||
"Oacute" => U(0xD3),
|
||||
"Ocirc" => U(0xD4),
|
||||
"Ograve" => U(0xD2),
|
||||
"Omega" => U(0x03A9),
|
||||
"Omicron" => U(0x039F),
|
||||
"Oslash" => U(0xD8),
|
||||
"Otilde" => U(0xD5),
|
||||
"Ouml" => U(0xD6),
|
||||
"Phi" => U(0x03A6),
|
||||
"Pi" => U(0x03A0),
|
||||
"Prime" => U(0x2033),
|
||||
"Psi" => U(0x03A8),
|
||||
"Rho" => U(0x03A1),
|
||||
"Scaron" => U(0x0160),
|
||||
"Sigma" => U(0x03A3),
|
||||
"THORN" => U(0xDE),
|
||||
"Tau" => U(0x03A4),
|
||||
"Theta" => U(0x0398),
|
||||
"Uacute" => U(0xDA),
|
||||
"Ucirc" => U(0xDB),
|
||||
"Ugrave" => U(0xD9),
|
||||
"Upsilon" => U(0x03A5),
|
||||
"Uuml" => U(0xDC),
|
||||
"Xi" => U(0x039E),
|
||||
"Yacute" => U(0xDD),
|
||||
"Yuml" => U(0x0178),
|
||||
"Zeta" => U(0x0396),
|
||||
"aacute" => U(0xE1),
|
||||
"acirc" => U(0xE2),
|
||||
"acute" => U(0xB4),
|
||||
"aelig" => U(0xE6),
|
||||
"agrave" => U(0xE0),
|
||||
"alefsym" => U(0x2135),
|
||||
"alpha" => U(0x03B1),
|
||||
"amp" => U(0x26),
|
||||
"AMP" => U(0x26),
|
||||
"and" => U(0x2227),
|
||||
"ang" => U(0x2220),
|
||||
"apos" => U(0x27),
|
||||
"aring" => U(0xE5),
|
||||
"asymp" => U(0x2248),
|
||||
"atilde" => U(0xE3),
|
||||
"auml" => U(0xE4),
|
||||
"bdquo" => U(0x201E),
|
||||
"beta" => U(0x03B2),
|
||||
"brvbar" => U(0xA6),
|
||||
"bull" => U(0x2022),
|
||||
"cap" => U(0x2229),
|
||||
"ccedil" => U(0xE7),
|
||||
"cedil" => U(0xB8),
|
||||
"cent" => U(0xA2),
|
||||
"chi" => U(0x03C7),
|
||||
"circ" => U(0x02C6),
|
||||
"clubs" => U(0x2663),
|
||||
"cong" => U(0x2245),
|
||||
"copy" => U(0xA9),
|
||||
"COPY" => U(0xA9),
|
||||
"crarr" => U(0x21B5),
|
||||
"cup" => U(0x222A),
|
||||
"curren" => U(0xA4),
|
||||
"dArr" => U(0x21D3),
|
||||
"dagger" => U(0x2020),
|
||||
"darr" => U(0x2193),
|
||||
"deg" => U(0xB0),
|
||||
"delta" => U(0x03B4),
|
||||
"diams" => U(0x2666),
|
||||
"divide" => U(0xF7),
|
||||
"eacute" => U(0xE9),
|
||||
"ecirc" => U(0xEA),
|
||||
"egrave" => U(0xE8),
|
||||
"empty" => U(0x2205),
|
||||
"emsp" => U(0x2003),
|
||||
"ensp" => U(0x2002),
|
||||
"epsilon" => U(0x03B5),
|
||||
"equiv" => U(0x2261),
|
||||
"eta" => U(0x03B7),
|
||||
"eth" => U(0xF0),
|
||||
"euml" => U(0xEB),
|
||||
"euro" => U(0x20AC),
|
||||
"exist" => U(0x2203),
|
||||
"fnof" => U(0x0192),
|
||||
"forall" => U(0x2200),
|
||||
"frac12" => U(0xBD),
|
||||
"frac14" => U(0xBC),
|
||||
"frac34" => U(0xBE),
|
||||
"frasl" => U(0x2044),
|
||||
"gamma" => U(0x03B3),
|
||||
"ge" => U(0x2265),
|
||||
"gt" => U(0x3E),
|
||||
"GT" => U(0x3E),
|
||||
"hArr" => U(0x21D4),
|
||||
"harr" => U(0x2194),
|
||||
"hearts" => U(0x2665),
|
||||
"hellip" => U(0x2026),
|
||||
"iacute" => U(0xED),
|
||||
"icirc" => U(0xEE),
|
||||
"iexcl" => U(0xA1),
|
||||
"igrave" => U(0xEC),
|
||||
"image" => U(0x2111),
|
||||
"infin" => U(0x221E),
|
||||
"int" => U(0x222B),
|
||||
"iota" => U(0x03B9),
|
||||
"iquest" => U(0xBF),
|
||||
"isin" => U(0x2208),
|
||||
"iuml" => U(0xEF),
|
||||
"kappa" => U(0x03BA),
|
||||
"lArr" => U(0x21D0),
|
||||
"lambda" => U(0x03BB),
|
||||
"lang" => U(0x2329),
|
||||
"laquo" => U(0xAB),
|
||||
"larr" => U(0x2190),
|
||||
"lceil" => U(0x2308),
|
||||
"ldquo" => U(0x201C),
|
||||
"le" => U(0x2264),
|
||||
"lfloor" => U(0x230A),
|
||||
"lowast" => U(0x2217),
|
||||
"loz" => U(0x25CA),
|
||||
"lrm" => U(0x200E),
|
||||
"lsaquo" => U(0x2039),
|
||||
"lsquo" => U(0x2018),
|
||||
"lt" => U(0x3C),
|
||||
"LT" => U(0x3C),
|
||||
"macr" => U(0xAF),
|
||||
"mdash" => U(0x2014),
|
||||
"micro" => U(0xB5),
|
||||
"middot" => U(0xB7),
|
||||
"minus" => U(0x2212),
|
||||
"mu" => U(0x03BC),
|
||||
"nabla" => U(0x2207),
|
||||
"nbsp" => U(0xA0),
|
||||
"ndash" => U(0x2013),
|
||||
"ne" => U(0x2260),
|
||||
"ni" => U(0x220B),
|
||||
"not" => U(0xAC),
|
||||
"notin" => U(0x2209),
|
||||
"nsub" => U(0x2284),
|
||||
"ntilde" => U(0xF1),
|
||||
"nu" => U(0x03BD),
|
||||
"oacute" => U(0xF3),
|
||||
"ocirc" => U(0xF4),
|
||||
"oelig" => U(0x0153),
|
||||
"ograve" => U(0xF2),
|
||||
"oline" => U(0x203E),
|
||||
"omega" => U(0x03C9),
|
||||
"omicron" => U(0x03BF),
|
||||
"oplus" => U(0x2295),
|
||||
"or" => U(0x2228),
|
||||
"ordf" => U(0xAA),
|
||||
"ordm" => U(0xBA),
|
||||
"oslash" => U(0xF8),
|
||||
"otilde" => U(0xF5),
|
||||
"otimes" => U(0x2297),
|
||||
"ouml" => U(0xF6),
|
||||
"para" => U(0xB6),
|
||||
"part" => U(0x2202),
|
||||
"permil" => U(0x2030),
|
||||
"perp" => U(0x22A5),
|
||||
"phi" => U(0x03C6),
|
||||
"pi" => U(0x03C0),
|
||||
"piv" => U(0x03D6),
|
||||
"plusmn" => U(0xB1),
|
||||
"pound" => U(0xA3),
|
||||
"prime" => U(0x2032),
|
||||
"prod" => U(0x220F),
|
||||
"prop" => U(0x221D),
|
||||
"psi" => U(0x03C8),
|
||||
"quot" => U(0x22),
|
||||
"QUOT" => U(0x22),
|
||||
"rArr" => U(0x21D2),
|
||||
"radic" => U(0x221A),
|
||||
"rang" => U(0x232A),
|
||||
"raquo" => U(0xBB),
|
||||
"rarr" => U(0x2192),
|
||||
"rceil" => U(0x2309),
|
||||
"rdquo" => U(0x201D),
|
||||
"real" => U(0x211C),
|
||||
"reg" => U(0xAE),
|
||||
"REG" => U(0xAE),
|
||||
"rfloor" => U(0x230B),
|
||||
"rho" => U(0x03C1),
|
||||
"rlm" => U(0x200F),
|
||||
"rsaquo" => U(0x203A),
|
||||
"rsquo" => U(0x2019),
|
||||
"sbquo" => U(0x201A),
|
||||
"scaron" => U(0x0161),
|
||||
"sdot" => U(0x22C5),
|
||||
"sect" => U(0xA7),
|
||||
"shy" => U(0xAD),
|
||||
"sigma" => U(0x03C3),
|
||||
"sigmaf" => U(0x03C2),
|
||||
"sim" => U(0x223C),
|
||||
"spades" => U(0x2660),
|
||||
"sub" => U(0x2282),
|
||||
"sube" => U(0x2286),
|
||||
"sum" => U(0x2211),
|
||||
"sup" => U(0x2283),
|
||||
"sup1" => U(0xB9),
|
||||
"sup2" => U(0xB2),
|
||||
"sup3" => U(0xB3),
|
||||
"supe" => U(0x2287),
|
||||
"szlig" => U(0xDF),
|
||||
"tau" => U(0x03C4),
|
||||
"there4" => U(0x2234),
|
||||
"theta" => U(0x03B8),
|
||||
"thetasym" => U(0x03D1),
|
||||
"thinsp" => U(0x2009),
|
||||
"thorn" => U(0xFE),
|
||||
"tilde" => U(0x02DC),
|
||||
"times" => U(0xD7),
|
||||
"trade" => U(0x2122),
|
||||
"uArr" => U(0x21D1),
|
||||
"uacute" => U(0xFA),
|
||||
"uarr" => U(0x2191),
|
||||
"ucirc" => U(0xFB),
|
||||
"ugrave" => U(0xF9),
|
||||
"uml" => U(0xA8),
|
||||
"upsih" => U(0x03D2),
|
||||
"upsilon" => U(0x03C5),
|
||||
"uuml" => U(0xFC),
|
||||
"weierp" => U(0x2118),
|
||||
"xi" => U(0x03BE),
|
||||
"yacute" => U(0xFD),
|
||||
"yen" => U(0xA5),
|
||||
"yuml" => U(0xFF),
|
||||
"zeta" => U(0x03B6),
|
||||
"zwj" => U(0x200D),
|
||||
"zwnj" => U(0x200C)
|
||||
}
|
||||
|
||||
ENCODINGS = %w[
|
||||
ansi_x3.4-1968
|
||||
iso-ir-6
|
||||
ansi_x3.4-1986
|
||||
iso_646.irv:1991
|
||||
ascii
|
||||
iso646-us
|
||||
us-ascii
|
||||
us
|
||||
ibm367
|
||||
cp367
|
||||
csascii
|
||||
ks_c_5601-1987
|
||||
korean
|
||||
iso-2022-kr
|
||||
csiso2022kr
|
||||
euc-kr
|
||||
iso-2022-jp
|
||||
csiso2022jp
|
||||
iso-2022-jp-2
|
||||
iso-ir-58
|
||||
chinese
|
||||
csiso58gb231280
|
||||
iso_8859-1:1987
|
||||
iso-ir-100
|
||||
iso_8859-1
|
||||
iso-8859-1
|
||||
latin1
|
||||
l1
|
||||
ibm819
|
||||
cp819
|
||||
csisolatin1
|
||||
iso_8859-2:1987
|
||||
iso-ir-101
|
||||
iso_8859-2
|
||||
iso-8859-2
|
||||
latin2
|
||||
l2
|
||||
csisolatin2
|
||||
iso_8859-3:1988
|
||||
iso-ir-109
|
||||
iso_8859-3
|
||||
iso-8859-3
|
||||
latin3
|
||||
l3
|
||||
csisolatin3
|
||||
iso_8859-4:1988
|
||||
iso-ir-110
|
||||
iso_8859-4
|
||||
iso-8859-4
|
||||
latin4
|
||||
l4
|
||||
csisolatin4
|
||||
iso_8859-6:1987
|
||||
iso-ir-127
|
||||
iso_8859-6
|
||||
iso-8859-6
|
||||
ecma-114
|
||||
asmo-708
|
||||
arabic
|
||||
csisolatinarabic
|
||||
iso_8859-7:1987
|
||||
iso-ir-126
|
||||
iso_8859-7
|
||||
iso-8859-7
|
||||
elot_928
|
||||
ecma-118
|
||||
greek
|
||||
greek8
|
||||
csisolatingreek
|
||||
iso_8859-8:1988
|
||||
iso-ir-138
|
||||
iso_8859-8
|
||||
iso-8859-8
|
||||
hebrew
|
||||
csisolatinhebrew
|
||||
iso_8859-5:1988
|
||||
iso-ir-144
|
||||
iso_8859-5
|
||||
iso-8859-5
|
||||
cyrillic
|
||||
csisolatincyrillic
|
||||
iso_8859-9:1989
|
||||
iso-ir-148
|
||||
iso_8859-9
|
||||
iso-8859-9
|
||||
latin5
|
||||
l5
|
||||
csisolatin5
|
||||
iso-8859-10
|
||||
iso-ir-157
|
||||
l6
|
||||
iso_8859-10:1992
|
||||
csisolatin6
|
||||
latin6
|
||||
hp-roman8
|
||||
roman8
|
||||
r8
|
||||
ibm037
|
||||
cp037
|
||||
csibm037
|
||||
ibm424
|
||||
cp424
|
||||
csibm424
|
||||
ibm437
|
||||
cp437
|
||||
437
|
||||
cspc8codepage437
|
||||
ibm500
|
||||
cp500
|
||||
csibm500
|
||||
ibm775
|
||||
cp775
|
||||
cspc775baltic
|
||||
ibm850
|
||||
cp850
|
||||
850
|
||||
cspc850multilingual
|
||||
ibm852
|
||||
cp852
|
||||
852
|
||||
cspcp852
|
||||
ibm855
|
||||
cp855
|
||||
855
|
||||
csibm855
|
||||
ibm857
|
||||
cp857
|
||||
857
|
||||
csibm857
|
||||
ibm860
|
||||
cp860
|
||||
860
|
||||
csibm860
|
||||
ibm861
|
||||
cp861
|
||||
861
|
||||
cp-is
|
||||
csibm861
|
||||
ibm862
|
||||
cp862
|
||||
862
|
||||
cspc862latinhebrew
|
||||
ibm863
|
||||
cp863
|
||||
863
|
||||
csibm863
|
||||
ibm864
|
||||
cp864
|
||||
csibm864
|
||||
ibm865
|
||||
cp865
|
||||
865
|
||||
csibm865
|
||||
ibm866
|
||||
cp866
|
||||
866
|
||||
csibm866
|
||||
ibm869
|
||||
cp869
|
||||
869
|
||||
cp-gr
|
||||
csibm869
|
||||
ibm1026
|
||||
cp1026
|
||||
csibm1026
|
||||
koi8-r
|
||||
cskoi8r
|
||||
koi8-u
|
||||
big5-hkscs
|
||||
ptcp154
|
||||
csptcp154
|
||||
pt154
|
||||
cp154
|
||||
utf-7
|
||||
utf-16be
|
||||
utf-16le
|
||||
utf-16
|
||||
utf-8
|
||||
iso-8859-13
|
||||
iso-8859-14
|
||||
iso-ir-199
|
||||
iso_8859-14:1998
|
||||
iso_8859-14
|
||||
latin8
|
||||
iso-celtic
|
||||
l8
|
||||
iso-8859-15
|
||||
iso_8859-15
|
||||
iso-8859-16
|
||||
iso-ir-226
|
||||
iso_8859-16:2001
|
||||
iso_8859-16
|
||||
latin10
|
||||
l10
|
||||
gbk
|
||||
cp936
|
||||
ms936
|
||||
gb18030
|
||||
shift_jis
|
||||
ms_kanji
|
||||
csshiftjis
|
||||
euc-jp
|
||||
gb2312
|
||||
big5
|
||||
csbig5
|
||||
windows-1250
|
||||
windows-1251
|
||||
windows-1252
|
||||
windows-1253
|
||||
windows-1254
|
||||
windows-1255
|
||||
windows-1256
|
||||
windows-1257
|
||||
windows-1258
|
||||
tis-620
|
||||
hz-gb-2312
|
||||
]
|
||||
|
||||
end
|
2020
vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
vendored
Normal file
2020
vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb
vendored
Normal file
File diff suppressed because it is too large
Load diff
549
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
Executable file
549
vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
vendored
Executable file
|
@ -0,0 +1,549 @@
|
|||
require 'stringio'
|
||||
require 'html5lib/constants'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
# This class takes care of character encoding and removing or replacing
|
||||
# incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
class HTMLInputStream

  # queue        - characters pushed back by charsUntil, consumed before the
  #                underlying data on the next read.
  # charEncoding - the encoding name chosen in initialize.
  attr_accessor :queue, :charEncoding

  # Initialises the HTMLInputStream.
  #
  # HTMLInputStream(source, [options]) -> Normalized stream from source
  # for use by the HTML5Lib.
  #
  # source - an object responding to +read+ (used as-is); anything else is
  #          wrapped in a StringIO.
  #
  # options - Hash; every key/value pair is copied into the instance
  #           variable of the same name, overriding these defaults:
  #   encoding  - nil. When it names a valid encoding it is used regardless
  #               of any BOM or later declaration (such as a meta element).
  #   parseMeta - true. Look for a <meta> element containing encoding
  #               information.
  #   chardet   - true. Try the optional chardet gem as a last guess.
  def initialize(source, options = {})
    @encoding = nil
    @parseMeta = true
    @chardet = true

    options.each { |name, value| instance_variable_set("@#{name}", value) }

    # List of where new lines occur
    @newLines = []

    # Raw Stream
    @rawStream = openStream(source)

    # Encoding Information
    # Number of bytes to use when looking for a meta element with
    # encoding information
    @NUM_BYTES_META = 512
    # Encoding to use if no other information can be found
    @DEFAULT_ENCODING = 'windows-1252'

    # Detect encoding iff no explicit "transport level" encoding is supplied
    if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
      @charEncoding = detectEncoding
    else
      @charEncoding = @encoding
    end

    # Read bytes from stream decoding them into Unicode
    uString = @rawStream.read
    unless @charEncoding == 'utf-8'
      begin
        require 'iconv'
        # Convert from the encoding actually chosen above. @encoding is nil
        # whenever the encoding was detected rather than supplied, so using
        # it here made every such conversion fail silently.
        uString = Iconv.iconv('utf-8', @charEncoding, uString)[0]
      rescue
        # Best effort: if the conversion fails, keep the bytes as read.
      end
    end

    # Normalize newlines and null characters
    uString.gsub!(/\r\n?/, "\n")
    uString.gsub!("\x00", [0xFFFD].pack('U'))

    # The normalized string is the data stream that char/charsUntil index
    @dataStream = uString

    @queue = []

    # Reset position in the list to read from
    reset
  end

  # Produces a file object from source.
  #
  # source can be either an IO-like object (responds to +read+) or a string.
  # Returns the stream, which is also stored in @stream.
  def openStream(source)
    # Already an IO like object
    if source.respond_to?(:read)
      @stream = source
    else
      # Treat source as a string and wrap in StringIO
      @stream = StringIO.new(source)
    end
    return @stream
  end

  # Works out the encoding of the raw stream. Tries, in order: a byte order
  # mark, a <meta> declaration (when @parseMeta), the chardet gem (when
  # @chardet and the gem can be loaded), and finally @DEFAULT_ENCODING.
  # 'ascii' and 'iso-8859-1' are reported as 'windows-1252'.
  #
  # Returns the encoding name as a String.
  def detectEncoding

    # First look for a BOM
    # This will also read past the BOM if present
    encoding = detectBOM
    # If there is no BOM need to look for meta elements with encoding
    # information
    if encoding.nil? and @parseMeta
      encoding = detectEncodingMeta
    end
    # Guess with chardet, if available
    if encoding.nil? and @chardet
      begin
        require 'rubygems'
        require 'UniversalDetector' # gem install chardet
        buffer = @rawStream.read
        encoding = UniversalDetector::chardet(buffer)['encoding']
        # The stream was consumed by the read above; reopen it on the buffer
        @rawStream = openStream(buffer)
      rescue LoadError
        # chardet is optional; fall through to the default encoding
      end
    end
    # If all else fails use the default encoding
    if encoding.nil?
      encoding = @DEFAULT_ENCODING
    end

    # Substitute for equivalent encodings:
    encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}

    if encodingSub.has_key?(encoding.downcase)
      encoding = encodingSub[encoding.downcase]
    end

    return encoding
  end

  # Attempts to detect a BOM at the start of the stream. If
  # an encoding can be determined from the BOM return the name of the
  # encoding otherwise return nil
  def detectBOM
    bomDict = {
      "\xef\xbb\xbf" => 'utf-8',
      "\xff\xfe" => 'utf-16-le',
      "\xfe\xff" => 'utf-16-be',
      "\xff\xfe\x00\x00" => 'utf-32-le',
      "\x00\x00\xfe\xff" => 'utf-32-be'
    }

    # Go to beginning of file and read in 4 bytes
    @rawStream.seek(0)
    string = @rawStream.read(4)
    return nil unless string

    # Try detecting the BOM using bytes from the string
    encoding = bomDict[string[0...3]] # UTF-8
    seek = 3
    unless encoding
      # Need to detect UTF-32 before UTF-16
      encoding = bomDict[string] # UTF-32
      seek = 4
      unless encoding
        encoding = bomDict[string[0...2]] # UTF-16
        seek = 2
      end
    end

    #AT - move this to the caller?
    # Set the read position past the BOM if one was found, otherwise
    # set it to the start of the stream
    @rawStream.seek(encoding ? seek : 0)

    return encoding
  end

  # Report the encoding declared by the meta element, looking only at the
  # first @NUM_BYTES_META bytes; the stream is rewound afterwards.
  def detectEncodingMeta
    parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
    @rawStream.seek(0)
    return parser.getEncoding
  end

  # Looks through the stream to find where new lines occur so
  # the position method can tell where it is. Fills @newLines with 0
  # followed by the index of every "\n" in the data.
  def determineNewLines
    @newLines.push(0)
    (0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
  end

  # Returns [line, col] of the current position in the stream.
  def position
    # Generate list of new lines first time around
    determineNewLines if @newLines.empty?
    line = 0
    tell = @tell
    # Count the recorded newline offsets that lie before the read position
    @newLines.each do |pos|
      break unless pos < tell
      line += 1
    end
    col = tell - @newLines[line-1] - 1
    return [line, col]
  end

  # Resets the position in the stream back to the start.
  def reset
    @tell = 0
  end

  # Read one character from the queue if it is non-empty, otherwise from
  # the stream. Returns :EOF when the end of the data is reached.
  def char
    unless @queue.empty?
      return @queue.shift
    else
      begin
        @tell += 1
        # Indexing past the end yields nil; nil.chr raises and lands in
        # the rescue below
        return @dataStream[@tell - 1].chr
      rescue
        return :EOF
      end
    end
  end

  # Returns a string of characters from the stream up to but not
  # including any character in characters or EOF. characters can be
  # any container that responds to +include?+. With opposite = true the
  # test is inverted: reading continues while characters ARE in the set.
  def charsUntil(characters, opposite = false)
    charStack = [char]

    unless charStack[0] == :EOF
      while (characters.include? charStack[-1]) == opposite
        unless @queue.empty?
          # First from the queue
          charStack.push(@queue.shift)
          break if charStack[-1] == :EOF
        else
          # Then the rest
          begin
            charStack.push(@dataStream[@tell].chr)
            @tell += 1
          rescue
            charStack.push(:EOF)
            break
          end
        end
      end
    end

    # Put the character stopped on back to the front of the queue
    # from where it came.
    @queue.insert(0, charStack.pop)
    return charStack.join('')
  end
end
|
||||
|
||||
# String-like object with an associated position and various extra methods
|
||||
# If the position is ever greater than the string length then an exception is raised
|
||||
class EncodingBytes < String

  # Cursor into the string; starts at -1, i.e. just before the first byte.
  attr_accessor :position

  def initialize(value)
    super(value)
    @position = -1
  end

  # Steps the cursor forward one byte per iteration, yielding the element
  # found at the new position, until the cursor reaches the string length.
  # An EOF raised by the block ends the iteration quietly.
  def each
    until @position >= length
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the one-character string under the cursor.
  # Raises EOF once the cursor is at or past the end.
  def currentByte
    raise EOF unless @position < length
    self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars = SPACE_CHARACTERS)
    @position += 1 while chars.include?(currentByte)
  end

  # Look for a sequence of bytes at the current position. If the bytes
  # are found return true and advance the position to the byte after the
  # match. Otherwise return false and leave the position alone.
  # With lower = true the candidate is downcased before comparing.
  def matchBytes(bytes, lower = false)
    stop = position + bytes.length
    candidate = self[position ... stop]
    candidate.downcase! if lower
    return false unless candidate == bytes
    @position += bytes.length
    true
  end

  # Look for the next sequence of bytes matching a given sequence. If
  # a match is found advance the position to the last byte of the match
  # and return true; otherwise raise EOF.
  def jumpTo(bytes)
    offset = self[position .. -1].index(bytes)
    raise EOF unless offset
    @position += offset + bytes.length - 1
    true
  end

  # Move the pointer so it points to the next byte in a set of possible
  # bytes
  def findNext(byteList)
    @position += 1 until byteList.include?(currentByte)
  end
end
|
||||
|
||||
# Mini parser for detecting character encoding from meta elements
|
||||
class EncodingParser

  # data - the bytes to scan for an encoding declaration (converted with
  #        +to_s+, so nil is treated as an empty string).
  def initialize(data)
    @data = EncodingBytes.new(data.to_s)
    @encoding = nil
  end

  # Byte prefixes and the handler to run when the data at the current
  # position starts with them. Order matters: the first match wins, so the
  # longer/more specific prefixes come before the bare '<'.
  @@method_dispatch = [
    ['<!--', :handleComment],
    ['<meta', :handleMeta],
    ['</', :handlePossibleEndTag],
    ['<!', :handleOther],
    ['<?', :handleOther],
    ['<', :handlePossibleStartTag]
  ]

  # Scans the data and returns the first valid encoding declared by a
  # <meta> element (stripped of surrounding whitespace), or nil.
  def getEncoding
    @data.each do |byte|
      keepParsing = true
      @@method_dispatch.each do |(key, method)|
        if @data.matchBytes(key, true)
          # Handlers return false once an encoding has been found
          keepParsing = send(method)
          break
        end
      end
      break unless keepParsing
    end
    @encoding = @encoding.strip unless @encoding.nil?
    return @encoding
  end

  # Skip over comments
  def handleComment
    return @data.jumpTo('-->')
  end

  # Examines the attributes of a <meta> element. Returns false (stop
  # parsing) after storing a valid encoding taken from a charset attribute
  # or from the charset parameter of a content attribute; returns true
  # (keep parsing) otherwise.
  def handleMeta
    # "<meta" not followed by a space is some other element; keep going
    return true unless SPACE_CHARACTERS.include?(@data.currentByte)

    # We have a valid meta element we want to search for attributes
    while true
      # Try to find the next attribute after the current position
      attr = getAttribute

      return true if attr.nil?

      if attr[0] == 'charset'
        tentativeEncoding = attr[1]
        if HTML5lib.isValidEncoding(tentativeEncoding)
          @encoding = tentativeEncoding
          return false
        end
      elsif attr[0] == 'content'
        contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
        tentativeEncoding = contentParser.parse
        if HTML5lib.isValidEncoding(tentativeEncoding)
          @encoding = tentativeEncoding
          return false
        end
      end
    end
  end

  def handlePossibleStartTag
    return handlePossibleTag(false)
  end

  def handlePossibleEndTag
    @data.position+=1
    return handlePossibleTag(true)
  end

  # Skips over a start or end tag and all of its attributes.
  # Always returns true (keep parsing).
  def handlePossibleTag(endTag)
    unless ASCII_LETTERS.include?(@data.currentByte)
      # If the next byte is not an ascii letter either ignore this
      # fragment (possible start tag case) or treat it according to
      # handleOther
      if endTag
        @data.position -= 1
        handleOther
      end
      return true
    end

    @data.findNext(SPACE_CHARACTERS + ['<', '>'])

    if @data.currentByte == '<'
      # return to the first step in the overall "two step" algorithm
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      {} until getAttribute.nil?
    end
    return true
  end

  # Skips to the next '>'.
  def handleOther
    return @data.jumpTo('>')
  end

  # Return a [name, value] pair for the next attribute in the stream,
  # if one is found, or nil. Names and values are lowercased (ASCII only);
  # an attribute without a value gets ''.
  def getAttribute
    @data.skip(SPACE_CHARACTERS + ['/'])

    if @data.currentByte == '<'
      @data.position -= 1
      return nil
    elsif @data.currentByte == '>'
      return nil
    end

    attrName = []
    attrValue = []
    spaceFound = false
    # Step 5 attribute name
    while true
      # '=' ends the name only once the name is non-empty; a leading '='
      # is part of the name. (An Array is always truthy in Ruby, so the
      # emptiness has to be tested explicitly.)
      if @data.currentByte == '=' and not attrName.empty?
        break
      elsif SPACE_CHARACTERS.include?(@data.currentByte)
        spaceFound = true
        break
      elsif ['/', '<', '>'].include?(@data.currentByte)
        return [attrName.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrName.push(@data.currentByte.downcase)
      else
        attrName.push(@data.currentByte)
      end
      # Step 6
      @data.position += 1
    end
    # Step 7
    if spaceFound
      @data.skip
      # Step 8
      unless @data.currentByte == '='
        @data.position -= 1
        return [attrName.join(''), '']
      end
    end
    # XXX need to advance position in both spaces and value case
    # Step 9
    @data.position += 1
    # Step 10
    @data.skip
    # Step 11
    if ["'", '"'].include?(@data.currentByte)
      # 11.1
      quoteChar = @data.currentByte
      while true
        @data.position+=1
        # 11.3
        if @data.currentByte == quoteChar
          @data.position += 1
          return [attrName.join(''), attrValue.join('')]
        # 11.4
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrValue.push(@data.currentByte.downcase)
        # 11.5
        else
          attrValue.push(@data.currentByte)
        end
      end
    elsif ['>', '<'].include?(@data.currentByte)
      return [attrName.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.currentByte)
      attrValue.push(@data.currentByte.downcase)
    else
      attrValue.push(@data.currentByte)
    end
    # Unquoted value: collect until whitespace or a tag delimiter
    while true
      @data.position +=1
      if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
        return [attrName.join(''), attrValue.join('')]
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrValue.push(@data.currentByte.downcase)
      else
        attrValue.push(@data.currentByte)
      end
    end
  end
end
|
||||
|
||||
# Pulls the charset parameter out of the value of a
# <meta http-equiv content="..."> attribute.
class ContentAttrParser
  # data: the byte-stream wrapper holding the content attribute value.
  def initialize(data)
    @data = data
  end

  # Returns the charset value found after the first ';', or nil when
  # there is no "charset=" parameter or the data ends (EOF is raised)
  # before one is complete.
  def parse
    # Skip to the first ";"
    @data.position = 0
    @data.jumpTo(';')
    @data.position += 1
    @data.skip

    # The parameter has to be "charset" followed by "="
    @data.jumpTo('charset')
    @data.position += 1
    @data.skip
    return nil unless @data.currentByte == '='

    @data.position += 1
    @data.skip

    if ['"', "'"].include?(@data.currentByte)
      quotedValue
    else
      unquotedValue
    end
  rescue EOF
    nil
  end

  private

  # Encoding name between matching quote marks; @data sits on the
  # opening quote when this is called.
  def quotedValue
    closingQuote = @data.currentByte
    @data.position += 1
    valueStart = @data.position
    @data.jumpTo(closingQuote)
    @data[valueStart ... @data.position]
  end

  # Encoding name running up to the next space character, or to the
  # end of the data when no space follows.
  def unquotedValue
    valueStart = @data.position
    begin
      @data.findNext(SPACE_CHARACTERS)
      @data[valueStart ... @data.position]
    rescue EOF
      # Return the whole remaining value
      @data[valueStart .. -1]
    end
  end
end
|
||||
|
||||
# Determine if a string is a supported encoding
|
||||
# True when +encoding+ is a String that names one of ENCODINGS, compared
# case-insensitively and ignoring surrounding whitespace.  nil and
# non-String values give false.
def self.isValidEncoding(encoding)
  return false unless encoding.kind_of?(String)
  ENCODINGS.include?(encoding.downcase.strip)
end
|
||||
|
||||
end
|
141
vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
vendored
Executable file
141
vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
vendored
Executable file
|
@ -0,0 +1,141 @@
|
|||
# Warning: this module is experimental and subject to change and even removal
|
||||
# at any time.
|
||||
#
|
||||
# For background/rationale, see:
|
||||
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||
# * http://tinyurl.com/ylfj8k (and follow-ups)
|
||||
#
|
||||
# References:
|
||||
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
#
|
||||
# @@TODO:
|
||||
# * Selectively lowercase only XHTML, but not foreign markup
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/constants'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# liberal XML parser
|
||||
# HTMLParser variant that starts in XmlRootPhase and normalises tokens
# with XML conventions (empty-element tags, CDATA sections).
class XMLParser < HTMLParser

  def initialize(options={})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalise one tokenizer token in place and return it.
  #
  # * StartTag/EmptyTag: the [[name, value], ...] attribute list becomes
  #   a Hash; for duplicate names the first occurrence wins.
  # * EmptyTag: the start tag is processed immediately and the token is
  #   turned into the matching EndTag.
  # * EndTag: attributes on an end tag are reported as a parse error.
  # * Comment: a comment of the form "[CDATA[...]]" becomes a Characters
  #   token carrying the inner text.
  def normalizeToken(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # Reversing first means the earliest duplicate is written last and
      # therefore survives: [["x", "y"], ["x", "z"]] becomes {"x" => "y"}
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        @phase.processStartTag(token[:name], token[:data])
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :EndTag
      # The tokenizer always supplies :data (an empty list when there are
      # no attributes) and an empty Array is truthy in Ruby, so the
      # emptiness has to be tested explicitly.
      unless token[:data].nil? or token[:data].empty?
        parseError(_("End tag contains unexpected attributes."))
      end

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
|
||||
|
||||
# liberal XHTML parser
|
||||
# XMLParser variant that starts in the ordinary InitialPhase and uses
# XhmlRootPhase for the root element.
class XHTMLParser < XMLParser

  def initialize(options={})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalizeToken, then makes sure a non-void XHTML
  # element that is being closed without content gets an empty text
  # node, so that separate open and close tags are emitted.  Returns
  # the token.
  def normalizeToken(token)
    super(token)

    return token unless token[:type] == :EndTag
    return token if VOID_ELEMENTS.include?(token[:name])

    innermost = @tree.openElements[-1]
    return token unless token[:name] == innermost.name
    return token if innermost.hasContent

    # Leave the element alone when any open element declares a default
    # namespace other than XHTML.
    insideForeignMarkup = @tree.openElements.any? do |element|
      element.attributes.keys.include?('xmlns') and
        element.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
    end
    @tree.insertText('') unless insideForeignMarkup

    return token
  end
end
|
||||
|
||||
# Root-element phase used by XHTMLParser: the implied <html> element is
# created with the XHTML namespace declared on it.
class XhmlRootPhase < RootElementPhase
  # Creates the <html> element, makes it the open element and the
  # document's child, then hands over to the :beforeHead phase.
  def insertHtmlElement
    namespaceDeclaration = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    htmlElement = @tree.createElement("html", namespaceDeclaration)

    @tree.openElements.push(htmlElement)
    @tree.document.appendChild(htmlElement)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
|
||||
|
||||
# Initial phase of XMLParser: primes the tree with the document node
# and the root element, then switches to XmlElementPhase.
class XmlRootPhase < Phase
  # Every start/end tag name maps to the generic handlers below.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # First start tag: push the document itself, append the root element
  # to it and make the root the current open element.
  def startTagOther(name, attributes)
    @tree.openElements.push(@tree.document)

    rootElement = @tree.createElement(name, attributes)
    @tree.openElements[-1].appendChild(rootElement)
    @tree.openElements.push(rootElement)

    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited end-tag handling, then pops one open element.
  def endTagOther(name)
    super
    @tree.openElements.pop
  end
end
|
||||
|
||||
# Generic handling for all XML elements once the root element is open.
class XmlElementPhase < Phase

  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Append a new element to the current open element and descend into it.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.openElements[-1].appendChild(child)
    @tree.openElements.push(child)
  end

  # Walk the open elements from innermost to outermost.  Each element
  # passed over before a match is reported as a parse error; on a match
  # everything up to and including that element is popped.
  def endTagOther(name)
    @tree.openElements.reverse.each do |candidate|
      unless candidate.name == name
        @parser.parseError
        next
      end

      # Pop until the matching element itself has been removed.
      until @tree.openElements.pop == candidate
      end
      break
    end
  end

  def processCharacters(data)
    @tree.insertText(data)
  end
end
|
||||
|
||||
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
vendored
Normal file
178
vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
vendored
Normal file
|
@ -0,0 +1,178 @@
|
|||
require 'html5lib/tokenizer'
|
||||
require 'cgi'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
||||
|
||||
class HTMLSanitizer < HTMLTokenizer
|
||||
|
||||
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
|
||||
button caption center cite code col colgroup dd del dfn dir div dl dt
|
||||
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
|
||||
legend li map menu ol optgroup option p pre q s samp select small span
|
||||
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
|
||||
ul var]
|
||||
|
||||
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
|
||||
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
|
||||
msubsup msup mtable mtd mtext mtr munder munderover none]
|
||||
|
||||
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
|
||||
circle defs desc ellipse font-face font-face-name font-face-src g
|
||||
glyph hkern image linearGradient line marker metadata missing-glyph
|
||||
mpath path polygon polyline radialGradient rect set stop svg switch
|
||||
text title tspan use]
|
||||
|
||||
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
|
||||
align alt axis border cellpadding cellspacing char charoff charset
|
||||
checked cite class clear cols colspan color compact coords datetime
|
||||
dir disabled enctype for frame headers height href hreflang hspace id
|
||||
ismap label lang longdesc maxlength media method multiple name nohref
|
||||
noshade nowrap prompt readonly rel rev rows rowspan rules scope
|
||||
selected shape size span src start style summary tabindex target title
|
||||
type usemap valign value vspace width xml:lang]
|
||||
|
||||
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
|
||||
columnalign columnlines columnspacing columnspan depth display
|
||||
displaystyle equalcolumns equalrows fence fontstyle fontweight frame
|
||||
height linethickness lspace mathbackground mathcolor mathvariant
|
||||
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
|
||||
rowspacing rowspan rspace scriptlevel selection separator stretchy
|
||||
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
|
||||
|
||||
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
|
||||
arabic-form ascent attributeName attributeType baseProfile bbox begin
|
||||
by calcMode cap-height class color color-rendering content cx cy d dx
|
||||
dy descent display dur end fill fill-rule font-family font-size
|
||||
font-stretch font-style font-variant font-weight from fx fy g1 g2
|
||||
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
|
||||
ideographic k keyPoints keySplines keyTimes lang marker-end
|
||||
marker-mid marker-start markerHeight markerUnits markerWidth
|
||||
mathematical max min name offset opacity orient origin
|
||||
overline-position overline-thickness panose-1 path pathLength points
|
||||
preserveAspectRatio r refX refY repeatCount repeatDur
|
||||
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
|
||||
stemv stop-color stop-opacity strikethrough-position
|
||||
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
|
||||
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
|
||||
stroke-width systemLanguage target text-anchor to transform type u1
|
||||
u2 underline-position underline-thickness unicode unicode-range
|
||||
units-per-em values version viewBox visibility width widths x
|
||||
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
|
||||
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
|
||||
xmlns:xlink y y1 y2 zoomAndPan]
|
||||
|
||||
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
|
||||
|
||||
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
|
||||
border-bottom-color border-collapse border-color border-left-color
|
||||
border-right-color border-top-color clear color cursor direction
|
||||
display elevation float font font-family font-size font-style
|
||||
font-variant font-weight height letter-spacing line-height overflow
|
||||
pause pause-after pause-before pitch pitch-range richness speak
|
||||
speak-header speak-numeral speak-punctuation speech-rate stress
|
||||
text-align text-decoration text-indent unicode-bidi vertical-align
|
||||
voice-family volume white-space width]
|
||||
|
||||
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
|
||||
brown center collapse dashed dotted fuchsia gray green !important
|
||||
italic left lime maroon medium none navy normal nowrap olive pointer
|
||||
purple red right solid silver teal top transparent underline white
|
||||
yellow]
|
||||
|
||||
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
|
||||
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
|
||||
|
||||
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
|
||||
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
|
||||
ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
|
||||
ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
|
||||
ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
|
||||
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
|
||||
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
# Yields the tokenizer's tokens after sanitizing them.
#
# Tag tokens whose name is in ALLOWED_ELEMENTS keep only attributes
# listed in ALLOWED_ATTRIBUTES (for duplicate names the last one wins),
# lose URI attributes whose scheme is not in ALLOWED_PROTOCOLS, and get
# their style attribute passed through sanitize_css.  Any other tag
# token is turned into a Characters token holding the tag's source
# text.  Non-tag tokens are yielded unchanged.
def each
  super do |token|
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if ALLOWED_ELEMENTS.include?(token[:name])
        if token.has_key? :data
          attrs = Hash[*token[:data].flatten]
          attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
          ATTR_VAL_IS_URI.each do |attr|
            # Strip control characters, whitespace and U+0080..U+00A0
            # (C1 controls and NBSP) before looking at the scheme, so
            # that e.g. "java\tscript:" cannot hide it.  The upper range
            # is written as code points: the byte-wise form
            # \302[\200-\240] is rejected by Ruby >= 1.9.
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|[\u0080-\u00A0]/,'').downcase
            if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end
          if attrs['style']
            attrs['style'] = sanitize_css(attrs['style'])
          end
          token[:data] = attrs.map {|k,v| [k,v]}
        end
        yield token
      else
        # Disallowed element: rebuild the tag's source text and emit it
        # as character data instead of markup.
        if token[:type] == :EndTag
          token[:data] = "</#{token[:name]}>"
        elsif token[:data]
          attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
          token[:data] = "<#{token[:name]}#{attrs}>"
        else
          token[:data] = "<#{token[:name]}>"
        end
        # "<br>" becomes "<br/>" for empty-element tags
        token[:data].insert(-2,'/') if token[:type] == :EmptyTag
        token[:type] = :Characters
        token.delete(:name)
        yield token
      end
    else
      yield token
    end
  end
end
|
||||
|
||||
# Returns a cleaned copy of an inline style attribute value.
#
# url(...) references are removed first.  The whole string must then
# pass two checks (only harmless characters/tokens; a plain list of
# "property: value" declarations) or '' is returned.  Of the remaining
# declarations only these are kept:
# * properties in ALLOWED_CSS_PROPERTIES,
# * background/border/margin/padding properties whose value consists
#   solely of ALLOWED_CSS_KEYWORDS, colours and simple lengths,
# * properties in ALLOWED_SVG_PROPERTIES.
# The result has the form "prop: value; prop: value;".
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # \A and \z anchor to the whole string.  ^ and $ match at every line
  # break in Ruby, which let a single harmless line vouch for the rest
  # of a multi-line value.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
    next if val.empty?
    prop = prop.downcase
    if ALLOWED_CSS_PROPERTIES.include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: every whitespace-separated part of the value
      # has to be a known keyword, a colour or a simple length.
      clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
        !ALLOWED_CSS_KEYWORDS.include?(keyword) and
        keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
    elsif ALLOWED_SVG_PROPERTIES.include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
|
||||
end
|
||||
end
|
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,854 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/inputstream'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# This class takes care of tokenizing HTML.
|
||||
#
|
||||
# * @currentToken
|
||||
# Holds the token that is currently being processed.
|
||||
#
|
||||
# * @state
|
||||
# Holds a reference to the method to be invoked... XXX
|
||||
#
|
||||
# * @states
|
||||
# Holds a mapping between states and methods that implement the state.
|
||||
#
|
||||
# * @stream
|
||||
# Points to HTMLInputStream object.
|
||||
|
||||
class HTMLTokenizer
|
||||
attr_accessor :contentModelFlag, :currentToken
|
||||
attr_reader :stream
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
# Wraps +stream+ in an HTMLInputStream (passing +options+ through) and
# puts the tokenizer into its initial state.
def initialize(stream, options={})
  @stream = HTMLInputStream.new(stream, options)

  # Each state symbol maps to the method "<symbol>State" implementing it.
  @states = {}
  %w[
    data entityData tagOpen closeTagOpen tagName
    beforeAttributeName attributeName afterAttributeName
    beforeAttributeValue attributeValueDoubleQuoted
    attributeValueSingleQuoted attributeValueUnQuoted
    bogusComment markupDeclarationOpen
    comment commentDash commentEnd
    doctype beforeDoctypeName doctypeName afterDoctypeName bogusDoctype
  ].each do |stateName|
    @states[stateName.to_sym] = "#{stateName}State".to_sym
  end

  # Setup the initial tokenizer state
  @contentModelFlag = :PCDATA
  @state = @states[:data]

  # The current token being created
  @currentToken = nil

  # Tokens to be processed.
  @tokenQueue = []
end
|
||||
|
||||
# This is where the magic happens.
|
||||
#
|
||||
# We do our usual processing through the states and when we have a token
|
||||
# to return we yield the token which pauses processing until the next token
|
||||
# is requested.
|
||||
# Resets the stream, then keeps invoking the current state method and
# yields every token it queues.  Iteration stops as soon as a state
# method returns a false value (end of input); tokens queued by that
# last call are not yielded.
def each
  @stream.reset
  @tokenQueue = []
  while send(@state)
    yield @tokenQueue.shift until @tokenQueue.empty?
  end
end
|
||||
|
||||
# Below are various helper functions the tokenizer states use worked out.
|
||||
|
||||
# If the next character is a '>', convert the currentToken into
|
||||
# an EmptyTag
|
||||
|
||||
# Called after a "/" inside a tag.  Peeks at the next character: a ">"
# directly after the solidus of a start tag makes it an EmptyTag,
# anything else is a parse error.  The peeked character is always put
# back on the stream queue.
def processSolidusInTag
  lookahead = @stream.char

  if lookahead == ">" && @currentToken[:type] == :StartTag
    @currentToken[:type] = :EmptyTag
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Solidus (/) incorrectly placed in tag.")})
  end

  # Hand the character back so the calling state still sees it.
  @stream.queue.push(lookahead)
end
|
||||
|
||||
# This function returns either U+FFFD or the character based on the
|
||||
# decimal or hexadecimal representation. It also discards ";" if present.
|
||||
# If not present @tokenQueue.push({:type => :ParseError}") is invoked.
|
||||
|
||||
# Consumes the digits of a numeric character reference (the "&#" or
# "&#x" prefix has already been read) and returns the character as a
# UTF-8 string.
#
# isHex:: true for a hexadecimal reference, false for a decimal one.
#
# Returns U+FFFD for 0, for missing digits and for values above
# U+10FFFF (the latter also queues a ParseError).  Values 128..159 are
# mapped through ENTITIES_WINDOWS1252.  A trailing ";" is swallowed;
# any other terminating character is put back on the stream and a
# ParseError is queued.
def consumeNumberEntity(isHex)

  # XXX More need to be done here. For instance, #13 should prolly be
  # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
  # such. Thoughts on this appreciated.
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  char = [0xFFFD].pack('U')
  charStack = []

  # Consume all the characters that are in range while making sure we
  # don't hit an EOF.  :EOF is tested first so that it is never handed
  # to include?.
  c = @stream.char
  while c != :EOF and allowed.include?(c)
    charStack.push(c)
    c = @stream.char
  end

  # Convert the set of characters consumed to an int.
  charAsInt = charStack.join('').to_i(radix)

  # 128..159 get the "windows trick".  127 must not: it would index
  # ENTITIES_WINDOWS1252[-1], i.e. the table's last entry.
  if (128..159).include? charAsInt
    #XXX - removed parse error from windows 1252 entity for now
    #we may want to reenable this later
    #@tokenQueue.push({:type => :ParseError, :data =>
    #  _("Entity used with illegal number (windows-1252 reference).")})

    charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
  end

  # 0 is not a good number.
  if charAsInt == 0
    charAsInt = 65533
  end

  # U+10FFFF is the last Unicode code point.
  if charAsInt <= 0x10FFFF
    char = [charAsInt].pack('U')
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity couldn't be converted to character.")})
  end

  # Discard the ; if present. Otherwise, put it back on the queue and
  # invoke parseError on parser.
  if c != ";"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity didn't end with ';'.")})
    @stream.queue.push(c)
  end

  return char
end
|
||||
|
||||
# Tries to consume a character reference (the "&" has already been
# read).  Returns the replacement text, or nil when no entity could be
# recognised; in that case every character read is put back on the
# stream queue and a ParseError is queued.
def consumeEntity
  char = nil
  charStack = [@stream.char]
  if charStack[0] == "#"
    # We might have a number entity here.
    charStack += [@stream.char, @stream.char]
    if charStack.include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      charStack = charStack[0...charStack.index(:EOF)]
      @stream.queue += charStack
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Numeric entity expected. Got end of file instead.")})
    else
      if charStack[1].downcase == "x" \
        and HEX_DIGITS.include? charStack[2]
        # Hexadecimal entity detected.
        @stream.queue.push(charStack[2])
        char = consumeNumberEntity(true)
      elsif DIGITS.include? charStack[1]
        # Decimal entity detected.
        @stream.queue += charStack[1..-1]
        char = consumeNumberEntity(false)
      else
        # No number entity detected.
        @stream.queue += charStack
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Numeric entity expected but none found.")})
      end
    end
  # Break out if we reach the end of the file
  elsif charStack[0] == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Entity expected. Got end of file instead.")})
  else
    # At this point in the process might have named entity. Entity
    # names are the keys of ENTITIES.
    #
    # Consume characters and compare them to a prefix of the entity
    # names in the list until the prefix no longer matches.  The longest
    # complete name seen on the way is remembered.
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
    entityName = nil

    while charStack[-1] != :EOF
      name = charStack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        charStack.push(@stream.char)
      else
        break
      end

      if ENTITIES.include? name
        entityName = name
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]

      # Everything read beyond the entity name.  Only a ";" directly
      # after the name terminates it and may be discarded.  More may
      # have been read while a longer name was still possible, and
      # all of that has to go back on the stream.
      trailing = charStack[entityName.length..-1]
      if trailing.first == ";"
        trailing.shift
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Named entity didn't end with ';'.")})
      end
      @stream.queue += trailing
    else
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Named entity expected. Got none.")})
      @stream.queue += charStack
    end
  end
  return char
end
|
||||
|
||||
# This method replaces the need for "entityInAttributeValueState".
|
||||
# Appends the entity at the current stream position to the value of the
# attribute being built, or a literal "&" when no entity is recognised.
def processEntityInAttribute
  replacement = consumeEntity || "&"
  @currentToken[:data][-1][1] += replacement
end
|
||||
|
||||
# This method is a generic handler for emitting the tags. It also sets
|
||||
# the state to "data" because that's what's needed after a token has been
|
||||
# emitted.
|
||||
# Queues the token under construction for the consumer and returns the
# tokenizer to the data state.
def emitCurrentToken
  @tokenQueue << @currentToken
  @state = @states[:data]
end
|
||||
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||
# documents to figure out what the order of the various if and elsif
|
||||
# statements should be.
|
||||
|
||||
# Data state: reads one character and either switches state ("&" or
# "<", depending on the content model flag) or queues a run of
# characters.  Returns false at end of input, true otherwise.
def dataState
  c = @stream.char

  # Tokenization ends.
  return false if c == :EOF

  entitiesActive = (@contentModelFlag == :PCDATA ||
                    @contentModelFlag == :RCDATA)

  if c == "&" && entitiesActive
    @state = @states[:entityData]
  elsif c == "<" && @contentModelFlag != :PLAINTEXT
    @state = @states[:tagOpen]
  elsif SPACE_CHARACTERS.include?(c)
    # Directly after emitting a token you switch back to the "data
    # state". At that point SPACE_CHARACTERS are important so they are
    # emitted separately.
    # XXX need to check if we don't need a special "spaces" flag on
    # characters.
    whitespace = c + @stream.charsUntil(SPACE_CHARACTERS, true)
    @tokenQueue.push({:type => :SpaceCharacters, :data => whitespace})
  else
    text = c + @stream.charsUntil(["&", "<"])
    @tokenQueue.push({:type => :Characters, :data => text})
  end

  true
end
|
||||
|
||||
# Entity data state: queues the entity found at the current position as
# character data (a literal "&" when none is recognised) and goes back
# to the data state.  Always returns true.
def entityDataState
  text = consumeEntity || "&"
  @tokenQueue.push({:type => :Characters, :data => text})
  @state = @states[:data]
  true
end
|
||||
|
||||
# Tag open state: entered after "<".  Decides from the next character
# and the content model flag whether a tag, a markup declaration or
# plain text follows.  Always returns true.
def tagOpenState
  c = @stream.char

  unless @contentModelFlag == :PCDATA
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag.  Only a close tag is recognised here.
    if c == "/"
      @state = @states[:closeTagOpen]
    else
      @tokenQueue.push({:type => :Characters, :data => "<"})
      @stream.queue.insert(0, c)
      @state = @states[:data]
    end
    return true
  end

  if c == "!"
    @state = @states[:markupDeclarationOpen]
  elsif c == "/"
    @state = @states[:closeTagOpen]
  elsif c != :EOF and ASCII_LETTERS.include? c
    @currentToken = {:type => :StartTag, :name => c, :data => []}
    @state = @states[:tagName]
  else
    case c
    when ">"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected tag name. Got '>' instead.")})
      @tokenQueue.push({:type => :Characters, :data => "<>"})
      @state = @states[:data]
    when "?"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected tag name. Got '?' instead (HTML doesn't " +
        "support processing instructions).")})
      @stream.queue.push(c)
      @state = @states[:bogusComment]
    else
      # XXX
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected tag name. Got something else instead")})
      @tokenQueue.push({:type => :Characters, :data => "<"})
      @stream.queue.push(c)
      @state = @states[:data]
    end
  end

  true
end
|
||||
|
||||
# Close tag open state: entered after "</".  In RCDATA/CDATA content the
# end tag only counts when it matches the current token's name;
# otherwise "</" is emitted as text.  In PCDATA an end tag token is
# started.  Always returns true.
def closeTagOpenState
  if @contentModelFlag == :RCDATA || @contentModelFlag == :CDATA
    matchesOpenTag = false

    if @currentToken
      # So far we know that "</" has been consumed. We now need to know
      # whether the next few characters match the name of last emitted
      # start tag which also happens to be the currentToken. We also need
      # to have the character directly after the characters that could
      # match the start tag name.
      expectedName = @currentToken[:name]
      peeked = []
      (expectedName.length + 1).times do
        peeked.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if peeked[-1] == :EOF
      end

      # Since this is just for checking. We put the characters back on
      # the stack.
      @stream.queue += peeked

      matchesOpenTag =
        expectedName.downcase == peeked[0...-1].join('').downcase &&
        (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(peeked[-1])
    end

    unless matchesOpenTag
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Expected closing tag after seeing '</'. None found.")})
      @tokenQueue.push({:type => :Characters, :data => "</"})
      @state = @states[:data]

      # Nothing below applies to this case.
      return true
    end

    # Because the characters are correct we can safely switch to
    # PCDATA mode now. This also means we don't have to do it when
    # emitting the end tag token.
    @contentModelFlag = :PCDATA
  end

  return true unless @contentModelFlag == :PCDATA

  c = @stream.char
  if c == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected closing tag. Unexpected end of file.")})
    @tokenQueue.push({:type => :Characters, :data => "</"})
    @state = @states[:data]
  elsif ASCII_LETTERS.include? c
    @currentToken = {:type => :EndTag, :name => c, :data => []}
    @state = @states[:tagName]
  elsif c == ">"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
    @state = @states[:data]
  else
    # XXX data can be _'_...
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected closing tag. Unexpected character '" + c + "' found.")})
    @stream.queue.push(c)
    @state = @states[:bogusComment]
  end

  true
end
|
||||
|
||||
# Tag name state: extends the current token's name until whitespace,
# ">", "/", "<" or end of input is met.  Always returns true.
def tagNameState
  c = @stream.char

  if SPACE_CHARACTERS.include? c
    @state = @states[:beforeAttributeName]
  elsif c == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in the tag name.")})
    emitCurrentToken
  elsif ASCII_LETTERS.include? c
    # Take the whole run of letters in one go.
    letters = c + @stream.charsUntil(ASCII_LETTERS, true)
    @currentToken[:name] += letters
  else
    case c
    when ">"
      emitCurrentToken
    when "<"
      # A new tag starts before this one was closed: hand the "<" back
      # and emit what we have.
      @stream.queue.push(c)
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Unexpected < character when getting the tag name.")})
      emitCurrentToken
    when "/"
      processSolidusInTag
      @state = @states[:beforeAttributeName]
    else
      @currentToken[:name] += c
    end
  end

  true
end
|
||||
|
||||
# Before attribute name state: skips whitespace, then either finishes
# the tag (">", "<", end of input), handles a "/" or starts a new
# attribute with the character just read.  Always returns true.
def beforeAttributeNameState
  c = @stream.char

  if SPACE_CHARACTERS.include? c
    @stream.charsUntil(SPACE_CHARACTERS, true)
  else
    case c
    when :EOF
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Unexpected end of file. Expected attribute name instead.")})
      emitCurrentToken
    when ">"
      emitCurrentToken
    when "/"
      processSolidusInTag
    when "<"
      @stream.queue.push(c)
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Unexpected < character. Expected attribute name instead.")})
      emitCurrentToken
    else
      # Letters and every other character alike open a new attribute
      # whose name starts with that character.
      @currentToken[:data].push([c, ""])
      @state = @states[:attributeName]
    end
  end

  true
end
|
||||
|
||||
# Attribute name state: extends the name of the attribute being built.
# When the name is complete ("=", whitespace, "/" or ">") a ParseError
# is queued for every earlier attribute with the same name; the
# duplicate itself stays on the token.  Always returns true.
def attributeNameState
  c = @stream.char

  if c == "="
    @state = @states[:beforeAttributeValue]
  elsif c == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute name.")})
    emitCurrentToken
    return true
  elsif ASCII_LETTERS.include? c
    # Still inside the name: take the whole run of letters.
    @currentToken[:data][-1][0] += c +
      @stream.charsUntil(ASCII_LETTERS, true)
    return true
  elsif c == ">"
    # The token is emitted below, after the duplicate check.
  elsif SPACE_CHARACTERS.include? c
    @state = @states[:afterAttributeName]
  elsif c == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  elsif c == "<"
    @stream.queue.push(c)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character in attribute name.")})
    emitCurrentToken
    return true
  else
    @currentToken[:data][-1][0] += c
    return true
  end

  # The attribute name is complete.  Attributes are not dropped at this
  # stage, so values can still be safely appended to them, but the parse
  # error is reported now.
  finishedName = @currentToken[:data][-1][0]
  earlierUses = @currentToken[:data][0...-1].select { |name, value| name == finishedName }
  earlierUses.each do
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Dropped duplicate attribute on tag.")})
  end

  emitCurrentToken if c == ">"

  true
end
|
||||
|
||||
# After-attribute-name state: entered once whitespace has ended an attribute
# name; decides between "=", the end of the tag, or the start of another
# attribute.
#
# The :EOF test now comes before anything that starts a new attribute, in the
# same order the other tag states use. The original ran
# ASCII_LETTERS.include?(data) first, which raises TypeError on the :EOF
# symbol if that constant is a String (its definition is not visible here).
# The letter branch was identical to the catch-all branch, so it is folded
# into the final else. Always returns true.
def afterAttributeNameState
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Swallow the rest of the whitespace run.
    @stream.charsUntil(SPACE_CHARACTERS, true)
  elsif data == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected = or end of tag.")})
    emitCurrentToken
  elsif data == "="
    @state = @states[:beforeAttributeValue]
  elsif data == ">"
    emitCurrentToken
  elsif data == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  elsif data == "<"
    # Push the "<" back so it is read again after the token is emitted.
    @stream.queue.push(data)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected = or end of tag.")})
    emitCurrentToken
  else
    # Letters and every remaining character alike start a new attribute.
    @currentToken[:data].push([data, ""])
    @state = @states[:attributeName]
  end
  return true
end
|
||||
|
||||
# Before-attribute-value state: runs after the "=" of an attribute and picks
# the quoted or unquoted value state from the next character.
#
# An "&" is pushed back onto the stream queue so the unquoted-value state
# reads it again. Always returns true.
def beforeAttributeValueState
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    @stream.charsUntil(SPACE_CHARACTERS, true)
    return true
  end

  case ch
  when "\""
    @state = @states[:attributeValueDoubleQuoted]
  when "&"
    @state = @states[:attributeValueUnQuoted]
    @stream.queue.push(ch)
  when "'"
    @state = @states[:attributeValueSingleQuoted]
  when ">"
    emitCurrentToken
  when "<"
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected attribute value.")})
    emitCurrentToken
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected attribute value.")})
    emitCurrentToken
  else
    # First character of an unquoted value.
    @currentToken[:data][-1][1] += ch
    @state = @states[:attributeValueUnQuoted]
  end
  true
end
|
||||
|
||||
# Double-quoted attribute value state: collects value text up to the closing
# double quote, handing "&" to processEntityInAttribute. Always returns true.
def attributeValueDoubleQuotedState
  ch = @stream.char
  case ch
  when "\""
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (\").")})
    emitCurrentToken
  else
    # Read ahead to the next character that needs special handling.
    rest = @stream.charsUntil(["\"", "&"])
    @currentToken[:data][-1][1] += ch + rest
  end
  true
end
|
||||
|
||||
# Single-quoted attribute value state: collects value text up to the closing
# apostrophe, handing "&" to processEntityInAttribute. Always returns true.
def attributeValueSingleQuotedState
  ch = @stream.char
  case ch
  when "'"
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (').")})
    emitCurrentToken
  else
    # Read ahead to the next character that needs special handling.
    rest = @stream.charsUntil(["'", "&"])
    @currentToken[:data][-1][1] += ch + rest
  end
  true
end
|
||||
|
||||
# Unquoted attribute value state: collects value text until whitespace, "&",
# ">", "<" or EOF. Always returns true.
def attributeValueUnQuotedState
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:beforeAttributeName]
  when ch == "&"
    processEntityInAttribute
  when ch == ">"
    emitCurrentToken
  when ch == "<"
    # Push the "<" back so it is read again after the token is emitted.
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character in attribute value.")})
    emitCurrentToken
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value.")})
    emitCurrentToken
  else
    terminators = ["&", ">","<"] + SPACE_CHARACTERS
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(terminators)
  end
  true
end
|
||||
|
||||
# Bogus comment state: turns everything up to the next ">" into a Comment
# token and goes back to the data state. Always returns true.
def bogusCommentState
  # Read up to, but not including, the first ">".
  text = @stream.charsUntil((">"))
  @tokenQueue.push({:type => :Comment, :data => text})

  # Discard the character the read stopped on.
  @stream.char
  @state = @states[:data]
  true
end
|
||||
|
||||
# Markup declaration open state: reads ahead to tell a comment ("--") from a
# DOCTYPE declaration; anything else is a parse error and is handed to the
# bogus comment state.
# Always returns true.
def markupDeclarationOpenState
  seen = Array.new(2) { @stream.char }
  if seen == ["-", "-"]
    @currentToken = {:type => :Comment, :data => ""}
    @state = @states[:comment]
    return true
  end

  # Not a comment: read five more characters, seven in total.
  5.times { seen.push(@stream.char) }

  # :EOF is checked first so that only plain strings are joined.
  if !seen.include?(:EOF) && seen.join("").upcase == "DOCTYPE"
    @currentToken = {:type => :Doctype, :name => "", :data => true}
    @state = @states[:doctype]
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected '--' or 'DOCTYPE'. Not found.")})
    # Add everything that was read to the stream queue again.
    @stream.queue += seen
    @state = @states[:bogusComment]
  end
  true
end
|
||||
|
||||
# Comment state: appends text to the current comment token until a "-" is
# seen. At EOF it queues a ParseError, pushes the token and returns to the
# data state. Always returns true.
def commentState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentDash]
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # Take everything up to the next dash in one step.
    chunk = @stream.charsUntil("-")
    @currentToken[:data] += ch + chunk
  end
  true
end
|
||||
|
||||
# Comment dash state: a single "-" has been seen inside a comment. A second
# "-" moves to the comment-end state; any other character means the dash was
# ordinary comment text. Always returns true.
def commentDashState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentEnd]
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment (-)")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    chunk = @stream.charsUntil("-")
    @currentToken[:data] += "-" + ch + chunk
    # Also swallow the character the read stopped on (a "-" or :EOF). The
    # state is left unchanged, so a directly following "-" leads to the
    # comment-end state without a ParseError.
    @stream.char
  end
  true
end
|
||||
|
||||
# Comment end state: "--" has been seen inside a comment. ">" finishes the
# comment; anything else is a parse error and the dashes are kept as comment
# text. Always returns true.
def commentEndState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when "-"
    # A third dash: keep it as text and stay in this state.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected '-' after '--' found in comment.")})
    @currentToken[:data] += ch
  when :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in comment (--).")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # XXX
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected character in comment found.")})
    @currentToken[:data] += "--" + ch
    @state = @states[:comment]
  end
  true
end
|
||||
|
||||
# DOCTYPE state: expects whitespace after the literal "DOCTYPE". Either way
# the next state is before-DOCTYPE-name; a non-space character is reported
# and pushed back so that state reads it again. Always returns true.
def doctypeState
  ch = @stream.char
  unless SPACE_CHARACTERS.include?(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("No space after literal string 'DOCTYPE'.")})
    @stream.queue.push(ch)
  end
  @state = @states[:beforeDoctypeName]
  true
end
|
||||
|
||||
# Before-DOCTYPE-name state: skips whitespace, then stores the first
# character of the DOCTYPE name (upper-cased when it is an ASCII lowercase
# letter) and moves to the DOCTYPE-name state.
#
# ">" and :EOF are now handled before the lowercase test. The original ran
# ASCII_LOWERCASE.include?(data) first, which raises TypeError on the :EOF
# symbol if that constant is a String (its definition is not visible here).
# The lowercase test now sits in the final branch, the same place
# doctypeNameState has it. Always returns true.
def beforeDoctypeNameState
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Leading whitespace is ignored; stay in this state.
  elsif data == ">"
    # The character has already been consumed from the stream, so the
    # token is pushed directly.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected > character. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif data == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # Only ASCII lowercase letters are upper-cased; other characters
    # (non-ASCII ones, for instance) are stored untouched.
    data = data.upcase if ASCII_LOWERCASE.include? data
    @currentToken[:name] = data
    @state = @states[:doctypeName]
  end
  return true
end
|
||||
|
||||
# DOCTYPE name state: accumulates the upper-cased DOCTYPE name. Whenever the
# name was extended, or ended by whitespace, and now reads exactly "HTML",
# the token's :data flag is set to false. Always returns true.
def doctypeNameState
  ch = @stream.char
  check_name = false
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = @states[:afterDoctypeName]
    check_name = true
  when ch == ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when ch == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    # Only ASCII lowercase letters are upper-cased; other characters
    # (non-ASCII ones, for instance) are appended untouched.
    ch = ch.upcase if ASCII_LOWERCASE.include?(ch)
    @currentToken[:name] += ch
    check_name = true
  end

  # After enough passes through this state the name should read "HTML".
  @currentToken[:data] = false if check_name and @currentToken[:name] == "HTML"
  true
end
|
||||
|
||||
# After-DOCTYPE-name state: only whitespace or ">" may follow the name.
# Anything else sets the token's :data flag to true; EOF additionally pushes
# the token, any other character switches to the bogus-DOCTYPE state.
# Always returns true.
def afterDoctypeNameState
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when :EOF
    @currentToken[:data] = true
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Expected space or '>'. Got '" + ch + "'")})
    @currentToken[:data] = true
    @state = @states[:bogusDoctype]
  end
  true
end
|
||||
|
||||
# Bogus DOCTYPE state: discards characters until ">" or EOF, then pushes the
# DOCTYPE token and returns to the data state. Always returns true.
def bogusDoctypeState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  when :EOF
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file in bogus doctype.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  end
  true
end
|
||||
|
||||
# Wrapper applied to every parse-error message in this class; it returns the
# message unchanged.
def _(string)
  string
end
|
||||
end
|
||||
|
||||
end
|
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
module HTML5lib
  module TreeBuilders

    class << self
      # Returns the TreeBuilder class registered under +name+ (matched
      # case-insensitively via to_s): 'simpletree', 'rexml' or 'hpricot'.
      # The implementing file is only required when it is asked for.
      # Raises RuntimeError for any other name.
      def getTreeBuilder(name)
        wanted = name.to_s.downcase
        if wanted == 'simpletree'
          require 'html5lib/treebuilders/simpletree'
          SimpleTree::TreeBuilder
        elsif wanted == 'rexml'
          require 'html5lib/treebuilders/rexml'
          REXMLTree::TreeBuilder
        elsif wanted == 'hpricot'
          require 'html5lib/treebuilders/hpricot'
          Hpricot::TreeBuilder
        else
          raise "Unknown TreeBuilder #{name}"
        end
      end
    end

  end
end
|
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
|
@ -0,0 +1,330 @@
|
|||
require 'html5lib/constants'
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = nil
|
||||
|
||||
module TreeBuilders
|
||||
module Base
|
||||
|
||||
# Abstract tree node. Concrete tree builders subclass it and supply the
# methods that raise NotImplementedError here.
class Node
  # parent:     the node this one is attached to (nil for the document node)
  # childNodes: list of child nodes; must hold every element child, other
  #             node types are optional
  # _flags:     list of miscellaneous flags that can be set on the node
  attr_accessor :parent, :childNodes, :_flags

  # +name+ is accepted for subclasses but not stored by this base class.
  def initialize(name)
    @parent     = nil
    @childNodes = []
    @_flags     = []
  end

  # Insert +node+ as a child of this node.
  def appendChild(node)
    fail NotImplementedError
  end

  # Insert +data+ as text in this node, positioned before the start of node
  # +insertBefore+, or at the end of this node's text.
  def insertText(data, insertBefore = nil)
    fail NotImplementedError
  end

  # Insert +node+ as a child of this node, in front of +refNode+ in the
  # list of child nodes. Raises ValueError if +refNode+ is not a child of
  # this node.
  def insertBefore(node, refNode)
    fail NotImplementedError
  end

  # Remove +node+ from the children of this node.
  def removeChild(node)
    fail NotImplementedError
  end

  # Move every child of this node over to +newParent+ using its
  # appendChild, then start again with an empty child list. Needed so that
  # trees which do not store text as nodes move the text correctly.
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    @childNodes.each(&newParent.method(:appendChild))
    @childNodes = []
  end

  # Return a shallow copy of this node: same name and attributes, but no
  # parent and no children.
  def cloneNode
    fail NotImplementedError
  end

  # Return true if the node has children or text, false otherwise.
  def hasContent
    fail NotImplementedError
  end
end
|
||||
|
||||
# Base treebuilder implementation
|
||||
class TreeBuilder
|
||||
|
||||
attr_accessor :openElements
|
||||
|
||||
attr_accessor :activeFormattingElements
|
||||
|
||||
attr_accessor :document
|
||||
|
||||
attr_accessor :headPointer
|
||||
|
||||
attr_accessor :formPointer
|
||||
|
||||
# Class to use for document root
|
||||
documentClass = nil
|
||||
|
||||
# Class to use for HTML elements
|
||||
elementClass = nil
|
||||
|
||||
# Class to use for comments
|
||||
commentClass = nil
|
||||
|
||||
# Class to use for doctypes
|
||||
doctypeClass = nil
|
||||
|
||||
# Fragment class
|
||||
fragmentClass = nil
|
||||
|
||||
# A new builder starts in the state #reset produces.
# NOTE(review): #reset instantiates @documentClass, which nothing visible in
# this class assigns before this runs — presumably subclasses set it; confirm.
def initialize
  reset()
end
|
||||
|
||||
# Return the builder to its initial state: empty element stacks, no head or
# form element, normal (non-table) insertion, and a fresh document node
# created from @documentClass.
def reset
  @openElements, @activeFormattingElements = [], []

  #XXX - rename these to headElement, formElement
  @headPointer = @formPointer = nil

  self.insertFromTable = false

  @document = @documentClass.new
end
|
||||
|
||||
# Is an element named +target+ in scope on the stack of open elements?
#
# The stack is walked from the most recently opened element downwards. The
# search stops with false at a "table" element, at any SCOPING_ELEMENTS
# member (unless +tableVariant+ is true) or at "html".
#
# Raises RuntimeError if the stack is exhausted without reaching one of
# those boundaries. The original called `assert false` there, but Ruby has
# no such method, so the failure surfaced as an unrelated NoMethodError. An
# empty stack now ends up at the same error instead of failing on nil.
def elementInScope(target, tableVariant = false)
  # Exit early when possible.
  top = @openElements.last
  return true if top and top.name == target

  @openElements.reverse_each do |element|
    if element.name == target
      return true
    elsif element.name == 'table'
      return false
    elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
      return false
    elsif element.name == 'html'
      return false
    end
  end

  # We should never reach this point.
  raise "elementInScope: no html element found on the stack of open elements"
end
|
||||
|
||||
# Re-open the formatting elements at the end of the active-formatting list
# that are no longer on the stack of open elements.
#
# Each affected entry after the last marker / still-open element is cloned,
# inserted through insertElement, and replaced in the list by the inserted
# element. The step numbers refer to the specification's algorithm; their
# order here differs slightly but the effect is the same.
#
# Changes from the original:
# * the "nothing to do" guard tests for an empty list (an empty Array is
#   truthy, so `return unless @activeFormattingElements` never fired);
# * the rewind uses explicit bounds instead of a `rescue` that could never
#   trigger (Array#[] returns nil out of range) and instead of relying on
#   that nil being equal to Marker;
# * the loop ends on the last index rather than on `==` between elements.
def reconstructActiveFormattingElements
  # Step 1: stop the algorithm when there's nothing to do.
  return if @activeFormattingElements.nil? or @activeFormattingElements.empty?

  # Steps 2 and 3: start with the last entry.
  last = @activeFormattingElements.length - 1
  i = last
  entry = @activeFormattingElements[i]
  return if entry == Marker or @openElements.include?(entry)

  # Steps 4 to 6: rewind while the previous entry also needs re-opening,
  # stopping at the start of the list.
  while i > 0
    earlier = @activeFormattingElements[i - 1]
    break if earlier == Marker or @openElements.include?(earlier)
    i -= 1
  end

  # Steps 7 to 11: recreate every entry from i to the end of the list.
  i.upto(last) do |position|
    # Step 8
    clone = @activeFormattingElements[position].cloneNode

    # Step 9
    element = insertElement(clone.name, clone.attributes)

    # Step 10
    @activeFormattingElements[position] = element
  end
  nil
end
|
||||
|
||||
# Pop entries off the active-formatting list up to and including the most
# recent Marker, or until the list is empty.
def clearActiveFormattingElements
  until @activeFormattingElements.empty?
    break if @activeFormattingElements.pop == Marker
  end
end
|
||||
|
||||
# Check if an element exists between the end of the active
|
||||
# formatting elements and the last marker. If it does, return it, else
|
||||
# return false
|
||||
def elementInActiveFormattingElements(name)
  # Walk backwards from the newest entry; the search ends at the first
  # Marker or at the start of the list.
  position = @activeFormattingElements.length - 1
  while position >= 0
    candidate = @activeFormattingElements[position]
    # Test for Marker first: a Marker has no name attribute.
    return false if candidate == Marker
    return candidate if candidate.name == name
    position -= 1
  end
  false
end
|
||||
|
||||
# Create a doctype node named +name+ from @doctypeClass and append it to the
# document. Returns whatever the document's appendChild returns.
def insertDoctype(name)
  doctype = @doctypeClass.new(name)
  @document.appendChild(doctype)
end
|
||||
|
||||
# Append a comment node holding +data+ to +parent+, or to the element on top
# of the stack of open elements when no parent is given.
def insertComment(data, parent = nil)
  target = parent.nil? ? @openElements[-1] : parent
  target.appendChild(@commentClass.new(data))
end
|
||||
|
||||
# Create an element but don't insert it anywhere
|
||||
def createElement(name, attributes)
  # Build the node from @elementClass and copy the attributes over; the
  # caller decides where, if anywhere, it goes.
  node = @elementClass.new(name)
  node.attributes = attributes
  node
end
|
||||
|
||||
# Switch the function used to insert an element from the
|
||||
# normal one to the misnested table one and back again
|
||||
def insertFromTable=(value)
  @insertFromTable = value
  # insertElement dispatches on this symbol.
  if value
    @insertElement = :insertElementTable
  else
    @insertElement = :insertElementNormal
  end
end
|
||||
|
||||
# Insert an element using the method named by @insertElement, which
# insertFromTable= sets to :insertElementTable or :insertElementNormal.
def insertElement(name, attributes)
  strategy = @insertElement
  __send__(strategy, name, attributes)
end
|
||||
|
||||
# Create an element, append it to the element on top of the stack of open
# elements, and push it onto that stack. Returns the new element.
def insertElementNormal(name, attributes)
  node = @elementClass.new(name)
  node.attributes = attributes
  current = @openElements[-1]
  current.appendChild(node)
  @openElements << node
  node
end
|
||||
|
||||
# Create an element and insert it into the tree
|
||||
def insertElementTable(name, attributes)
  # Outside the table-related elements this is an ordinary insert. The
  # original built an element first and then threw it away on this path,
  # because insertElementNormal creates its own; the check now comes first
  # so only one element is ever constructed.
  unless TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
    return insertElementNormal(name, attributes)
  end

  element = @elementClass.new(name)
  element.attributes = attributes

  # We should be in the InTable mode. This means we want to do special
  # magic element rearranging: the element goes to the position reported
  # by getTableMisnestedNodePosition instead of under the current node.
  parent, insertBefore = getTableMisnestedNodePosition
  if insertBefore.nil?
    parent.appendChild(element)
  else
    parent.insertBefore(element, insertBefore)
  end
  @openElements.push(element)
  return element
end
|
||||
|
||||
# Insert +data+ as text under +parent+, which defaults to the element on top
# of the stack of open elements. While table insertion is switched on and
# the top of the stack is one of TABLE_INSERT_MODE_ELEMENTS, the text goes
# to the position reported by getTableMisnestedNodePosition instead.
def insertText(data, parent = nil)
  target = parent.nil? ? @openElements[-1] : parent

  if @insertFromTable and TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
    # We should be in the InTable mode. This means we want to do special
    # magic element rearranging.
    target, before = getTableMisnestedNodePosition
    target.insertText(data, before)
  else
    target.insertText(data)
  end
end
|
||||
|
||||
# Get the foster parent element, and sibling to insert before
|
||||
# (or nil) when inserting a misnested table node
|
||||
def getTableMisnestedNodePosition
  # The foster parent element is the one which comes before the most
  # recently opened table element.
  #XXX - this is really inelegant
  lastTable = @openElements.reverse.find { |el| el.name == "table" }

  # No table open at all: use the bottom of the stack, appending.
  return @openElements[0], nil unless lastTable

  #XXX - we should really check that this parent is actually a node here
  if lastTable.parent
    # Insert into the table's parent, directly in front of the table.
    return lastTable.parent, lastTable
  end

  # Parentless table: use the stack entry just below it, appending.
  return @openElements[@openElements.index(lastTable) - 1], nil
end
|
||||
|
||||
# Pop elements off the stack of open elements for as long as the top one is
# dd, dt, li, p, td, th or tr and is not named +exclude+.
def generateImpliedEndTags(exclude = nil)
  implied = ['dd', 'dt', 'li', 'p', 'td', 'th', 'tr']
  loop do
    current = @openElements[-1].name
    break unless implied.include?(current) && current != exclude
    # XXX This is not entirely what the specification says. We should
    # investigate it more closely.
    @openElements.pop
  end
  nil
end
|
||||
|
||||
# Return the document node created by the last #reset.
def getDocument
  return @document
end
|
||||
|
||||
# Create a fragment node from @fragmentClass, move the children of the
# bottom-most open element into it, and return it.
def getFragment
  #assert @innerHTML
  root = @openElements[0]
  holder = @fragmentClass.new
  root.reparentChildren(holder)
  holder
end
|
||||
|
||||
# Serialize the subtree of node in the format required by unit tests
|
||||
# node - the node from which to start serializing
|
||||
def testSerializer(node)
  # Abstract in the base class.
  fail NotImplementedError
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
|
@ -0,0 +1,211 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
# Base class for the Hpricot-backed nodes. Each node wraps an Hpricot object
# (#hpricot) and keeps its own childNodes list alongside hpricot.children.
class Node < Base::Node

  extend Forwardable

  def_delegators :@hpricot, :name

  # The wrapped Hpricot object.
  attr_accessor :hpricot

  # Wraps a new instance of the subclass's hpricot_class built from +name+.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Append +node+ as the last child. A text node that follows another text
  # node is merged into it instead of being added as a separate child.
  def appendChild(node)
    if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
      childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
    else
      childNodes << node
      hpricot.children << node.hpricot
    end
    node.parent = self
  end

  # Remove +node+ from both the wrapper list and the Hpricot child list.
  def removeChild(node)
    childNodes.delete(node)
    hpricot.children.delete_at(hpricot.children.index(node.hpricot))
    node.parent = nil
  end

  # Insert +data+ as a text node, in front of +before+ when given, otherwise
  # at the end.
  def insertText(data, before = nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Insert +node+ in front of +refNode+. A text node that lands directly
  # after another text node is merged into it.
  #
  # The original only updated childNodes, so the inserted node never reached
  # the Hpricot tree and had no parent. It is now also inserted into
  # hpricot.children and its parent is set, as appendChild does.
  #
  # Raises ArgumentError if +refNode+ is not a child of this node (before,
  # that surfaced as NoMethodError or TypeError on a nil index).
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, "refNode is not a child of this node" if index.nil?

    if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
      childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
    else
      childNodes.insert(index, node)
      # appendChild and removeChild keep hpricot.children in step with
      # childNodes, so the same index applies to both lists.
      hpricot.children.insert(index, node.hpricot)
    end
    node.parent = self
  end

  # True when the node has at least one child.
  def hasContent
    childNodes.any?
  end
end
|
||||
|
||||
# Element node backed by an Hpricot::Elem.
class Element < Node
  class << self
    def hpricot_class
      ::Hpricot::Elem
    end
  end

  # The wrapped Elem is rebuilt from a proper start tag after the generic
  # Node setup has run.
  def initialize(name)
    super(name)

    start_tag = ::Hpricot::STag.new(name)
    @hpricot = ::Hpricot::Elem.new(start_tag)
  end

  # The tag name, read from the element's start tag.
  def name
    @hpricot.stag.name
  end

  # A new element of the same class and name carrying copies of the
  # attributes; it has no parent and no children.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |key, value| copy.hpricot[key] = value }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      store = @hpricot.stag.send(stag_attributes_method)
      store[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      if @hpricot.stag.respond_to?(:raw_attributes)
        :raw_attributes
      else
        :attributes
      end
    end

    # Everything else is answered by the element's attributes hash.
    def method_missing(*args, &block)
      @hpricot.attributes.send(*args, &block)
    end
  end

  # A fresh proxy over the element's attributes.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copy every name/value pair of +attrs+ onto the wrapped element.
  def attributes=(attrs)
    attrs.each do |key, value|
      @hpricot[key] = value
    end
  end

  # Serialise the subtree in the test format: the tag, its attributes
  # (except xmlns) two columns deeper, then the children.
  def printTree(indent = 0)
    out = "\n|#{' ' * indent}<#{name}>"
    inner = indent + 2
    attributes.each do |key, value|
      out += "\n|#{' ' * inner}#{key}=\"#{value}\"" unless key == 'xmlns'
    end
    childNodes.each { |child| out += child.printTree(inner) }
    out
  end
end
|
||||
|
||||
# Document node backed by an Hpricot::Doc.
class Document < Node
  class << self
    def hpricot_class
      ::Hpricot::Doc
    end
  end

  # A document is built with nil in place of a name.
  def initialize
    super(nil)
  end

  # "#document" followed by every child printed two columns deeper than
  # +indent+.
  def printTree(indent = 0)
    '#document' + childNodes.map { |child| child.printTree(indent + 2) }.join
  end
end
|
||||
|
||||
# Doctype node backed by an Hpricot::DocType.
class DocumentType < Node
  class << self
    def hpricot_class
      ::Hpricot::DocType
    end
  end

  def initialize(name)
    # The generic Node setup builds the Hpricot object from the name alone;
    # an ArgumentError from that is ignored because the doctype is built
    # explicitly, with three arguments, right after.
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end

    @hpricot = ::Hpricot::DocType.new(name, nil, nil)
  end

  def printTree(indent = 0)
    padding = ' ' * indent
    "\n|#{padding}<!DOCTYPE #{hpricot.target}>"
  end
end
|
||||
|
||||
# Fragment container: an Element with an empty name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # The children printed two columns deeper than +indent+; the fragment
  # itself adds no line.
  def printTree(indent = 0)
    childNodes.map { |child| child.printTree(indent+2) }.join
  end
end
|
||||
|
||||
# Text node backed by an Hpricot::Text. The constructor deliberately does not
# call super, so only @hpricot is set.
class TextNode < Node
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # The text content in double quotes, indented by +indent+ columns.
  def printTree(indent = 0)
    padding = ' ' * indent
    "\n|#{padding}\"#{hpricot.content}\""
  end
end
|
||||
|
||||
# Comment node backed by an Hpricot::Comment.
class CommentNode < Node
  class << self
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # The comment in <!-- --> form, indented by +indent+ columns.
  def printTree(indent = 0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
|
||||
|
||||
# Tree builder producing Hpricot objects.
class TreeBuilder < Base::TreeBuilder
  # Only selects the node classes; it does not call the base constructor.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serialise +node+ via its printTree.
  def testSerializer(node)
    node.printTree
  end

  # The Hpricot object wrapped by the document node.
  def getDocument
    @document.hpricot
  end

  # Build the fragment through the base class, keep it as the current
  # document, and return the children of its Hpricot object.
  def getFragment
    fragment = super
    @document = fragment
    fragment.hpricot.children
  end
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
|
@ -0,0 +1,191 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'rexml/document'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module REXMLTree
|
||||
|
||||
# Base class for the REXML-backed nodes. Each node wraps a REXML object
# (#rxobj) and keeps its own childNodes list alongside the REXML children.
class Node < Base::Node
  extend Forwardable
  def_delegators :@rxobj, :name, :attributes

  # The wrapped REXML object.
  attr_accessor :rxobj

  # Wraps a new instance of the subclass's rxclass built from +name+.
  def initialize name
    super name
    @rxobj = self.class.rxclass.new name
  end

  # Append +node+ as the last child. A text node that follows another text
  # node is merged into it instead of being added as a separate child.
  def appendChild node
    if node.kind_of? TextNode and
      childNodes.length>0 and childNodes[-1].kind_of? TextNode
      childNodes[-1].rxobj.value =
        childNodes[-1].rxobj.to_s + node.rxobj.to_s
      childNodes[-1].rxobj.raw = true
    else
      childNodes.push node
      rxobj.add node.rxobj
    end
    node.parent = self
  end

  # Remove +node+ from both the wrapper list and the REXML tree.
  def removeChild node
    childNodes.delete node
    rxobj.delete node.rxobj
    node.parent = nil
  end

  # Insert +data+ as a text node, in front of +before+ when given, otherwise
  # at the end.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Insert +node+ in front of +refNode+. A text node that lands directly
  # after another text node is merged into it.
  #
  # The original only updated childNodes, so the inserted node never reached
  # the REXML tree and had no parent. It is now also inserted into the
  # REXML children and its parent is set, as appendChild does.
  #
  # Raises ArgumentError if +refNode+ is not a child of this node (before,
  # that surfaced as NoMethodError or TypeError on a nil index).
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    raise ArgumentError, "refNode is not a child of this node" if index.nil?

    if node.kind_of? TextNode and index>0 and
      childNodes[index-1].kind_of? TextNode
      childNodes[index-1].rxobj.value =
        childNodes[index-1].rxobj.to_s + node.rxobj.to_s
      childNodes[index-1].rxobj.raw = true
    else
      childNodes.insert index, node
      # REXML::Parent#insert_before(child1, child2) puts child2 in front
      # of child1.
      rxobj.insert_before(refNode.rxobj, node.rxobj)
    end
    node.parent = self
  end

  # True when the node has at least one child.
  def hasContent
    return (childNodes.length > 0)
  end
end
|
||||
|
||||
# Element node backed by a REXML::Element.
class Element < Node
  class << self
    def rxclass
      REXML::Element
    end
  end

  def initialize name
    super name
  end

  # A new element of the same class and name carrying copies of the
  # attributes; it has no parent and no children.
  def cloneNode
    copy = self.class.new(name)
    attributes.each do |key, val|
      copy.attributes[key] = val
    end
    copy
  end

  # Copy every name/value pair of +value+ onto the wrapped element.
  def attributes= value
    value.each { |key, val| rxobj.attributes[key] = val }
  end

  # Serialise the subtree in the test format: the tag, its attributes
  # (except xmlns) two columns deeper, then the children.
  def printTree indent=0
    out = "\n|#{' ' * indent}<#{name}>"
    inner = indent + 2
    attributes.each do |key, val|
      next if key == 'xmlns'
      out += "\n|#{' ' * inner}#{key}=\"#{val}\""
    end
    childNodes.each { |child| out += child.printTree(inner) }
    out
  end
end
|
||||
|
||||
# Document node backed by a REXML::Document.
class Document < Node
  class << self
    def rxclass
      REXML::Document
    end
  end

  # A document is built with nil in place of a name.
  def initialize
    super nil
  end

  # An <html> element is given the XHTML namespace before it is appended.
  def appendChild node
    is_html_root = (node.kind_of? Element and node.name == 'html')
    node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') if is_html_root
    super node
  end

  # "#document" followed by every child printed two columns deeper than
  # +indent+.
  def printTree indent=0
    out = "#document"
    childNodes.each { |child| out += child.printTree(indent + 2) }
    out
  end
end
|
||||
|
||||
# Tree-builder doctype backed by a REXML::DocType.
class DocumentType < Node
  # REXML class wrapped by this node type.
  def self.rxclass
    REXML::DocType
  end

  # Renders the doctype as a single dump line.
  def printTree(indent = 0)
    padding = ' ' * indent
    "\n|#{padding}<!DOCTYPE #{name}>"
  end
end
|
||||
|
||||
# Nameless element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Renders only the children; the fragment itself produces no dump line.
  def printTree(indent = 0)
    childNodes.inject("") do |dump, child|
      dump + child.printTree(indent + 2)
    end
  end
end
|
||||
|
||||
# Tree-builder text node backed by a REXML::Text.
class TextNode < Node
  # Builds the REXML text for +data+.
  #
  # The text is handed to REXML::Text.new with its fourth (raw) argument set
  # to true, so the markup-significant characters are escaped here first.
  # "&" is replaced before "<" and ">" so the ampersands introduced by the
  # later replacements are not escaped a second time.
  def initialize(data)
    raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
    @rxobj = REXML::Text.new(raw, true, nil, true)
  end

  # Renders the text value, double-quoted, as a single dump line.
  def printTree(indent = 0)
    "\n|#{' ' * indent}\"#{rxobj.value}\""
  end
end
|
||||
|
||||
# Tree-builder comment backed by a REXML::Comment.
class CommentNode < Node
  # REXML class wrapped by this node type.
  def self.rxclass
    REXML::Comment
  end

  # Renders the comment text as a single dump line.
  def printTree(indent = 0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{rxobj.string} -->"
  end
end
|
||||
|
||||
# Tree builder that produces REXML-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes defined in this module.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Dump of +node+ in the "|"-prefixed test format.
  def testSerializer(node)
    node.printTree
  end

  # The REXML object of the built document.
  def getDocument
    @document.rxobj
  end

  # Stores the fragment returned by the superclass as the current document
  # and returns the children of its REXML object.
  def getFragment
    @document = super
    @document.rxobj.children
  end
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
|
@ -0,0 +1,178 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module SimpleTree
|
||||
|
||||
class Node < Base::Node
  # Node representing an item in the tree.

  # The tag name associated with the node.
  attr_accessor :name

  # The value of the current node (applies to text nodes and comments).
  attr_accessor :value

  # A hash holding name => value pairs for the attributes of the node.
  attr_accessor :attributes

  def initialize(name)
    super
    @name = name
    @value = nil
    @attributes = {}
  end

  # Appends +node+ as the last child and makes this node its parent. A
  # TextNode following another TextNode is merged into that sibling's value
  # instead of being added as a separate child.
  def appendChild(node)
    last_child = childNodes[-1]
    if node.kind_of?(TextNode) && last_child.kind_of?(TextNode)
      last_child.value += node.value
    else
      childNodes.push(node)
    end
    node.parent = self
  end

  # Removes +node+ from the children and clears its parent link.
  def removeChild(node)
    childNodes.delete(node)
    node.parent = nil
  end

  # Returns a new node of the same class with the same name, value and a
  # copy of the attributes. Children are not copied.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy.value = value
    copy
  end

  # Wraps +data+ in a TextNode and inserts it before +before+ when given,
  # otherwise appends it.
  def insertText(data, before = nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+ and makes
  # this node its parent, as appendChild does. A TextNode landing directly
  # after another TextNode is merged into that sibling instead.
  #
  # Raises ArgumentError when +refNode+ is not one of this node's children.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    raise ArgumentError, 'refNode is not a child of this node' if index.nil?

    previous = index > 0 ? childNodes[index - 1] : nil
    if node.kind_of?(TextNode) && previous.kind_of?(TextNode)
      previous.value += node.value
    else
      childNodes.insert(index, node)
    end
    node.parent = self
  end

  # Renders this node (via to_s) and all of its children in the indented
  # "|"-prefixed dump format.
  def printTree(indent = 0)
    tree = "\n|%s%s" % [' ' * indent, to_s]
    childNodes.each do |child|
      tree += child.printTree(indent + 2)
    end
    tree
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.length > 0
  end
end
|
||||
|
||||
# Element node of the simple tree.
class Element < Node
  # The element rendered as "<name>".
  def to_s
    "<%s>" % name
  end

  # Renders the element, then one line per attribute, then the children,
  # each level indented two columns further.
  def printTree(indent = 0)
    dump = "\n|%s%s" % [' ' * indent, to_s]
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      dump += "\n|%s%s=\"%s\"" % [' ' * inner, attr_name, attr_value]
    end
    childNodes.each do |child|
      dump += child.printTree(inner)
    end
    dump
  end
end
|
||||
|
||||
# Root document node of the simple tree.
class Document < Node
  def to_s
    "#document"
  end

  def initialize
    super(nil)
  end

  # Renders "#document" followed by the dump of every child.
  def printTree(indent = 0)
    childNodes.inject(to_s) do |dump, child|
      dump + child.printTree(indent + 2)
    end
  end
end
|
||||
|
||||
# Doctype node of the simple tree.
class DocumentType < Node
  # The doctype rendered as "<!DOCTYPE name>".
  def to_s
    "<!DOCTYPE %s>" % name
  end
end
|
||||
|
||||
# Nameless element used as the root of a parsed fragment.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Renders only the children; the fragment itself produces no dump line.
  def printTree(indent = 0)
    childNodes.inject("") do |dump, child|
      dump + child.printTree(indent + 2)
    end
  end
end
|
||||
|
||||
# Text node of the simple tree; the text is held in +value+.
class TextNode < Node
  def initialize(value)
    super(nil)
    @value = value
  end

  # The text wrapped in double quotes.
  def to_s
    '"%s"' % value
  end
end
|
||||
|
||||
# Comment node of the simple tree; the comment text is held in +value+.
class CommentNode < Node
  def initialize(value)
    super(nil)
    @value = value
  end

  # The comment rendered as "<!-- text -->".
  def to_s
    "<!-- %s -->" % value
  end
end
|
||||
|
||||
# Tree builder that produces simple-tree nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes defined in this module.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Dump of +node+ in the "|"-prefixed test format.
  def testSerializer(node)
    node.printTree
  end

  # Stores the fragment returned by the superclass as the current document
  # and returns its child nodes.
  def getFragment
    @document = super
    @document.childNodes
  end
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
11
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
Normal file
11
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
require 'test/unit'

# Directory three levels above this file; test data is looked up beneath it.
this_file = File.expand_path(__FILE__)
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(this_file)))

# Put the sibling lib directory, then this tests directory, at the front of
# the load path.
tests_dir = File.dirname(__FILE__)
$:.unshift File.join(File.dirname(tests_dir), 'lib')
$:.unshift tests_dir

# Paths of all files matching "*.*" in tests/<subdirectory> under
# HTML5LIB_BASE.
def html5lib_test_files(subdirectory)
  pattern = File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')
  Dir[pattern]
end
|
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
Executable file
36
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
Executable file
|
@ -0,0 +1,36 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/inputstream'
|
||||
|
||||
# Encoding-detection tests for HTML5lib::HTMLInputStream.
class Html5EncodingTestCase < Test::Unit::TestCase

  # The chardet test is only defined when UniversalDetector can be loaded;
  # otherwise a notice is printed and the test is left out.
  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet
      sample = File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')
      File.open(sample) do |file|
        stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
        assert_equal 'big5', stream.charEncoding.downcase
      end
    end
  rescue LoadError
    puts "chardet not found, skipping chardet tests"
  end

  # One test method is generated per "#data" section of each encoding test
  # file; the section's "#encoding" line names the expected encoding.
  html5lib_test_files('encoding').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
    sections = File.read(test_file).split("#data\n")

    sections.each_with_index do |data, index|
      next if data.empty?
      input, encoding_part = data.split(/\n#encoding\s+/, 2)
      encoding = encoding_part.split[0]
      method_name = 'test_%s_%d' % [ test_name, index + 1 ]

      define_method method_name do
        stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
        assert_equal encoding.downcase, stream.charEncoding.downcase, input
      end
    end
  end

end
|
212
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
Executable file
212
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
Executable file
|
@ -0,0 +1,212 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/liberalxmlparser'
|
||||
|
||||
# Matches a start (or self-closing) tag that carries at least one
# double-quoted attribute. Group 1 is the tag name plus trailing whitespace,
# group 2 the run of attributes, group 3 the optional self-closing slash.
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/

# Rewrites one tag matched by XMLELEM with its attributes sorted, so that
# attribute order does not affect comparisons. It is a lambda rather than a
# replacement string because a string cannot reorder the captured
# attributes; pass it to gsub as a block: str.gsub(XMLELEM, &SORTATTRS).
SORTATTRS = lambda do |tag|
  parts = XMLELEM.match(tag)
  sorted = parts[2].scan(/[-:\w]+="[^"]*"/).sort.join(' ')
  "<#{parts[1]}#{sorted}#{parts[3]}>"
end

# Parses +input+ with +parser+ and compares the serialized root element
# with +expected+ (single quotes in the output are turned into double
# quotes first).
#
# When +expected+ is omitted the input itself is the expectation: numeric
# character references in it are replaced by the characters they denote and
# attribute order is normalized on both sides.
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
  document = parser.parse(input.chomp).root
  if not expected
    expected = input.chomp.gsub(XMLELEM, &SORTATTRS)
    expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
    output = document.to_s.gsub(/'/,'"').gsub(XMLELEM, &SORTATTRS)
    assert_equal(expected, output)
  else
    assert_equal(expected, document.to_s.gsub(/'/,'"'))
  end
end

# Same as assert_xml_equal, but parses with the XHTML parser by default.
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
  assert_xml_equal(input, expected, parser)
end
|
||||
|
||||
# Basic checks of the XHTML parser on small HTML-style inputs.
class BasicXhtml5Test < Test::Unit::TestCase

  # Mis-nested </b></i> end tags are expected to come back properly nested
  # inside an html/head/body skeleton.
  def test_title_body_mismatched_close
    source = '<title>Xhtml</title><b><i>content</b></i>'
    wanted = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>Xhtml</title></head>',
      '<body><b><i>content</i></b></body>',
      '</html>'
    ].join
    assert_xhtml_equal(source, wanted)
  end

  # "&mdash" without a terminating semicolon is expected to become U+2014.
  def test_title_body_named_charref
    source = '<title>mdash</title>A &mdash B'
    em_dash = [0x2014].pack('U')
    wanted = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>mdash</title></head>',
      '<body>A ', em_dash, ' B</body>',
      '</html>'
    ].join
    assert_xhtml_equal(source, wanted)
  end
end
|
||||
|
||||
# Basic checks of the liberal XML parser.
class BasicXmlTest < Test::Unit::TestCase

  # A comment inside an element is expected to round-trip unchanged.
  def test_comment
    source = "<x><!-- foo --></x>"
    assert_xml_equal(source)
  end

  # A CDATA section is expected to come back as plain text.
  def test_cdata
    source = "<x><![CDATA[foo]]></x>"
    wanted = "<x>foo</x>"
    assert_xml_equal(source, wanted)
  end

  def test_simple_text
    source = "<p>foo</p>"
    wanted = "<p>foo</p>"
    assert_xml_equal(source, wanted)
  end

  # A missing end tag is expected to be supplied.
  def test_optional_close
    source = "<p>foo"
    wanted = "<p>foo</p>"
    assert_xml_equal(source, wanted)
  end

  # Mis-nested end tags are expected to come back properly nested.
  def test_html_mismatched
    source = "<b><i>foo</b></i>"
    wanted = "<b><i>foo</i></b>"
    assert_xml_equal(source, wanted)
  end
end
|
||||
|
||||
# Checks of the liberal XML parser on OPML-style documents.
class OpmlTest < Test::Unit::TestCase

  # A mixed-case element name is expected to round-trip unchanged.
  def test_mixedCaseElement
    assert_xml_equal(
      '<opml version="1.0">' +
        '<head><ownerName>Dave Winer</ownerName></head>' +
      '</opml>')
  end

  # A mixed-case attribute name is expected to round-trip unchanged.
  def test_mixedCaseAttribute
    assert_xml_equal(
      '<opml version="1.0">' +
        '<body><outline isComment="true"/></body>' +
      '</opml>')
  end

  # A bare "&" in an attribute value is malformed XML; the serialized
  # output is expected to carry it escaped as "&amp;".
  def test_malformed
    assert_xml_equal(
      '<opml version="1.0">' +
        '<body><outline text="Odds & Ends"/></body>' +
      '</opml>',
      '<opml version="1.0">' +
        '<body><outline text="Odds &amp; Ends"/></body>' +
      '</opml>')
  end
end
|
||||
|
||||
# Round-trip checks of the XHTML parser on complete documents. Every test
# calls assert_xhtml_equal with only an input, so the input document itself
# is the expected output.
class XhtmlTest < Test::Unit::TestCase
|
||||
|
||||
# Document with an embedded MathML formula.
def test_mathml
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>MathML</title></head>
|
||||
<body>
|
||||
<math xmlns="http://www.w3.org/1998/Math/MathML">
|
||||
<mrow>
|
||||
<mi>x</mi>
|
||||
<mo>=</mo>
|
||||
|
||||
<mfrac>
|
||||
<mrow>
|
||||
<mrow>
|
||||
<mo>-</mo>
|
||||
<mi>b</mi>
|
||||
</mrow>
|
||||
<mo>±</mo>
|
||||
<msqrt>
|
||||
|
||||
<mrow>
|
||||
<msup>
|
||||
<mi>b</mi>
|
||||
<mn>2</mn>
|
||||
</msup>
|
||||
<mo>-</mo>
|
||||
<mrow>
|
||||
|
||||
<mn>4</mn>
|
||||
<mo>⁢</mo>
|
||||
<mi>a</mi>
|
||||
<mo>⁢</mo>
|
||||
<mi>c</mi>
|
||||
</mrow>
|
||||
</mrow>
|
||||
|
||||
</msqrt>
|
||||
</mrow>
|
||||
<mrow>
|
||||
<mn>2</mn>
|
||||
<mo>⁢</mo>
|
||||
<mi>a</mi>
|
||||
</mrow>
|
||||
</mfrac>
|
||||
|
||||
</mrow>
|
||||
</math>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
# Document with an embedded SVG image (mixed-case viewBox attribute).
def test_svg
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>SVG</title></head>
|
||||
<body>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
|
||||
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
|
||||
</path>
|
||||
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
|
||||
</circle>
|
||||
|
||||
</svg>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
# SVG document whose gradients use href attributes under a prefix declared
# for the XLink namespace.
def test_xlink
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>XLINK</title></head>
|
||||
<body>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||
<defs xmlns:l="http://www.w3.org/1999/xlink">
|
||||
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
|
||||
<stop stop-color="#FE8"/>
|
||||
<stop stop-color="#D70" offset="1"/>
|
||||
</radialGradient>
|
||||
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
|
||||
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
|
||||
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
|
||||
</defs>
|
||||
<g stroke="#940">
|
||||
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
|
||||
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
|
||||
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
|
||||
|
||||
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
|
||||
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
|
||||
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
|
||||
</g>
|
||||
</svg>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
# Document containing a self-closed <br/>.
def test_br
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>XLINK</title></head>
|
||||
<body>
|
||||
<br/>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
# Document containing an empty <strong></strong> element.
# NOTE(review): the "x" prefix presumably keeps Test::Unit from collecting
# this method, i.e. the test looks deliberately disabled - confirm.
def xtest_strong
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>XLINK</title></head>
|
||||
<body>
|
||||
<strong></strong>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
end
|
108
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
Normal file
108
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
Normal file
|
@ -0,0 +1,108 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/treebuilders'
|
||||
require 'html5lib/html5parser'
|
||||
|
||||
|
||||
# Tree builders exercised by the parser tests. hpricot joins the list only
# when the library can be loaded.
$tree_types_to_test = %w[simpletree rexml]

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
end

# When true, the number of reported parse errors is checked as well.
$CHECK_PARSER_ERRORS = false

puts 'Testing: ' + $tree_types_to_test.join(', ')
|
||||
|
||||
|
||||
# Tree-construction tests: one test method is generated per test case in
# the tree-construction data files and per tree builder in
# $tree_types_to_test.
class Html5ParserTestCase < Test::Unit::TestCase

  # True when +b+ begins with the prefix +a+ (note the argument order).
  def self.startswith?(a, b)
    b[0... a.length] == a
  end

  # Splits one test-case section into its parts.
  #
  # Returns [innerHTML, input, expected_output, errors]: innerHTML is the
  # text following "#document-fragment " (nil for whole-document tests),
  # input and expected_output are newline-joined strings, errors is an array
  # of lines. Empty lines are dropped; the first two characters of expected
  # output lines starting with "|" are removed.
  #
  # Raises ArgumentError when a "#document-fragment" line names no context.
  def self.parseTestcase(data)
    innerHTML = nil
    input = []
    output = []
    errors = []
    currentList = input
    data.split(/\n/).each do |line|
      if !line.empty? and !startswith?("#errors", line) and
        !startswith?("#document", line) and
        !startswith?("#data", line) and
        !startswith?("#document-fragment", line)

        # Identity check, not ==: two still-empty lists compare equal by
        # value, which would strip the prefix from input or error lines.
        if currentList.equal?(output) and startswith?("|", line)
          currentList.push(line[2..-1])
        else
          currentList.push(line)
        end
      elsif line == "#errors"
        currentList = errors
      elsif line == "#document" or startswith?("#document-fragment", line)
        if startswith?("#document-fragment", line)
          innerHTML = line[19..-1]
          raise ArgumentError, "#document-fragment without a context element" unless innerHTML
        end
        currentList = output
      end
    end
    return innerHTML, input.join("\n"), output.join("\n"), errors
  end

  # convert the output of str(document) to the format used in the testcases:
  # the first line is dropped and the leading three characters are removed
  # from every line starting with "|".
  def convertTreeDump(treedump)
    lines = treedump.split(/\n/)[1..-1] || []
    lines.map { |line| (line.length > 2 and line[0, 1] == '|') ? line[3..-1] : line }.join("\n")
  end

  # Sorts each run of consecutive, equally indented attribute lines so that
  # attribute order does not affect the comparison.
  def sortattrs(output)
    output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
  end

  html5lib_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')

    File.read(test_file).split("#data\n").each_with_index do |data, index|
      next if data.empty?

      innerHTML, input, expected_output, expected_errors = parseTestcase(data)

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

          parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))

          if innerHTML
            parser.parseFragment(input, innerHTML)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          assert_equal sortattrs(expected_output), sortattrs(actual_output), [
            'Input:', input,
            'Expected:', expected_output,
            'Received:', actual_output
          ].join("\n")

          if $CHECK_PARSER_ERRORS
            actual_errors = parser.errors.map do |(line, col), message|
              'Line: %i Col: %i %s' % [line, col, message]
            end
            # Expected value first, so failure messages label the two
            # counts correctly.
            assert_equal expected_errors.length, parser.errors.length, [
              'Expected errors:', expected_errors.join("\n"),
              'Actual errors:', actual_errors.join("\n")
            ].join("\n")
          end

        end
      end
    end
  end

end
|
206
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
Normal file
206
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
Normal file
|
@ -0,0 +1,206 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/liberalxmlparser'
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
include HTML5lib
|
||||
|
||||
# Parses +stream+ as an XHTML fragment with HTMLSanitizer as the tokenizer
# and returns the joined result with single quotes turned into double
# quotes.
def sanitize_xhtml(stream)
  fragment = XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  fragment.join('').gsub(/'/,'"')
end
|
||||
|
||||
# Parses +stream+ as an HTML fragment with HTMLSanitizer as the tokenizer
# and returns the joined result with single quotes turned into double
# quotes.
def sanitize_html(stream)
  fragment = HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  fragment.join('').gsub(/'/,'"')
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
|
||||
define_method "test_should_allow_#{tag_name}_tag" do
|
||||
if tag_name == 'image'
|
||||
assert_equal "<img title=\"1\"/>foo <bad>bar</bad> baz",
|
||||
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
elsif VOID_ELEMENTS.include?(tag_name)
|
||||
assert_equal "<#{tag_name} title=\"1\"/>foo <bad>bar</bad> baz",
|
||||
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
else
|
||||
assert_equal "<#{tag_name.downcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.downcase}>",
|
||||
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
assert_equal "<#{tag_name} title=\"1\">foo <bad>bar</bad> baz</#{tag_name}>",
|
||||
sanitize_xhtml("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
|
||||
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
|
||||
assert_equal "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>",
|
||||
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
|
||||
end
|
||||
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
next if attribute_name == 'style'
|
||||
define_method "test_should_allow_#{attribute_name}_attribute" do
|
||||
assert_equal "<p #{attribute_name.downcase}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||
assert_equal "<p #{attribute_name}=\"foo\">foo <bad>bar</bad> baz</p>",
|
||||
sanitize_xhtml("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
|
||||
end
|
||||
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
|
||||
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
|
||||
assert_equal "<p>foo <bad>bar</bad> baz</p>",
|
||||
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
|
||||
end
|
||||
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol}">foo</a>))
|
||||
end
|
||||
end
|
||||
|
||||
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
|
||||
define_method "test_should_allow_uppercase_#{protocol}_uris" do
|
||||
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
|
||||
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_allow_anchors
|
||||
assert_equal "<a href=\"foo\"><script>baz</script></a>",
|
||||
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
|
||||
end
|
||||
|
||||
# RFC 3986, sec 4.2
|
||||
def test_allow_colons_in_path_component
|
||||
assert_equal "<a href=\"./this:that\">foo</a>",
|
||||
sanitize_html("<a href=\"./this:that\">foo</a>")
|
||||
end
|
||||
|
||||
%w(src width height alt).each do |img_attr|
|
||||
define_method "test_should_allow_image_#{img_attr}_attribute" do
|
||||
assert_equal "<img #{img_attr}=\"foo\"/>",
|
||||
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_handle_non_html
|
||||
assert_equal 'abc', sanitize_html("abc")
|
||||
end
|
||||
|
||||
def test_should_handle_blank_text
|
||||
assert_equal '', sanitize_html('')
|
||||
end
|
||||
|
||||
[%w(img src), %w(a href)].each do |(tag, attr)|
|
||||
close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo</#{tag}>"
|
||||
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
|
||||
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
|
||||
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
|
||||
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
|
||||
end
|
||||
end
|
||||
|
||||
[%(<img src="javascript:alert('XSS');" />),
|
||||
%(<img src=javascript:alert('XSS') />),
|
||||
%(<img src="JaVaScRiPt:alert('XSS')" />),
|
||||
%(<img src='javascript:alert("XSS")' />),
|
||||
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src='javascript:alert('XSS')' />),
|
||||
%(<img src="jav\tascript:alert('XSS');" />),
|
||||
%(<img src="jav	ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="jav
ascript:alert('XSS');" />),
|
||||
%(<img src="  javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />),
|
||||
%(<img src=" javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
|
||||
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
|
||||
assert_equal "<img/>", sanitize_html(img_hack)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_sanitize_tag_broken_up_by_null
|
||||
assert_equal "<scr\357\277\275ipt>alert(\"XSS\")</scr\357\277\275ipt>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_invalid_script_tag
|
||||
assert_equal "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_script_tag_with_multiple_open_brackets
|
||||
assert_equal "<<script>alert(\"XSS\");//<</script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
|
||||
assert_equal %(<iframe src=\"http://ha.ckers.org/scriptlet.html\"><), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
|
||||
end
|
||||
|
||||
def test_should_sanitize_unclosed_script
|
||||
assert_equal "<script src=\"http://ha.ckers.org/xss.js?\"><b/>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
|
||||
end
|
||||
|
||||
def test_should_sanitize_half_open_scripts
|
||||
assert_equal "<img/>", sanitize_html(%(<img src="javascript:alert('XSS')"))
|
||||
end
|
||||
|
||||
def test_should_not_fall_for_ridiculous_hack
|
||||
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
|
||||
assert_equal "<img/>", sanitize_html(img_hack)
|
||||
end
|
||||
|
||||
def test_platypus
|
||||
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
|
||||
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
|
||||
end
|
||||
|
||||
def test_xul
|
||||
assert_equal %(<p style="">fubar</p>),
|
||||
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
|
||||
end
|
||||
|
||||
def test_input_image
|
||||
assert_equal %(<input type="image"/>),
|
||||
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
|
||||
end
|
||||
|
||||
def test_non_alpha_non_digit
|
||||
assert_equal "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
|
||||
assert_equal "<a>foo</a>",
|
||||
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
|
||||
assert_equal "<img src=\"http://ha.ckers.org/xss.js\"/>",
|
||||
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
|
||||
end
|
||||
|
||||
def test_img_dynsrc_lowsrc
|
||||
assert_equal "<img/>",
|
||||
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
|
||||
assert_equal "<img/>",
|
||||
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
|
||||
end
|
||||
|
||||
def test_div_background_image_unicode_encoded
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
|
||||
end
|
||||
|
||||
def test_div_expression
|
||||
assert_equal '<div style="">foo</div>',
|
||||
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
|
||||
end
|
||||
|
||||
def test_img_vbscript
|
||||
assert_equal '<img/>',
|
||||
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
|
||||
end
|
||||
|
||||
end
|
78
vendor/plugins/HTML5lib/tests/test_tokenizer.rb
vendored
Normal file
78
vendor/plugins/HTML5lib/tests/test_tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,78 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/tokenizer'
|
||||
|
||||
require 'tokenizer_test_parser'
|
||||
|
||||
begin
  require 'jsonx'
rescue LoadError
  # Minimal stand-in used when jsonx cannot be loaded. It is only defined
  # when no JSON constant exists yet, so an already loaded JSON
  # implementation is left alone.
  unless defined?(JSON)
    class JSON
      # Converts +json+ to Ruby data by rewriting it into Ruby literal
      # syntax and evaluating it. The argument itself is not modified.
      #
      # SECURITY: this evaluates the text as Ruby code. It is only suitable
      # for trusted test fixtures, never for untrusted input.
      def self.parse(json)
        # "key": becomes "key"=> so objects read as Ruby hashes.
        ruby_source = json.gsub(/"\s*:/, '"=>')
        # \uXXXX escapes become the UTF-8 characters they denote.
        ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) { |esc| [esc[2..-1].to_i(16)].pack('U') }
        eval ruby_source
      end
    end
  end
end
|
||||
|
||||
# Tokenizer tests: one test method is generated per entry of each tokenizer
# test file.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # True when +token+ is an array token whose first element is +token_name+.
  # The bare string 'ParseError' never matches.
  def type_of?(token_name, token)
    token != 'ParseError' and token_name == token.first
  end

  # Returns a copy of +tokens+ in which the attribute list of every StartTag
  # token has been turned into a hash. The list is reversed first, so for a
  # repeated attribute name the earliest occurrence wins. The given tokens
  # are not modified.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      next token unless type_of?('StartTag', token)
      converted = token.dup
      converted[2] = Hash[*token[2].reverse.flatten]
      converted
    end
  end

  # Returns a copy of +tokens+ in which runs of consecutive Character tokens
  # are merged into one. Merged tokens are fresh arrays: the expected output
  # of a test is passed through here once per content model flag, so the
  # fixture data must not be changed in place.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
        combined = merged.last.dup
        combined[1] = combined[1] + token[1]
        merged[-1] = combined
      else
        merged << token
      end
      merged
    end
  end

  # Runs one test entry once per content model flag (PCDATA when the entry
  # lists none) and compares the normalized tokenizer output with the
  # entry's expected output.
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        'Description:', data['description'],
        'Input:', data['input'],
        'Content Model Flag:', content_model_flag ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])

        tokenizer.contentModelFlag = content_model_flag.to_sym

        tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_equal expected, actual, message
      end
    end
  end

  html5lib_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
|
||||
|
62
vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
vendored
Normal file
62
vendor/plugins/HTML5lib/tests/tokenizer_test_parser.rb
vendored
Normal file
|
@ -0,0 +1,62 @@
|
|||
require 'html5lib/constants'
|
||||
|
||||
# Turns the token stream of a tokenizer into the flat array format used by
# the tokenizer test data: arrays such as ["StartTag", name, data] and the
# bare string "ParseError".
class TokenizerTestParser
  # +tokenizer+ is any object whose #each yields token hashes with a :type
  # key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Dispatches every token to the process<Type> method named after its
  # :type and returns the collected output tokens.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty (self-closed) tag is reported as a StartTag, preceded by a
  # ParseError when the element is not a void element.
  def processEmptyTag(token)
    if not HTML5lib::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag carrying data is reported as a ParseError followed by the
  # EndTag. Missing data is treated like empty data.
  def processEndTag(token)
    data = token[:data]
    processParseError(token) if data && data.length > 0
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only runs are reported exactly like other character data.
  alias processSpaceCharacters processCharacters

  # End of input produces no output token.
  def processEOF(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
3
vendor/plugins/maruku/lib/maruku/defaults.rb
vendored
3
vendor/plugins/maruku/lib/maruku/defaults.rb
vendored
|
@ -31,6 +31,9 @@ Globals = {
|
|||
:maruku_signature => false,
|
||||
:code_background_color => '#fef',
|
||||
:code_show_spaces => false,
|
||||
|
||||
:filter_html => false,
|
||||
|
||||
:html_math_output_mathml => true, # also set :html_math_engine
|
||||
:html_math_engine => 'itex2mml', #ritex, itex2mml, none
|
||||
|
||||
|
|
|
@ -477,7 +477,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
end
|
||||
|
||||
id = match[1]; url = match[2]; title = match[3];
|
||||
id = id.strip.downcase.gsub(' ','_')
|
||||
id = sanitize_ref_id(id)
|
||||
|
||||
hash = self.refs[id] = {:url=>url,:title=>title}
|
||||
|
||||
|
|
|
@ -287,7 +287,7 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
|
|||
end
|
||||
|
||||
def extension_meta(src, con, break_on_chars)
|
||||
if m = src.read_regexp(/([^\s\:]+):/)
|
||||
if m = src.read_regexp(/([^\s\:\"\']+):/)
|
||||
name = m[1]
|
||||
al = read_attribute_list(src, con, break_on_chars)
|
||||
# puts "#{name}=#{al.inspect}"
|
||||
|
@ -581,9 +581,9 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
|
|||
ref_id = read_ref_id(src,con)
|
||||
if ref_id
|
||||
if ref_id.size == 0
|
||||
ref_id = children.to_s.downcase.gsub(' ','_')
|
||||
ref_id = sanitize_ref_id(children.to_s)
|
||||
else
|
||||
ref_id = ref_id.downcase
|
||||
ref_id = sanitize_ref_id(ref_id)
|
||||
end
|
||||
con.push_element md_link(children, ref_id)
|
||||
else
|
||||
|
|
|
@ -108,6 +108,7 @@ module MaRuKu
|
|||
# Input is a LineSource
|
||||
def t2_parse_blocks(src, output)
|
||||
while src.cur_line
|
||||
l = src.shift_line
|
||||
|
||||
# ignore empty line
|
||||
if l.t2_empty? then
|
||||
|
@ -115,7 +116,6 @@ module MaRuKu
|
|||
next
|
||||
end
|
||||
|
||||
l = src.shift_line
|
||||
# TODO: lists
|
||||
# TODO: xml
|
||||
# TODO: `==`
|
||||
|
|
|
@ -741,7 +741,17 @@ of the form `#ff00ff`.
|
|||
return a
|
||||
end
|
||||
|
||||
=begin maruku_doc
|
||||
Attribute: filter_html
|
||||
Scope: document
|
||||
|
||||
If true, raw HTML is discarded from the output.
|
||||
|
||||
=end
|
||||
|
||||
def to_html_raw_html
|
||||
return [] if get_setting(:filter_html)
|
||||
|
||||
raw_html = self.raw_html
|
||||
if rexml_doc = @parsed_html
|
||||
root = rexml_doc.root
|
||||
|
|
|
@ -152,7 +152,7 @@ end end
|
|||
module MaRuKu; module Out; module Latex
|
||||
|
||||
def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end
|
||||
def to_latex_linebreak; "\\linebreak " end
|
||||
def to_latex_linebreak; "\\newline " end
|
||||
|
||||
def to_latex_paragraph
|
||||
children_to_latex+"\n\n"
|
||||
|
|
|
@ -146,6 +146,10 @@ module MaRuKu; module Strings
|
|||
s[0, i+1].strip
|
||||
end
|
||||
|
||||
# Normalises a reference id: lower-cases it, turns each space into an
# underscore, then removes every remaining non-word character.
def sanitize_ref_id(x)
  lowered = x.downcase
  underscored = lowered.tr(' ', '_')
  underscored.gsub(/\W/, '')
end
|
||||
|
||||
|
||||
# removes initial quote
|
||||
def unquote(s)
|
||||
|
|
|
@ -155,7 +155,7 @@ module MaRuKu; module Tests
|
|||
["[a]", [ md_link(["a"],'a')], 'Empty link'],
|
||||
["[a][]", ],
|
||||
["[a][]b", [ md_link(["a"],'a'),'b'], 'Empty link'],
|
||||
["[a\\]][]", [ md_link(["a]"],'a]')], 'Escape inside link'],
|
||||
["[a\\]][]", [ md_link(["a]"],'a')], 'Escape inside link (throw ?] away)'],
|
||||
|
||||
["[a", :throw, 'Link not closed'],
|
||||
["[a][", :throw, 'Ref not closed'],
|
||||
|
|
2
vendor/plugins/maruku/lib/maruku/version.rb
vendored
2
vendor/plugins/maruku/lib/maruku/version.rb
vendored
|
@ -19,7 +19,7 @@
|
|||
#++
|
||||
|
||||
module MaRuKu
|
||||
Version = '0.5.5'
|
||||
Version = '0.5.6'
|
||||
|
||||
MarukuURL = 'http://maruku.rubyforge.org/'
|
||||
|
||||
|
|
Loading…
Reference in a new issue